# 1.2 Data Transformation and Preprocessing
### Settings: 4 CPUs, 69 GB memory, chunk: 2, GPU model: P100, number of GPUs: 2

In [1]:
import numpy as np
import pandas as pd
import torch
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook cell displays the selected device.
device


device(type='cuda')

# Dataset Utils

In [1]:
%%writefile dataset.py



from data_loader import *
import numpy as np
import pandas as pd
import torch
import sklearn
import os
import random



def _write_text(path, text):
    """Write ``text`` to ``path``, replacing any existing file."""
    with open(path, "w") as fp:
        fp.write(text)


def create_train_test_file_list(file_name= "all_files_list.txt",person_name = 'wenkanw',
                     out_path = "../data-file-indices/",root_path= "../",
                     test_ratio = 0.2, print_flag = True, shuffle=False, random_state=None):
    """
    Write the train / test / complete file-index lists for one person.

    All output files are written into ``out_path + person_name + "/"``; that
    directory must already exist.

    Two modes:

    * ``person_name == "CAD"``: the list is read from ``batch-unix.txt`` in
      the output directory, copied verbatim to ``all_files_list.txt``, and
      the LAST ``int(n * test_ratio)`` entries become the test set.
      ``file_name``, ``root_path``, ``shuffle`` and ``random_state`` are not
      used in this mode.
    * otherwise: ``root_path + "data/IndividualData"`` is walked and every
      ``*.shm`` file that has a sibling ``*-events.txt`` file is listed
      (path relative to ``root_path + "data/"``). The FIRST
      ``int(n * test_ratio)`` entries become the test set.

    Parameters:
        file_name: name of the complete list file (non-CAD mode only).
        person_name: sub-directory / substring identifying the person.
        out_path: directory prefix holding the per-person index folders.
        root_path: prefix of the ``data/`` directory.
        test_ratio: fraction of files put into the test list; when it is
            not positive (non-CAD mode) only the complete list is written.
        print_flag: echo the generated lists to stdout.
        shuffle: shuffle the (sorted) file list before splitting.
        random_state: seed for the shuffle.

    Returns:
        None; the results are the files written to disk.
    """
    index_dir = out_path + person_name + "/"

    if person_name == "CAD":
        with open(index_dir + "batch-unix.txt", "r") as fp:
            txt = fp.read()
        # keep an untouched copy of the complete list
        _write_text(index_dir + "all_files_list.txt", txt)

        # Drop blank lines by filtering: list.remove("") raises when the
        # file has no trailing newline and only removes one blank otherwise.
        entries = [line + "\n" for line in txt.split("\n") if line != ""]
        test_size = int(len(entries) * test_ratio)
        split_at = len(entries) - test_size
        train = "".join(entries[:split_at])
        test = "".join(entries[split_at:])

        _write_text(index_dir + "test_files.txt", test)
        _write_text(index_dir + "train_files.txt", train)

        if print_flag:
            print("Train:", split_at)
            print(train)
            print("test: ", test_size)
            print(test)
        return

    data_prefix = root_path + "data/"
    new_files = []
    for dirname, _, filenames in os.walk(root_path + 'data/IndividualData'):
        present = set(filenames)
        for filename in filenames:
            path = os.path.join(dirname, filename)
            rel_path = path.replace(data_prefix, "")
            # Filter on the path relative to the data folder so that a
            # parent directory that happens to contain "test" (or the
            # person's name) cannot change which files are selected.
            if ".shm" not in filename or person_name not in rel_path or 'test' in rel_path:
                continue
            # a recording is only usable when its label (events) file exists
            if filename.replace(".shm", "-events.txt") in present:
                new_files.append(rel_path + "\n")

    # os.walk yields files in arbitrary order; sort first so that a given
    # random_state always produces the same shuffle.
    new_files.sort()
    if shuffle:
        # private generator: does not reseed the global ``random`` module
        random.Random(random_state).shuffle(new_files)

    if test_ratio > 0.:
        test_size = int(len(new_files) * test_ratio)
        test = "".join(new_files[:test_size])
        train = "".join(new_files[test_size:])
        _write_text(index_dir + "train_files.txt", train)
        _write_text(index_dir + "test_files.txt", test)

        if print_flag:
            print("Train:")
            print(train)
            print("test: ")
            print(test)

    all_files = "".join(new_files)
    _write_text(index_dir + file_name, all_files)

    if print_flag:
        print("All files: ")
        print(all_files)
        





class Person_MealsDataset(torch.utils.data.Dataset):
    """
    Windowed dataset built from the recordings listed in a person's file index.

    Every item is a ``(data, label)`` pair. ``data`` is the slice
    ``self.data[f][start:end]`` described by one row of ``self.data_indices``
    (columns used here: 0 = file, 1 = start, 2 = end, 3 = time offset) and
    ``label`` is the matching entry of ``self.labels``. All three arrays are
    produced by ``load_train_test_data`` (defined outside this file).

    Parameters:
        dataset: pre-built dataset; when given (not None) nothing is loaded.
        person_name: sub-directory of ``data_indices_file`` to read from.
        data_indices_file: directory prefix holding the per-person indices.
        file_name: ``"train"`` / ``"test"`` select ``train_files.txt`` /
            ``test_files.txt``; any other value selects ``<file_name>.txt``.
        remove_trend, remove_walk, remove_rest, smooth_flag, normalize_flag,
        winmin, stridesec, gtperc, ratio_dataset, debug_flag: forwarded
            unchanged to ``load_train_test_data``.
        device: device on which item tensors are created.
        load_splitted_dataset: materialise every item into ``self.dataset``
            (a pandas DataFrame) right after loading.
        enable_time_feat: append a time-of-sample column to every item.
        get_numpy_data: make ``__getitem__`` return NumPy arrays.
    """

    def __init__(self, dataset = None,person_name= "wenkanw", 
                 data_indices_file = "../data-file-indices/",
                 file_name = "all_files_list",
                 remove_trend = 0,
                 remove_walk = 0,
                 remove_rest = 0,
                 smooth_flag = 1,
                 normalize_flag = 1,
                 winmin = 6,
                 stridesec = 15,
                 gtperc = 0.5,
                 device = 'cpu',
                 ratio_dataset=1,
                load_splitted_dataset = False,
                 enable_time_feat = False,
                 debug_flag= False,
                 get_numpy_data= False
                ):

        index_dir = data_indices_file + person_name + "/"
        if file_name == "train":
            file_name = index_dir + "train_files.txt"
        elif file_name == "test":
            file_name = index_dir + "test_files.txt"
        else:
            file_name = index_dir + file_name + ".txt"

        # Note: file_name is the path of the text file listing the shm files
        self.file_name = file_name
        self.get_numpy_data = get_numpy_data
        self.dataset = dataset
        self.person_name = person_name
        self.winmin = winmin
        self.stridesec = stridesec
        self.load_splitted_dataset = load_splitted_dataset
        self.remove_trend = remove_trend
        self.remove_walk = remove_walk
        self.remove_rest = remove_rest
        self.smooth_flag = smooth_flag
        self.normalize_flag = normalize_flag
        # (a stray trailing comma used to turn this into the tuple (gtperc,))
        self.gtperc = gtperc
        self.ratio_dataset = ratio_dataset
        self.enable_time_feat = enable_time_feat
        self.device = device
        self.debug_flag = debug_flag
        # "is None" rather than truthiness: a DataFrame has no truth value
        if self.dataset is None:
            self.get_data(person_name)

    @staticmethod
    def _read_index_file(path):
        """Return the non-empty lines of the file list stored at ``path``."""
        with open(path, "r") as fp:
            return [line for line in fp.read().split("\n") if line != '']

    def get_data(self, person_name):
        """
        Load data, window indices and labels through ``load_train_test_data``.

        ``person_name`` is accepted for compatibility but not used: the files
        are taken from ``self.file_name``. When ``load_splitted_dataset`` is
        set, ``self.dataset`` is then replaced by the fully materialised
        DataFrame from ``get_dataset``.
        """
        # The loader is always called with load_splitted_dataset=False; the
        # materialisation, if requested, is done below by get_dataset().
        (self.dataset, self.data,
         self.data_indices, self.labels) = load_train_test_data(
            data_file_list=self.file_name,
            load_splitted_dataset=False,
            ratio_dataset=self.ratio_dataset,
            enabled_time_feat=self.enable_time_feat,
            winmin=self.winmin, stridesec=self.stridesec, gtperc=self.gtperc,
            removerest=self.remove_rest,
            removewalk=self.remove_walk, smooth_flag=self.smooth_flag,
            normalize_flag=self.normalize_flag,
            remove_trend=self.remove_trend,
            debug_flag=self.debug_flag)

        if self.load_splitted_dataset:
            self.dataset = self.get_dataset()

    def __getitem__(self, index):
        """Return ``(data, label)`` for window ``index`` (tensor or ndarray)."""
        item = self.get_item(index)
        if self.get_numpy_data:
            # .cpu() first: .numpy() is not available on CUDA tensors
            return item['data'].cpu().numpy(), item['label']
        return item['data'], item['label']

    def __len__(self):
        """Number of rows of the materialised dataset, else number of windows."""
        return len(self.dataset) if self.load_splitted_dataset else len(self.data_indices)

    def get_item(self, index, tensor_type=True):
        """
        Build one sample.

        Parameters:
            index: row of ``self.data_indices`` to read.
            tensor_type: return the data as a float tensor on ``self.device``
                (True) or as the raw array slice (False).

        Returns:
            dict with keys ``"data"`` and ``"label"``.
        """
        f, start_time, end_time = (self.data_indices[index, 0],
                                   self.data_indices[index, 1],
                                   self.data_indices[index, 2])
        sample = self.data[f][start_time:end_time]

        if self.enable_time_feat:
            # Extra column: offset + (start + row position) / 15.
            # presumably 1/15 s is the sample period of a 15 Hz recording
            # and the offset is in seconds - TODO confirm against the loader
            time_offset = self.data_indices[index, 3]
            sample_period = 1.0 / 15.0
            time_feat = np.arange(len(sample), dtype=float).reshape(-1, 1)
            time_feat *= sample_period
            time_feat += float(start_time) * sample_period
            time_feat += time_offset
            sample = np.concatenate((sample, time_feat), axis=1)

        label = self.labels[index]
        if tensor_type:
            return {"data": torch.tensor(sample, dtype=torch.float, device=self.device),
                    'label': label}
        return {"data": sample, 'label': label}

    def get_dataset(self, start_index = None, end_index = None):
        """
        Materialise windows ``[start_index, end_index)`` into a DataFrame.

        A missing (or zero) bound means "from the first" / "to the last"
        window. The frame has the columns ``data`` and ``label``; it is
        stored in ``self.dataset`` and returned.
        """
        # tqdm is not imported at the top of this module; import it here so
        # the method does not rely on a star import providing it.
        from tqdm import tqdm

        start_i = start_index if start_index else 0
        # Count windows from data_indices: when this runs from get_data(),
        # self.dataset still holds the loader's first return value, so
        # len(self) would not be the number of windows.
        end_i = end_index if end_index else len(self.data_indices)

        # One DataFrame from all records instead of DataFrame.append in a
        # loop (quadratic, and removed in pandas 2.0).
        records = [self.get_item(i) for i in tqdm(range(start_i, end_i))]
        self.dataset = pd.DataFrame(records, columns=['data', 'label'])
        return self.dataset

    def sample(self, num = 1000,random_state = None):
        """
        Draw ``num`` windows at random (with replacement) for analysis.

        Seeds NumPy's global generator when ``random_state`` is given.
        Returns a DataFrame with the columns ``data`` (ndarray) and ``label``.
        """
        from tqdm import tqdm

        if random_state is not None:
            np.random.seed(random_state)

        indices = np.random.choice(len(self.labels), num)
        records = []
        for i in tqdm(indices):
            item = self.get_item(i)
            item["data"] = item["data"].cpu().numpy()
            records.append(item)
        return pd.DataFrame(records, columns=['data', 'label'])

    def get_subset(self, indices_ls):
        """
        Return ``(data, labels)`` as NumPy arrays for the given window indices.
        """
        samples = []
        labels = []
        for i in indices_ls:
            item = self.get_item(i, tensor_type=False)
            samples.append(item['data'])
            labels.append(item['label'])
        return np.array(samples, copy=True), np.array(labels, copy=True)

    def get_GT_segment(self,root_path = "../data/",print_file=False):
        """
        Collect the ground-truth event boundaries of every listed file.

        Returns:
            (start_ls, end_ls): one entry per file, each holding the first
            ``TotalEvents`` values of ``EventStart`` / ``EventEnd`` as
            returned by ``loadEvents``.
        """
        start_ls = []
        end_ls = []
        for file_name in self._read_index_file(self.file_name):
            TotalEvents, EventStart, EventEnd, EventNames, TimeOffset, EndTime = loadEvents(
                root_path + file_name, debug_flag=False, print_file=print_file)
            start_ls.append(EventStart[:TotalEvents])
            end_ls.append(EventEnd[:TotalEvents])

        return start_ls, end_ls

    def get_mealdataset_info(self,person_name = None,file_ls = [], file_ls_doc=None,root_path = "../data/",print_file=False):
        """
        Summarise the meal events of a set of recordings.

        If ``file_ls`` is empty the list is read from ``file_ls_doc`` (inside
        ``../data-file-indices/<person_name>/``) when given, otherwise from
        ``self.file_name``. Each file is opened as ``root_path + name``.
        The ``file_ls`` default is never modified in place.

        Returns:
            (meal_counts, min_counts, hour_counts, day_counts, total_hours):
            number of events, whole minutes and whole hours covered by the
            events, number of files, and whole hours between ``TimeOffset``
            and ``EndTime`` summed over all files.
        """
        if person_name is None:
            person_name = self.person_name
        if len(file_ls) == 0:
            if file_ls_doc is not None:
                index_path = "../data-file-indices/" + person_name + "/" + file_ls_doc
            else:
                index_path = self.file_name
            file_ls = self._read_index_file(index_path)

        meal_counts = 0
        sec_counts = 0
        total_sec = 0
        day_counts = len(file_ls)
        for file_name in file_ls:
            TotalEvents, EventStart, EventEnd, EventNames, TimeOffset, EndTime = loadEvents(
                root_path + file_name, debug_flag=False, print_file=print_file)
            meal_counts += TotalEvents
            total_sec += abs(EndTime - TimeOffset)
            # NOTE(review): this walks every slot of EventStart, while
            # get_GT_segment only trusts the first TotalEvents entries -
            # confirm the unused slots contribute zero.
            for i in range(len(EventStart)):
                # // 15 presumably converts 15 Hz sample counts to seconds
                sec_counts += (EventEnd[i] - EventStart[i]) // (15)

        total_hours = total_sec // (60 * 60)
        min_counts = sec_counts // 60
        hour_counts = min_counts // 60

        return meal_counts, min_counts, hour_counts, day_counts, total_hours


        
        
            
                
def balance_data_indices(labels, data_indices=None,sample_num = 4000,mode= "under", replace = False,shuffle=True, random_state = 1000):
    """
    Pick a class-balanced set of sample indices.

    A label ``>= 0.5`` counts as positive (eating), ``< 0.5`` as negative.

    Parameters:
        labels: iterable of numeric labels.
        data_indices: optional sequence (list or array) aligned with
            ``labels``; when given and non-empty its entries are returned
            instead of the positions in ``labels``.
        sample_num: samples per class, used in "mix" mode only.
        mode:
            "under" - keep all positives, draw as many negatives.
            "over"  - keep all negatives, draw as many positives with
                      replacement.
            anything else ("mix") - draw ``sample_num`` from each class.
        replace: sample with replacement ("under" and "mix"). In "mix" mode
            replacement is switched on automatically for a class that has
            fewer than ``sample_num`` members, since oversampling is
            impossible without it.
        shuffle: shuffle the combined result.
        random_state: seed for NumPy's global generator; None leaves the
            generator untouched.

    Returns:
        list of balanced indices (positives first unless shuffled).
    """
    # Explicit None/length test: ``if data_indices:`` raises for arrays.
    use_given = data_indices is not None and len(data_indices) > 0

    eat_pool = []
    not_eat_pool = []
    for i, e in enumerate(labels):
        idx = data_indices[i] if use_given else i
        if e >= 0.5:
            eat_pool.append(idx)
        elif e < 0.5:
            not_eat_pool.append(idx)

    if random_state is not None:
        np.random.seed(random_state)

    eat_index = eat_pool
    not_eat_index = not_eat_pool
    if mode == "over":
        eat_index = np.random.choice(eat_pool, len(not_eat_pool)).tolist()
    elif mode == "under":
        not_eat_index = np.random.choice(not_eat_pool, len(eat_pool), replace=replace).tolist()
    else:
        # default: mix
        eat_index = np.random.choice(
            eat_pool, sample_num,
            replace=replace or sample_num > len(eat_pool)).tolist()
        not_eat_index = np.random.choice(
            not_eat_pool, sample_num,
            replace=replace or sample_num > len(not_eat_pool)).tolist()

    indices_balanced = eat_index + not_eat_index
    if shuffle:
        np.random.shuffle(indices_balanced)

    return indices_balanced

def create_datasets(names=[], winmin = 6,stridesec = 5,smooth_flag = 1,normalize_flag = 1):
    """
    Build one Person_MealsDataset per person and return them keyed by name.

    Every dataset is created from the person's "all_files_list" index with
    the given window length, stride, smoothing and normalisation settings.
    """
    shared_options = dict(
        file_name="all_files_list",
        winmin=winmin,
        stridesec=stridesec,
        smooth_flag=smooth_flag,
        normalize_flag=normalize_flag,
    )
    return {
        person: Person_MealsDataset(person_name=person, **shared_options)
        for person in names
    }


from collections import defaultdict
from data_loader import loadEvents
def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def get_meal_info(person_name = None,file_ls = [], file_ls_doc=None,root_path = "../data/",print_file=False,round_decimal=1,
                  indices_root = "../data-file-indices/"):
    """
    Summarise the meal events of one person's recordings.

    Parameters:
        person_name: person whose index is read; None returns the all-zero
            tuple ``(0, 0, 0, 0, 0)`` without touching any file.
        file_ls: explicit list of recording paths (relative to
            ``root_path``). When empty, the list is read from the person's
            index file. The default list is never modified in place.
        file_ls_doc: name of the index file inside
            ``indices_root + person_name + "/"``; defaults to
            ``all_files_list.txt``.
        root_path: prefix prepended to every listed file.
        print_file: forwarded to ``loadEvents``.
        round_decimal: number of decimals kept in the reported figures.
        indices_root: directory prefix holding the per-person index folders.

    Returns:
        A one-row DataFrame with the columns dataset, Days, Total Hours,
        Meal Counts, Average Meal Counts Per Day, Average Hours Per Meal,
        Eating Hours, No Eating Hours and Balance Ratio(no_eat/eat).
        Ratios whose denominator is zero are reported as NaN.
    """
    if person_name is None:
        # meal_counts, min_counts, hour_counts, day_counts, total_hours
        return 0, 0, 0, 0, 0

    if len(file_ls) == 0:
        index_name = "all_files_list.txt" if file_ls_doc is None else file_ls_doc
        with open(indices_root + person_name + "/" + index_name, "r") as fp:
            file_ls = [line for line in fp.read().split("\n") if line != '']

    day_counts = len(file_ls)
    meal_counts = 0
    sec_counts = 0
    total_sec = 0
    for file_name in file_ls:
        TotalEvents, EventStart, EventEnd, EventNames, TimeOffset, EndTime = loadEvents(
            root_path + file_name, debug_flag=False, print_file=print_file)
        meal_counts += TotalEvents
        total_sec += abs(EndTime - TimeOffset)
        for i in range(len(EventStart)):
            # // 15 presumably converts 15 Hz sample counts to seconds
            sec_counts += (EventEnd[i] - EventStart[i]) // 15

    total_hours = round(total_sec / (60 * 60), round_decimal)
    min_counts = sec_counts / 60
    hour_counts = min_counts / 60
    # averages use the unrounded eating hours, as before
    average_meal_per_day = round(_ratio_or_nan(meal_counts, day_counts), round_decimal)
    average_hour_per_meal = round(_ratio_or_nan(hour_counts, meal_counts), round_decimal)
    hour_counts = round(hour_counts, round_decimal)

    # rounded again so the difference carries no floating-point noise
    no_eating_hours = round(total_hours - hour_counts, round_decimal)
    weight_ratio = round(_ratio_or_nan(no_eating_hours, hour_counts), round_decimal)

    result = pd.DataFrame({"dataset": person_name, "Days": day_counts,
                           "Total Hours": total_hours, "Meal Counts": meal_counts,
                           "Average Meal Counts Per Day": average_meal_per_day,
                           "Average Hours Per Meal": average_hour_per_meal,
                           "Eating Hours": hour_counts, "No Eating Hours": no_eating_hours,
                           "Balance Ratio(no_eat/eat)": weight_ratio}, index=[0])

    return result

          
def get_dataset_info(names= ["wenkanw"],winmin=6,stridesec=5):
    """
    Collect the ``get_meal_info`` summary of every person into one table.

    Two rows are appended after the per-person rows: "total" (column sums)
    and "average" (column means) of Days, Total Hours, Meal Counts, Eating
    Hours and No Eating Hours. For both rows the per-day, per-meal and
    balance ratios are recomputed from the row's own figures.

    Parameters:
        names: person names to summarise.
        winmin, stridesec: accepted for compatibility; not used.

    Returns:
        DataFrame with one row per person plus the two summary rows, or an
        empty DataFrame when ``names`` is empty.
    """
    frames = [get_meal_info(person_name=name) for name in names]
    if not frames:
        return pd.DataFrame()
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    dataset_results = pd.concat(frames, ignore_index=True)

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else float("nan")

    key_ls = ["Days", "Total Hours", "Meal Counts", "Eating Hours", "No Eating Hours"]
    summary_rows = []
    for label, reducer in (("total", "sum"), ("average", "mean")):
        # Build each summary row as a plain dict; chained assignment such as
        # ``frame[key].at[0] = value`` is not guaranteed to reach the frame.
        row = {"dataset": label}
        for key in key_ls:
            row[key] = round(getattr(dataset_results[key], reducer)(), 1)
        row["Average Meal Counts Per Day"] = round(ratio(row["Meal Counts"], row["Days"]), 1)
        row["Average Hours Per Meal"] = round(ratio(row["Eating Hours"], row["Meal Counts"]), 1)
        row["Balance Ratio(no_eat/eat)"] = round(ratio(row["No Eating Hours"], row["Eating Hours"]), 1)
        summary_rows.append(row)

    summary = pd.DataFrame(summary_rows, columns=dataset_results.columns)
    return pd.concat([dataset_results, summary], ignore_index=True)

Overwriting dataset.py


# Pytorch Utils

In [4]:
%%writefile utils.py
import datetime
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from sklearn.metrics import classification_report,recall_score, precision_score
from torch import nn
from torch import optim
from tqdm import tqdm


def data_parallel(module, input, device_ids, output_device=None):
    """
    Run ``module`` on ``input`` split across several devices.

    With no ``device_ids`` the module is simply called on the input.
    Otherwise the module is replicated, the input scattered, the replicas
    applied in parallel and the outputs gathered on ``output_device``
    (the first entry of ``device_ids`` when not given).
    """
    if device_ids:
        target = device_ids[0] if output_device is None else output_device
        copies = nn.parallel.replicate(module, device_ids)
        chunks = nn.parallel.scatter(input, device_ids)
        # scatter may produce fewer chunks than devices; use as many replicas
        results = nn.parallel.parallel_apply(copies[:len(chunks)], chunks)
        return nn.parallel.gather(results, target)

    return module(input)


def eval_model(model,dataloader,device="cpu"):
    """
    Evaluate a binary classifier that outputs one logit per sample.

    The model is called as-is; switching it to eval mode is left to the
    caller. A prediction is positive when ``round(sigmoid(logit)) >= 0.5``.

    Parameters:
        model: callable mapping a batch of samples to logits.
        dataloader: iterable of ``(samples, labels)`` tensor batches.
        device: device the batches are moved to.

    Returns:
        (acc, recall): accuracy in percent and recall of the positive class
        as a fraction. Each is 0.0 when it is undefined (no samples, or no
        positive labels).
    """
    correct = 0.
    total = 0.
    TP = 0
    FN = 0
    # inference only: no gradients needed
    with torch.no_grad():
        for samples, labels in dataloader:
            samples = samples.to(device)
            labels = labels.to(device).reshape(-1)
            # reshape(-1) instead of squeeze(): squeeze() collapses a batch
            # of one sample into a 0-d tensor.
            logits = model(samples).reshape(-1)
            preds = (torch.round(torch.sigmoid(logits)) >= 0.5).to(dtype=torch.float)

            positives = labels == 1
            TP += int(((preds == 1) & positives).sum().item())
            FN += int(((preds == 0) & positives).sum().item())
            correct += torch.sum(preds == labels).item()
            total += float(len(labels))

    acc = 100 * correct / total if total else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    return acc, recall
            
            
            
            

def train_model(model,dataloader, optimizer, criterion,lrscheduler,device="cpu" , n_epochs=20,
                earlystopping=True, patience= 5, l1_enabled=True,checkpoint_name ="checkpoint.pt" ):
    """
    Train a binary classifier and keep the checkpoint with the best
    validation accuracy.

    Parameters:
        model: network producing one logit per sample; must provide
            ``l1_loss(weight)`` when ``l1_enabled`` is set.
        dataloader: pair ``(train_dataloader, valid_dataloader)``.
        optimizer, criterion: optimiser and loss (called on logits, labels).
        lrscheduler: scheduler stepped once per epoch with the validation
            accuracy (in percent).
        device: device the batches are moved to.
        n_epochs: maximum number of epochs.
        earlystopping: stop after ``patience`` consecutive epochs whose
            validation accuracy is lower than the previous epoch's.
        patience: see ``earlystopping``.
        l1_enabled: add ``model.l1_loss(0.01)`` to the loss.
        checkpoint_name: file the best model is saved to (whole pickled
            model, via ``torch.save``).

    Returns:
        (model, best_model, best_val_score, loss_ls, train_acc_ls,
        valid_acc_ls). ``best_model`` is reloaded from the checkpoint; when
        no checkpoint was written during this call it is ``model`` itself.
        The three history lists start with a 0.0 placeholder.
    """
    loss_ls = [0.0]
    train_acc_ls = [0.0]
    valid_acc_ls = [0.0]
    patience_count = 0
    best_val_score = 0.0
    prev_val_score = 0.0
    checkpoint_saved = False

    train_dataloader, valid_dataloader = dataloader
    print("Training set batch amounts:", len(train_dataloader))
    print("Test set :", len(valid_dataloader))
    print("Start Training..")

    for e in range(n_epochs):
        running_loss = 0.0
        correct_cnt = 0.0
        total_cnt = 0.0
        TP = 0.
        FN = 0.
        model.train()
        for samples, labels in train_dataloader:
            samples = samples.to(device)
            labels = labels.to(device, dtype=torch.float32).reshape(-1)

            optimizer.zero_grad()
            # reshape(-1) instead of squeeze(): squeeze() collapses a batch
            # of one sample into a 0-d tensor that no longer matches labels.
            outputs = model(samples).reshape(-1)

            loss = criterion(outputs, labels)
            if l1_enabled:
                loss = loss + model.l1_loss(0.01).to(device)
            loss.backward()
            optimizer.step()

            # predictions for the running statistics (no gradient needed)
            preds = (torch.round(torch.sigmoid(outputs.detach())) >= 0.5).to(dtype=torch.float)
            positives = labels == 1.
            TP += ((preds == 1.) & positives).sum().item()
            FN += ((preds == 0.) & positives).sum().item()

            running_loss += loss.item()
            correct_cnt += torch.sum(preds == labels).item()
            total_cnt += float(len(labels))

        # epoch statistics; undefined ratios are reported as 0.0
        n_batches = len(train_dataloader)
        epoch_loss = running_loss / n_batches if n_batches else 0.0
        epoch_acc = 100 * correct_cnt / total_cnt if total_cnt else 0.0  # in percentage
        train_recall = TP / (TP + FN) if (TP + FN) else 0.0

        # validation
        model.eval()
        valid_acc, valid_recall = eval_model(model, valid_dataloader, device=device)

        valid_acc_ls.append(valid_acc)
        train_acc_ls.append(epoch_acc)
        loss_ls.append(epoch_loss)

        print("Epoch: %d,  Epoch_Loss: %.4f, Train Acc: %.4f %%, Train Recall: %.4f, Validation Acc:  %.4f %%,  Validation Recall: %.4f  "%(e, epoch_loss,
                                                                                     epoch_acc,train_recall,valid_acc, valid_recall))

        # back to train mode before the scheduler / next epoch
        model.train()
        lrscheduler.step(valid_acc)

        if earlystopping:
            if prev_val_score != 0. and valid_acc < prev_val_score:
                patience_count += 1
            else:
                patience_count = 0

            if patience_count >= patience:
                break

            prev_val_score = valid_acc

        # Checkpoint the best model whether or not early stopping is on;
        # the best model is reloaded from this file after the loop.
        if valid_acc > best_val_score or best_val_score == 0.0:
            best_val_score = valid_acc
            torch.save(model, checkpoint_name)
            checkpoint_saved = True
            print("Checkpoint Saved")

        print("\n")

    if checkpoint_saved:
        # The checkpoint is a whole pickled model written above by this
        # function. Newer PyTorch releases default to weights_only=True,
        # which rejects it; older releases do not know the keyword.
        # NOTE: never point checkpoint_name at an untrusted file.
        try:
            best_model = torch.load(checkpoint_name, weights_only=False)
        except TypeError:
            best_model = torch.load(checkpoint_name)
        print("Load Best Model.")
    else:
        # nothing was saved in this call (e.g. n_epochs == 0): do not load
        # a stale or missing checkpoint
        best_model = model
    print("Training completed")

    return model, best_model,best_val_score,loss_ls, train_acc_ls, valid_acc_ls
            

def plot_data(train_acc_ls,valid_acc_ls,loss_ls ):
    """
    Show two side-by-side panels: training/validation accuracy per epoch
    (left) and training loss per epoch on a log scale (right).

    The first loss entry is skipped in the right-hand panel.
    """
    fig, axes = plt.subplots(1,2,figsize=(20,5))
    acc_ax, loss_ax = axes[0], axes[1]
    epochs = list(range(len(train_acc_ls)))

    # left panel: both accuracy curves on the same axes
    for series in (train_acc_ls, valid_acc_ls):
        sns.lineplot(x=epochs, y=series, ax=acc_ax)
    acc_ax.set_xlabel("Epoches")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend(["Training Accuracy", "Validation Accuracy"])

    # right panel: loss without its first entry
    sns.lineplot(x=epochs[1:], y=loss_ls[1:], ax=loss_ax)
    loss_ax.set_xlabel("Epoches")
    loss_ax.set_ylabel("Training Loss")
    loss_ax.set(yscale="log")
    plt.show()
    
def split_train_test_indices(X, y, test_size, random_seed = None):
    """
    Split sample indices into a training part and a test/validation part.

    Parameters:
        X: indices of the dataset (or of a subset of it).
        y: labels aligned with ``X`` (list or array).
        test_size: 0 keeps everything for training, 1 puts everything in
            the test part, a value strictly between 0 and 1 performs a
            stratified random split. Any other value prints a message and
            yields two empty lists.
        random_seed: seed for the stratified split.

    Returns:
        (train_indices, test_indices)
    """
    train_indices = []
    test_indices = []
    y_train = []
    y_test = []

    if test_size == 0:
        train_indices = X
        y_train = y
    elif test_size == 1:
        test_indices = X
        y_test = y
    elif 0 < test_size < 1:
        # imported only where needed, so the trivial cases do not require
        # scikit-learn
        from sklearn.model_selection import train_test_split
        train_indices, test_indices, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_seed)
    else:
        print("Invalid split ratio: %.3f"%(test_size))

    for part, indices, part_labels in (("Train", train_indices, y_train),
                                       ("Test", test_indices, y_test)):
        if len(indices) > 0:
            # count element-wise: ``labels == 1`` is a single bool for lists
            positives = sum(1 for v in part_labels if v == 1)
            negatives = sum(1 for v in part_labels if v == 0)
            print("%s set size: %d, with %d positive samples and %d negative samples"%(
                part, len(part_labels), positives, negatives))

    return train_indices, test_indices
            

    
def print_settings(winmin,stridesec, EPOCHS):
    """
    Print a banner describing the job and the training settings.

    The PBS node list, job id and job name are taken from the environment
    variables PBS_NODEFILE, PBS_JOBID and PBS_JOBNAME; outside a PBS job
    they are printed empty.

    Parameters:
        winmin: window length in minutes.
        stridesec: stride in seconds (int or float).
        EPOCHS: number of training epochs (int).
    """
    outfile = sys.stdout

    # window length and stride in samples; 15 is presumably the sampling
    # rate in Hz
    winlength = int(winmin * 60 * 15)
    step = int(stridesec * 15)
    start_time = datetime.datetime.now()

    # Read the PBS information directly instead of shelling out:
    # ``cat $PBS_NODEFILE`` waits on stdin when the variable is unset.
    node_list = ""
    nodefile = os.environ.get("PBS_NODEFILE")
    if nodefile:
        try:
            with open(nodefile, "r") as fp:
                node_list = fp.read().rstrip("\n")
        except OSError:
            # the banner is best effort; an unreadable node file is not fatal
            node_list = ""
    print("PBS: node is " + node_list, file=outfile, flush=True)
    print("PBS: job identifier is " + os.environ.get("PBS_JOBID", ""), file=outfile, flush=True)
    print("PBS: job name is " + os.environ.get("PBS_JOBNAME", ""), file=outfile, flush=True)

    print("*****************************************************************\n", file=outfile, flush=True)
    print("Execution Started at " + start_time.strftime("%m/%d/%Y, %H:%M:%S"), file=outfile, flush=True)
    # stridesec uses "{}" so that fractional strides do not raise
    print("WindowLength: {:.2f} min ({:d} datum)\tSlide: {} ({:d} datum)\tEpochs:{:d}\n".format(winmin, winlength, stridesec, step, EPOCHS), file=outfile, flush=True)
    
    

def cross_validation(dataset, data_indices, model,n_epochs=30,k=5, device="cpu", random_state = 1000, checkpoint_path = "./"  ):
    """
    Stratified k-fold cross-validation of ``model`` on part of ``dataset``.

    For every fold the model's weights are re-initialised with
    ``weights_init``, the model is trained with ``train_model`` and the
    reloaded best checkpoint is scored with ``eval_model`` on the fold's
    validation split.

    Parameters:
        dataset: dataset exposing ``labels`` indexable by an index array.
        data_indices: dataset indices taking part in the cross-validation.
        model: network to train. The SAME object is reused (and modified)
            in every fold; it is not copied.
        n_epochs: maximum epochs per fold.
        k: number of folds.
        device: device used for training and evaluation.
        random_state: seed from which the per-fold torch seeds are drawn
            (also seeds NumPy's global generator).
        checkpoint_path: prefix of the per-fold checkpoint files
            ``cross_valid_checkpoint_<fold>.pt``.

    Returns:
        list with the validation accuracy (percent) of every fold.
    """
    from sklearn.model_selection import StratifiedKFold

    all_valid_acc_ls = []
    data_indices = np.array(data_indices)

    skf = StratifiedKFold(n_splits=k)

    labels = dataset.labels[data_indices]
    np.random.seed(random_state)
    seeds = np.random.randint(low=0, high=1000,size=k)

    for fold_ind, (train_fold, valid_fold) in enumerate(skf.split(data_indices, labels)):
        torch.manual_seed(int(seeds[fold_ind]))

        print("===========================> Running Fold: %d"%(fold_ind))
        print()
        train_indices = data_indices[train_fold]
        valid_indices = data_indices[valid_fold]

        train_set_fold = torch.utils.data.Subset(dataset, train_indices)
        train_loader_fold = torch.utils.data.DataLoader(train_set_fold,batch_size=32, shuffle=True)

        valid_set_fold = torch.utils.data.Subset(dataset, valid_indices)
        valid_loader_fold = torch.utils.data.DataLoader(valid_set_fold,batch_size=32, shuffle=True)

        # Re-initialise the weights in place. NOTE(review): a network with
        # lazily created layers presumably has to be run on one sample
        # before this point - confirm with the model definition.
        cv_model = model
        cv_model.apply(weights_init)
        cv_model.to(device)
        criterion = nn.BCEWithLogitsLoss()

        optimizer = optim.Adam(cv_model.parameters(),lr=0.01,  weight_decay=0.1)
        # train_model steps the scheduler with the validation ACCURACY, so
        # the plateau detection must look for a metric that stops
        # increasing: mode='max', not 'min'.
        lrscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',patience= 2, factor = 0.1,threshold=0.01)

        dataloader = (train_loader_fold, valid_loader_fold)
        cv_model, best_model, val_score, loss_ls, train_acc_ls, valid_acc_ls = train_model(
            cv_model, dataloader, optimizer, criterion,
            lrscheduler, device=device,
            n_epochs=n_epochs, patience=5, l1_enabled=False,
            checkpoint_name=checkpoint_path+"cross_valid_checkpoint_"+str(fold_ind)+".pt")
        best_model.eval()
        valid_acc, recall = eval_model(best_model, valid_loader_fold, device)

        all_valid_acc_ls.append(valid_acc)

        print("Fold %d Completed"%(fold_ind))
    print("Cross Validation Completed，score is %.4f %%"%( np.mean(all_valid_acc_ls)))

    return all_valid_acc_ls
        

    
        
    
    
    

    
    
def weights_init(m):
    """
    Re-initialise one module in place; meant for ``model.apply(weights_init)``.

    The module is selected by its class name:
        * name contains "Conv":      weight ~ N(0, 1); bias left untouched
        * name contains "BatchNorm": weight ~ N(1, 0.02); bias = 0
        * name contains "Linear":    weight ~ N(0, 1); bias = 0
    Modules without a weight/bias tensor (``bias=False``, ``affine=False``,
    or a container whose class name merely contains one of these words)
    are skipped instead of raising.
    """
    classname = m.__class__.__name__
    weight = getattr(m, "weight", None)
    bias = getattr(m, "bias", None)

    if classname.find('Conv') != -1:
        if weight is not None:
            nn.init.normal_(weight.data, 0.0, 1.)
    elif classname.find('BatchNorm') != -1:
        if weight is not None:
            nn.init.normal_(weight.data, 1.0, 0.02)
        if bias is not None:
            nn.init.constant_(bias.data, 0)
    elif classname.find('Linear') != -1:
        if weight is not None:
            nn.init.normal_(weight.data, 0.0, 1.)
        if bias is not None:
            nn.init.constant_(bias.data, 0)

        
        
def test_model(model, winmin=3, stridesec = 15,names= ["wenkanw"],random_seed=1000, split_day=False):
    """
    Evaluate a trained model on a class-balanced test set of every person in `names`.

    model: object exposing a Keras-style `predict(x=...)`; it is assumed to return one
           score per sample that can be compared against 0.5 -- TODO confirm
    winmin: window size, forwarded to Person_MealsDataset (reported as winmin*60 in "win(sec)")
    stridesec: window stride, forwarded to Person_MealsDataset
    names: person names whose data are evaluated
    random_seed: seed for the train/test split and for the under-sampling
    split_day: if True, evaluate on the recordings listed in "test_files";
               otherwise hold out 20% of the windows of "all_files_list"

    return:
        pandas DataFrame with one row per person and the columns
        name, model, win(sec), acc, recall, auc
    """
    from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

    threshold = 0.5
    perf = {"name":[],"model":[],"win(sec)":[], "acc":[],"recall":[], "auc":[]}
    for name in names:
        if split_day:
            meal_data_test = Person_MealsDataset(person_name= name, file_name = "test_files", winmin = winmin,stridesec = stridesec)

            # balance test set (under-sample the majority class)
            testset_labels = meal_data_test.labels
            test_indices = balance_data_indices(testset_labels,data_indices=list(range(len(meal_data_test))) ,mode="under", shuffle=True,random_state = random_seed,replace= False)
            # get numpy dataset
            test_Data, test_Labels = meal_data_test.get_subset(test_indices)
        else:
            meal_data = Person_MealsDataset(person_name= name, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            labels = meal_data.labels
            # split train set and test set; only the held-out part is needed here
            _, test_indices = split_train_test_indices(X= list(range(len(labels))),
                                                       y = labels, test_size = 0.2,
                                                       random_seed = random_seed)
            testset_labels = labels[test_indices]
            test_indices = balance_data_indices(testset_labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)
            test_Data, test_Labels = meal_data.get_subset(test_indices)

        # One score per sample; flatten a possible (N, 1) column into (N,).
        scores = np.ravel(model.predict(x=test_Data))
        predicted = scores >= threshold

        # sklearn metrics take (y_true, y_pred). With the arguments swapped,
        # recall_score would actually report precision.
        acc = accuracy_score(test_Labels, predicted)
        recall = recall_score(test_Labels, predicted)
        # AUC is a ranking metric, so it gets the raw scores rather than the thresholded labels.
        auc = roc_auc_score(test_Labels, scores)
        print("Test Accuracy:", acc)
        print("Recall Accuracy:", recall)
        print("AUC Score:", auc)
        perf["name"].append(name)
        perf["model"].append("ActiModel")
        perf["win(sec)"].append(winmin*60)
        perf["acc"].append(acc)
        perf["recall"].append(recall)
        perf["auc"].append(auc)

    perf_df = pd.DataFrame(perf)
    return perf_df

def train_models_v2(model, win_ls = [],EPOCHS = 10,stridesec = 1,name = "wenkanw",model_name="v2" ,random_seed= 1000, split_day=False):
    """
    Train a Keras-style model once per window size in `win_ls` and score it on a balanced test set.

    Note that the same `model` object is fitted again for every window size (it is not
    re-created in between) and that same object is what gets appended to the returned list.

    model: object exposing Keras-style `fit` and `predict`
    win_ls: window sizes to train on (each one is used as `winmin`)
    EPOCHS: number of training epochs per window size
    stridesec: window stride, forwarded to Person_MealsDataset
    name: person whose data are used; also selects the "../models/<name>_models/" output folder
    model_name: prefix of the checkpoint file name
    random_seed: seed for numpy, random, tensorflow, the data splits and the under-sampling
    split_day: if True, (re)create the train/test file lists and split by recording file;
               otherwise split the windows of "all_files_list" 80/20

    return:
        (perf_df, model_ls, hist_ls): a DataFrame with model, win(sec), acc, recall, auc per
        window size, the list of models and the list of Keras History objects
    """
    from numpy.random import seed
    from datetime  import datetime
    from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

    seed(random_seed)
    random.seed(random_seed)
    batch_size = 128
    threshold = 0.5
    outfile = sys.stdout
    perf = {"model":[],"win(sec)":[], "acc":[],"recall":[], "auc":[]}
    model_ls = []
    hist_ls = []
    for winsize in win_ls:
        tf.random.set_seed(random_seed)
        seed(random_seed)

        winmin = winsize
        winlength = int(winmin * 60 * 15)
        step = int(stridesec * 15)
        start_time = datetime.now()
        # Echo the PBS job information of the node that runs this job.
        arr = ["echo -n 'PBS: node is '; cat $PBS_NODEFILE",\
              "echo PBS: job identifier is $PBS_JOBID",\
              "echo PBS: job name is $PBS_JOBNAME"]
        for cmd in arr:
            os.system(cmd)
        print("*****************************************************************\n", file=outfile, flush=True)
        print("Execution Started at " + start_time.strftime("%m/%d/%Y, %H:%M:%S"), file=outfile, flush=True)
        print("WindowLength: {:.2f} min ({:d} datum)\tSlide: {:d} ({:d} datum)\tEpochs:{:d}\n".format(winmin, winlength, stridesec, step, EPOCHS), file=outfile, flush=True)


        pathtemp = "../models/" + name+"_models" +"/"+model_name+"_M_F_"
        modelpath = pathtemp + "{:f}Min.h5".format(winmin)
        jsonpath = pathtemp + "{:f}Min.json".format(winmin)  # only needed once LossHistory is re-added
        print("Model to Save: ",modelpath)
        print()
        # Load the dataset

        person = name
        if split_day:
            create_train_test_file_list(file_name= "all_files_list.txt",person_name =name,
                         out_path = "../data-file-indices/",root_path= "../",
                         test_ratio = 0.2, print_flag = True, shuffle=True, random_state=random_seed)

            meal_data_train = Person_MealsDataset(person_name= person, file_name = "train_files", winmin = winmin,stridesec = stridesec)
            meal_data_test = Person_MealsDataset(person_name= person, file_name = "test_files", winmin = winmin,stridesec = stridesec)

            train_indices, valid_indices = split_train_test_indices(X= [i for i in range(len(meal_data_train.labels))],
                                                                    y = meal_data_train.labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            #balanced train set
            trainset_labels = meal_data_train.labels[train_indices]
            train_indices = balance_data_indices(trainset_labels,data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # balance test set
            testset_labels = meal_data_test.labels
            test_indices = balance_data_indices(testset_labels,data_indices=[i for i in range(len(meal_data_test))] ,mode="under", shuffle=True,random_state = random_seed,replace= False)
            # get numpy dataset
            balancedData, balancedLabels = meal_data_train.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data_train.get_subset(valid_indices)
            test_Data, test_Labels = meal_data_test.get_subset(test_indices)

        else:

            meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            samples,labels =  meal_data.data_indices, meal_data.labels
            # split train set and test set
            train_indices, test_indices = split_train_test_indices(X= [i for i in range(len(labels))],
                                                                    y = labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            # balance train set
            trainset_labels = labels[train_indices]
            train_indices_balanced = balance_data_indices(trainset_labels,data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)
            #balance test set
            testset_labels = labels[test_indices]
            test_indices = balance_data_indices(testset_labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # The torch subsets/loaders below are not consumed by the Keras fit further down.
            train_set_balanced = torch.utils.data.Subset(meal_data, train_indices_balanced)
            test_set = torch.utils.data.Subset(meal_data, test_indices)

            train_loader = torch.utils.data.DataLoader(train_set_balanced,batch_size=batch_size, shuffle=True,num_workers=2)
            test_loader = torch.utils.data.DataLoader(test_set ,batch_size=batch_size, shuffle=True,num_workers=2)

            print("Data Loader Created")

            # split validation set
            balanced_trainset_labels = labels[train_indices_balanced]
            train_indices, valid_indices = split_train_test_indices(X= train_indices_balanced,
                                                                    y = balanced_trainset_labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            valid_set_balanced = torch.utils.data.Subset(meal_data, valid_indices)
            valid_loader = torch.utils.data.DataLoader(valid_set_balanced,batch_size=batch_size, shuffle=True,num_workers=2)

            balancedData, balancedLabels = meal_data.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data.get_subset(valid_indices)
            test_Data, test_Labels = meal_data.get_subset(test_indices)


        #training settings
        # NOTE(review): the checkpoint is selected on the *training* accuracy -- confirm this is intended.
        mcp_save = tf.keras.callbacks.ModelCheckpoint(modelpath, save_best_only=True, monitor='accuracy')


        scheduler = tf.keras.callbacks.ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=3, verbose=0,
                                             mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.)
        #train model
        H = model.fit(x=balancedData, y = balancedLabels,
                       validation_data=(valid_balancedData, valid_balancedLabels),
                    epochs = EPOCHS, batch_size=batch_size, verbose=1,
                    callbacks=[mcp_save,scheduler]) # removed addons.LossHistory(jsonpath) for compatibility with TensorFlow 2.2.0, needs to be re-added at some point

        print("Max value: ", max(H.history['accuracy']), " at epoch", np.argmax(H.history['accuracy']) + 1)

        # One score per sample; flatten a possible (N, 1) column into (N,).
        scores = np.ravel(model.predict(x=test_Data))
        predicted = scores >= threshold
        # sklearn metrics take (y_true, y_pred). With the arguments swapped,
        # recall_score would actually report precision.
        acc =  accuracy_score(test_Labels, predicted)
        recall = recall_score(test_Labels, predicted)
        # AUC is a ranking metric, so it gets the raw scores rather than the thresholded labels.
        auc = roc_auc_score(test_Labels, scores)
        print("Test Accuracy:", acc)
        print("Recall Accuracy:", recall)
        print("AUC Score:", auc)

        perf["model"].append("ActiModel")
        perf["win(sec)"].append(winmin*60)
        perf["acc"].append(acc)
        perf["recall"].append(recall)
        perf["auc"].append(auc)
        model_ls.append(model)
        hist_ls.append(H)
    perf_df = pd.DataFrame(perf)
    print(perf_df)
    return perf_df, model_ls, hist_ls
    

Overwriting utils.py


# Pytorch Model Utils

In [16]:
%%writefile model.py

import torch
from torch import nn
from torch.nn import functional as F


class Discriminator_BN_Bias(nn.Module):
    """
    1-D convolutional classifier with batch normalization and optional bias terms.

    Layout: conv0 -> ReLU -> conv1 -> BN -> ReLU -> conv2 -> BN -> ReLU -> average pooling
    -> flatten -> linear1 (200 units) -> ReLU -> linear2. `forward` returns the raw output
    of linear2; neither `sigmoid` nor `softmax` is applied there.

    `linear1` is created lazily on the first forward pass because its input size depends on
    the window length. Its parameters do not exist before that first call, so run one sample
    through the network before building an optimizer or loading a state dict.

    ngpu: stored on the instance; not used by this class
    input_shape: (number of data points per window, number of channels)
    out_fea: number of output features of the last linear layer
    bias: whether conv0/conv1/conv2 and linear2 carry a bias term
    """
    def __init__(self, ngpu, input_shape , out_fea = 1, bias=True):
        super(Discriminator_BN_Bias, self).__init__()

        # Only the channel count is needed here; the flattened feature size is
        # discovered in forward() when linear1 is created.
        in_channels = input_shape[1]
        self.in_channels = in_channels
        self.ngpu = ngpu
        self.out_fea = out_fea
        filter_num = 10

        self.conv0 = nn.Conv1d(in_channels = self.in_channels,
                               out_channels = filter_num,
                               kernel_size= 44,  stride= 2, padding=0, bias=bias)
        self.relu0= nn.ReLU()

        self.conv1 = nn.Conv1d(filter_num,filter_num, kernel_size= 20,stride= 2, padding=0, bias=bias)
        self.relu1= nn.ReLU()
        self.bn1 = nn.BatchNorm1d(filter_num)

        self.conv2 = nn.Conv1d(filter_num,filter_num, kernel_size= 4, stride= 2, padding=0, bias=bias)
        self.relu2= nn.ReLU()
        self.bn2 = nn.BatchNorm1d(filter_num)

        self.avgpool = nn.AvgPool1d(kernel_size=10)
        self.flatten = nn.Flatten()
        self.linear1 = None
        self.relu4 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=200, out_features=out_fea, bias=bias)
        # The class scores live on dim 1 of the (batch, out_fea) output, whatever out_fea is.
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

        nn.init.normal_(self.conv0.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv1.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv2.weight.data, 0.0, 1.)
        # linear2 has no bias tensor when the network is built with bias=False.
        if self.linear2.bias is not None:
            nn.init.normal_(self.linear2.bias.data, 0.0, 1.)

    def l1_loss(self,factor=0.01):
        """
        L1 penalty over the parameters of conv0, conv1 and conv2.

        factor: multiplier applied to the summed absolute values
        return: `factor * sum(|p|)`, computed on whatever device the parameters live on
        """
        reg_loss = 0.
        for layer in (self.conv0, self.conv1, self.conv2):
            for p in layer.parameters():
                reg_loss = reg_loss + p.abs().sum()
        return factor * reg_loss

    def forward(self, input):
        """
        input: tensor of shape (batch, data points, channels)
        return: raw output of linear2, shape (batch, out_fea)
        """
        # Conv1d expects (batch, channels, length).
        x = input.permute(0,2,1)
        x = self.conv0(x)
        x = self.relu0(x)
        x = self.conv1(x)
        x=  self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x=  self.bn2(x)
        x = self.relu2(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        if self.linear1 is None:
            self.linear1 = nn.Linear(in_features=x.shape[1], out_features=200, bias=True)
            nn.init.normal_(self.linear1.weight.data, 0.0, 1.)
            nn.init.normal_(self.linear1.bias.data, 0.0, 1.)
            # A freshly built layer lives on the CPU; follow the data so the first
            # forward pass also works after the model was moved to another device.
            self.linear1.to(device=x.device, dtype=x.dtype)

        x = self.relu4(self.linear1(x))
        out = self.linear2(x)

        return out



class BasicBlock(nn.Module):
    """
    Residual block: conv(3) -> BN -> ReLU -> conv(3) -> BN, plus the skip connection, then ReLU.

    Both convolutions use kernel size 3, stride 1 and padding 1, so the sequence length is
    preserved. When `in_channels` differs from `filter_num` the skip connection goes through
    a 1x1 convolution (`shortcut`) so that it can be added to the block output; when they are
    equal the input is added as is and `shortcut` is None.

    in_channels: number of channels of the block input
    bias: whether the two 3-wide convolutions carry a bias term
    filter_num: number of channels produced by the block
    """
    def __init__(self, in_channels ,bias= True, filter_num = 10):
        super(BasicBlock, self).__init__()
        self.conv0 = nn.Conv1d(in_channels = in_channels,
                               out_channels = filter_num,
                               kernel_size= 3,  stride= 1, padding=1, bias=bias)
        self.bn0 = nn.BatchNorm1d(filter_num)
        self.relu0 = nn.ReLU()

        # conv1 consumes the output of conv0, which has filter_num channels.
        self.conv1 = nn.Conv1d(in_channels = filter_num,
                               out_channels = filter_num,
                               kernel_size= 3,  stride= 1, padding=1, bias=bias)
        self.bn1 = nn.BatchNorm1d(filter_num)
        self.relu1 = nn.ReLU()

        if in_channels != filter_num:
            # Project the input so its channel count matches the block output.
            self.shortcut = nn.Conv1d(in_channels = in_channels,
                                      out_channels = filter_num,
                                      kernel_size= 1,  stride= 1, padding=0, bias=False)
        else:
            self.shortcut = None

    def forward(self, x):
        """
        x: tensor of shape (batch, in_channels, length)
        return: tensor of shape (batch, filter_num, length)
        """
        residual = x if self.shortcut is None else self.shortcut(x)
        out = self.conv0(x)
        out = self.bn0(out)
        out = self.relu0(out)
        out = self.conv1(out)
        out = self.bn1(out)

        out = out + residual
        out = self.relu1(out)

        return out

class Discriminator_ResNet(nn.Module):
    """
    1-D convolutional classifier with two residual blocks.

    Layout: conv0 -> ReLU -> conv1 -> BN -> ReLU -> BasicBlock -> BasicBlock -> average pooling
    -> flatten -> linear1 (200 units) -> ReLU -> linear2. `forward` returns the raw output
    of linear2; neither `sigmoid` nor `softmax` is applied there.

    `linear1` is created lazily on the first forward pass because its input size depends on
    the window length. Its parameters do not exist before that first call, so run one sample
    through the network before building an optimizer or loading a state dict.

    ngpu: stored on the instance; not used by this class
    input_shape: (number of data points per window, number of channels)
    out_fea: number of output features of the last linear layer
    """
    def __init__(self, ngpu, input_shape , out_fea = 1):
        super(Discriminator_ResNet, self).__init__()

        # Only the channel count is needed here; the flattened feature size is
        # discovered in forward() when linear1 is created.
        in_channels = input_shape[1]
        self.in_channels = in_channels
        self.ngpu = ngpu
        self.out_fea = out_fea
        filter_num = 10

        self.conv0 = nn.Conv1d(in_channels = self.in_channels,
                               out_channels = filter_num,
                               kernel_size= 44,  stride= 2, padding=0, bias=True)
        self.relu0= nn.ReLU()

        self.conv1 = nn.Conv1d(filter_num,filter_num, kernel_size= 20,stride= 2, padding=0, bias=True)
        self.relu1= nn.ReLU()
        self.bn1 = nn.BatchNorm1d(filter_num)

        self.block1 =  BasicBlock( in_channels=filter_num  ,bias= True, filter_num = filter_num)
        self.block2 =  BasicBlock( in_channels=filter_num  ,bias= True, filter_num = filter_num)

        self.avgpool = nn.AvgPool1d(kernel_size=10)
        self.flatten = nn.Flatten()
        self.linear1 = None
        self.relu2 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=200, out_features=out_fea, bias=True)
        # The class scores live on dim 1 of the (batch, out_fea) output, whatever out_fea is.
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

        nn.init.normal_(self.conv0.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv1.weight.data, 0.0, 1.)
        nn.init.normal_(self.linear2.bias.data, 0.0, 1.)

    def l1_loss(self,factor=0.01):
        """
        L1 penalty over the parameters of conv0 and conv1.

        factor: multiplier applied to the summed absolute values
        return: `factor * sum(|p|)`, computed on whatever device the parameters live on
        """
        reg_loss = 0.
        for layer in (self.conv0, self.conv1):
            for p in layer.parameters():
                reg_loss = reg_loss + p.abs().sum()
        return factor * reg_loss

    def forward(self, input):
        """
        input: tensor of shape (batch, data points, channels)
        return: raw output of linear2, shape (batch, out_fea)
        """
        # Conv1d expects (batch, channels, length).
        x = input.permute(0,2,1)
        x = self.conv0(x)
        x = self.relu0(x)
        x = self.conv1(x)
        x=  self.bn1(x)
        x = self.relu1(x)
        x = self.block1(x)
        x = self.block2(x)

        x = self.avgpool(x)
        x = self.flatten(x)
        if self.linear1 is None:
            self.linear1 = nn.Linear(in_features=x.shape[1], out_features=200, bias=True)
            nn.init.normal_(self.linear1.weight.data, 0.0, 1.)
            nn.init.normal_(self.linear1.bias.data, 0.0, 1.)
            # A freshly built layer lives on the CPU; follow the data so the first
            # forward pass also works after the model was moved to another device.
            self.linear1.to(device=x.device, dtype=x.dtype)

        x = self.relu2(self.linear1(x))
        out = self.linear2(x)

        return out
    
    
from torch import nn
class Discriminator(nn.Module):
    """
    1-D convolutional classifier without batch normalization and without convolution biases.

    Layout: conv0 -> ReLU -> conv1 -> ReLU -> conv2 -> ReLU -> average pooling -> flatten
    -> linear1 (200 units) -> ReLU -> linear2. `forward` returns the raw output of linear2;
    neither `sigmoid` nor `softmax` is applied there.

    `linear1` is created lazily on the first forward pass because its input size depends on
    the window length. Its parameters do not exist before that first call, so run one sample
    through the network before building an optimizer or loading a state dict.

    ngpu: stored on the instance; not used by this class
    input_shape: (number of data points per window, number of channels)
    out_fea: number of output features of the last linear layer
    device: stored as `self.device`; the computations themselves follow the device of the
            parameters and of the input
    """
    def __init__(self, ngpu, input_shape , out_fea = 1, device="cpu"):
        super(Discriminator, self).__init__()
        self.device =device
        # Only the channel count is needed here; the flattened feature size is
        # discovered in forward() when linear1 is created.
        in_channels = input_shape[1]
        self.in_channels = in_channels
        self.ngpu = ngpu
        self.out_fea = out_fea
        filter_size = 10
        self.conv0 = nn.Conv1d(in_channels = self.in_channels,
                               out_channels = filter_size,
                               kernel_size= 44,  stride= 2, padding=0, bias=False)
        self.relu0= nn.ReLU()

        self.conv1 = nn.Conv1d(filter_size,filter_size, kernel_size= 20,stride= 2, padding=0, bias=False)
        self.relu1= nn.ReLU()

        self.conv2 = nn.Conv1d(filter_size,filter_size, kernel_size= 4, stride= 2, padding=0, bias=False)
        self.relu2= nn.ReLU()

        self.avgpool = nn.AvgPool1d(kernel_size=10)
        self.flatten = nn.Flatten()
        self.linear1 = None
        self.relu4 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=200, out_features=out_fea, bias=True)
        # The class scores live on dim 1 of the (batch, out_fea) output, whatever out_fea is.
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

        nn.init.normal_(self.conv0.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv1.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv2.weight.data, 0.0, 1.)
        nn.init.normal_(self.linear2.bias.data, 0.0, 1.)

    def l1_loss(self,factor=0.01):
        """
        L1 penalty over the parameters of conv1 and conv2 (conv0 is not included).

        factor: multiplier applied to the summed absolute values
        return: `factor * sum(|p|)`, computed on whatever device the parameters live on
        """
        reg_loss = 0.
        for layer in (self.conv1, self.conv2):
            for p in layer.parameters():
                # Summing |p| directly keeps the penalty on the parameter's own device,
                # even if the model was moved after construction.
                reg_loss = reg_loss + p.abs().sum()
        return factor * reg_loss

    def forward(self, input):
        """
        input: tensor of shape (batch, data points, channels)
        return: raw output of linear2, shape (batch, out_fea)
        """
        # Conv1d expects (batch, channels, length).
        x = input.permute(0,2,1)
        x = self.conv0(x)
        x = self.relu0(x)
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        if self.linear1 is None:
            self.linear1 = nn.Linear(in_features=x.shape[1], out_features=200, bias=True)
            nn.init.normal_(self.linear1.weight.data, 0.0, 1.)
            nn.init.normal_(self.linear1.bias.data, 0.0, 1.)
            # A freshly built layer lives on the CPU; follow the data so the first
            # forward pass also works after the model was moved to another device.
            self.linear1.to(device=x.device, dtype=x.dtype)

        x = self.relu4(self.linear1(x))
        out = self.linear2(x)

        return out


Overwriting model.py


# Tensorflow utils

In [2]:
%%writefile tf_utils.py

import numpy as np
import pandas as pd
import torch
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from data_loader import loadEvents
def get_meal_info(person_name = None,file_ls = [], file_ls_doc=None,root_path = "../data/",print_file=False,round_decimal=1):
    """
    Summarize the recordings of one person: number of days, recorded hours and meal statistics.

    person_name: name of the person; also selects
                 "../data-file-indices/<person_name>/all_files_list.txt"
    file_ls: recording files (relative to root_path) to summarize; if empty, the list is read
             from the person's all_files_list.txt
    file_ls_doc: currently unused
    root_path: folder that is prepended to every file name
    print_file: forwarded to loadEvents
    round_decimal: number of decimals kept in the reported values

    return:
        if person_name is None: the tuple (0, 0, 0, 0, 0)
        otherwise a one-row pandas DataFrame with the columns
        dataset, Days, Total Hours, Meal Counts, Average Meal Counts Per Day,
        Average Hours Per Meal, Eating Hours, No Eating Hours, Balance Ratio(no_eat/eat).
        Ratios whose denominator is zero (no files, no meals or no eating time) are NaN.
    """
    if person_name is None:
        return 0, 0, 0, 0, 0

    def safe_ratio(numerator, denominator):
        # Report "undefined" as NaN rather than crashing on an empty dataset.
        return numerator/denominator if denominator else float("nan")

    # Use the caller's list when one is given (without modifying it), else the index file.
    recordings = [entry for entry in file_ls if entry]
    if not recordings:
        data_indices_file = "../data-file-indices/" +person_name+"/all_files_list.txt"
        with open(data_indices_file,"r") as fp:
            recordings = [line for line in fp.read().split("\n") if line]

    # One file is counted as one day.
    day_counts = len(recordings)
    meal_counts = 0
    sec_counts = 0
    total_sec = 0

    for file_name in recordings:
        TotalEvents, EventStart, EventEnd, EventNames, TimeOffset,EndTime = loadEvents(root_path + file_name, debug_flag = False, print_file=print_file)
        meal_counts += TotalEvents
        total_sec +=  abs(EndTime - TimeOffset)
        for start, end in zip(EventStart, EventEnd):
            # presumably event boundaries are sample indices at 15 samples per second -- TODO confirm
            sec_counts += (end - start)//15

    total_hours = total_sec/(60*60)
    min_counts = sec_counts/60
    hour_counts = min_counts/60
    average_meal_per_day = safe_ratio(meal_counts, day_counts)
    average_hour_per_meal = safe_ratio(hour_counts, meal_counts)
    # round numbers
    total_hours = round(total_hours, round_decimal)
    hour_counts = round(hour_counts, round_decimal)
    average_meal_per_day = round(average_meal_per_day,round_decimal)
    average_hour_per_meal = round(average_hour_per_meal, round_decimal)

    # Derived from the already rounded hour values.
    no_eating_hours = total_hours - hour_counts
    weight_ratio = round(safe_ratio(no_eating_hours, hour_counts), round_decimal)
    result = pd.DataFrame({"dataset": person_name,"Days":day_counts,
                  "Total Hours":total_hours,"Meal Counts":meal_counts,
                  "Average Meal Counts Per Day":average_meal_per_day,"Average Hours Per Meal": average_hour_per_meal,
                  "Eating Hours":hour_counts, "No Eating Hours":no_eating_hours,
                 "Balance Ratio(no_eat/eat)":weight_ratio},index=[0])

    return result

          
def get_dataset_info(names= ["wenkanw"],winmin=6,stridesec=5):
    """
    Collect the meal statistics of several people and append a "total" and an "average" row.

    names: person names passed one by one to get_meal_info
    winmin: currently unused
    stridesec: currently unused

    return:
        pandas DataFrame with one row per person followed by a "total" row and an "average"
        row (see the "dataset" column). In the two summary rows the count/hour columns hold
        the sum or the mean over all people, and the three ratio columns are recomputed from
        those summary values. An empty DataFrame is returned when `names` is empty.
    """
    per_person = [get_meal_info(person_name=name) for name in names]
    if not per_person:
        return pd.DataFrame()
    dataset_results = pd.concat(per_person, ignore_index=True)

    # Columns that are aggregated directly; the ratio columns are derived from them below.
    key_ls = ["Days","Total Hours","Meal Counts","Eating Hours","No Eating Hours"]
    aggregators = (("total", lambda column: column.sum()),
                   ("average", lambda column: column.mean()))

    summary_rows = []
    for label, aggregate in aggregators:
        row = {column: np.nan for column in dataset_results.columns}
        row["dataset"] = label
        for key in key_ls:
            row[key] = round(aggregate(dataset_results[key]), 1)
        row["Average Meal Counts Per Day"] = round(row["Meal Counts"]/row["Days"], 1)
        row["Average Hours Per Meal"] = round(row["Eating Hours"]/row["Meal Counts"], 1)
        row["Balance Ratio(no_eat/eat)"] = round(row["No Eating Hours"]/row["Eating Hours"], 1)
        summary_rows.append(row)

    summary = pd.DataFrame(summary_rows, columns=dataset_results.columns)
    return pd.concat([dataset_results, summary], ignore_index=True)







from dataset import create_train_test_file_list,  balance_data_indices  #Person_MealsDataset,
from utils import *
from model import *
def train_models(model, win_ls = [],EPOCHS = 10,stridesec = 5,name = "wenkanw",model_name="acti_6min" ,
                 random_seed= 1000, split_day=False,test_balanced=False,
                create_file_ls = False):
    """
    Train a Keras-style model using a train/test split, once per window size in `win_ls`.

    Note that the same `model` object is fitted again for every window size (it is not
    re-created in between) and that same object is what gets appended to the returned list.

    NOTE(review): this cell does not import os, sys, random, tf, Person_MealsDataset or
    split_train_test_indices itself; presumably they arrive through `from utils import *`
    -- verify.

    model: object exposing Keras-style `fit` and `predict`
    win_ls: window sizes to train on (each one is used as `winmin`)
    EPOCHS: number of training epochs per window size
    stridesec: window stride, forwarded to Person_MealsDataset
    name: person whose data are used; also selects the "../models/<name>_models/" output folder
    model_name: prefix of the checkpoint file name
    random_seed: seed for numpy, random, tensorflow, the data splits and the under-sampling
    split_day: if True, train on "train_files" and test on "test_files";
               otherwise split the windows of "all_files_list" 80/20
    test_balanced: if True, under-sample the test set to balance the classes
    create_file_ls: if True, (re)create the train/test file lists first

    return:
        (perf_df, model_ls, hist_ls): a DataFrame with model, data, win(sec), wacc, f1, recall,
        acc per window size, the list of models and the list of Keras History objects
    """
    from numpy.random import seed
    from datetime  import datetime
    from sklearn.metrics import accuracy_score, recall_score, balanced_accuracy_score, f1_score

    seed(random_seed)
    random.seed(random_seed)
    batch_size = 128
    threshold = 0.5
    outfile = sys.stdout
    perf = {"model":[],"data":[],"win(sec)":[], "wacc":[],"f1":[],"recall":[],"acc":[]}
    model_ls = []
    hist_ls = []
    for winsize in win_ls:
        tf.random.set_seed(random_seed)
        seed(random_seed)

        winmin = winsize
        winlength = int(winmin * 60 * 15)
        step = int(stridesec * 15)
        start_time = datetime.now()
        # Echo the PBS job information of the node that runs this job.
        arr = ["echo -n 'PBS: node is '; cat $PBS_NODEFILE",\
              "echo PBS: job identifier is $PBS_JOBID",\
              "echo PBS: job name is $PBS_JOBNAME"]
        for cmd in arr:
            os.system(cmd)
        print("*****************************************************************\n", file=outfile, flush=True)
        print("Execution Started at " + start_time.strftime("%m/%d/%Y, %H:%M:%S"), file=outfile, flush=True)
        print("WindowLength: {:.2f} min ({:d} datum)\tSlide: {:d} ({:d} datum)\tEpochs:{:d}\n".format(winmin, winlength, stridesec, step, EPOCHS), file=outfile, flush=True)


        if split_day:
            pathtemp = "../models/" + name+"_models" +"/"+model_name+"_split_day_M_F_"
        else:
            pathtemp = "../models/" + name+"_models" +"/"+model_name+"_M_F_"

        modelpath = pathtemp + "{:f}Min.h5".format(winmin)
        jsonpath = pathtemp + "{:f}Min.json".format(winmin)  # only needed once LossHistory is re-added

        print("Model to Save: ",modelpath)
        print()
        ########### Load the dataset################
        person = name
        if create_file_ls:
            create_train_test_file_list(file_name= "all_files_list.txt",person_name =name,
                         out_path = "../data-file-indices/",root_path= "../",
                         test_ratio = 0.2, print_flag = True, shuffle=True, random_state=random_seed)

        if split_day:
            meal_data_train = Person_MealsDataset(person_name= person, file_name = "train_files", winmin = winmin,stridesec = stridesec)
            meal_data_test = Person_MealsDataset(person_name= person, file_name = "test_files", winmin = winmin,stridesec = stridesec)

            train_indices, valid_indices = split_train_test_indices(X= [i for i in range(len(meal_data_train.labels))],
                                                                    y = meal_data_train.labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            #balanced train set
            trainset_labels = meal_data_train.labels[train_indices]
            train_indices = balance_data_indices(trainset_labels,data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # test set: balanced only on request
            testset_labels = meal_data_test.labels
            if test_balanced:
                test_indices = balance_data_indices(testset_labels,data_indices=[i for i in range(len(meal_data_test))] ,mode="under", shuffle=True,random_state = random_seed,replace= False)
            else:
                # without balancing data
                test_indices = [i for i in range(len(meal_data_test))]

            # get numpy dataset
            balancedData, balancedLabels = meal_data_train.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data_train.get_subset(valid_indices)
            test_Data, test_Labels = meal_data_test.get_subset(test_indices)

        else:

            meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            samples,labels =  meal_data.data_indices, meal_data.labels
            # split train set and test set
            train_indices, test_indices = split_train_test_indices(X= [i for i in range(len(labels))],
                                                                    y = labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            # balance train set
            trainset_labels = labels[train_indices]
            train_indices_balanced = balance_data_indices(trainset_labels,data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # test set: balanced only on request, otherwise used as split
            testset_labels = labels[test_indices]
            if test_balanced:
                test_indices = balance_data_indices(testset_labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # The torch subsets/loaders below are not consumed by the Keras fit further down.
            train_set_balanced = torch.utils.data.Subset(meal_data, train_indices_balanced)
            test_set = torch.utils.data.Subset(meal_data, test_indices)

            train_loader = torch.utils.data.DataLoader(train_set_balanced,batch_size=batch_size, shuffle=True,num_workers=2)
            test_loader = torch.utils.data.DataLoader(test_set ,batch_size=batch_size, shuffle=True,num_workers=2)

            print("Data Loader Created")

            # split validation set
            balanced_trainset_labels = labels[train_indices_balanced]
            train_indices, valid_indices = split_train_test_indices(X= train_indices_balanced,
                                                                    y = balanced_trainset_labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            valid_set_balanced = torch.utils.data.Subset(meal_data, valid_indices)
            valid_loader = torch.utils.data.DataLoader(valid_set_balanced,batch_size=batch_size, shuffle=True,num_workers=2)

            # Get numpy dataset: balanced trainset, validation set, test set
            balancedData, balancedLabels = meal_data.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data.get_subset(valid_indices)
            test_Data, test_Labels = meal_data.get_subset(test_indices)


        #training settings
        # NOTE(review): the checkpoint is selected on the *training* accuracy -- confirm this is intended.
        mcp_save = tf.keras.callbacks.ModelCheckpoint(modelpath, save_best_only=True, monitor='accuracy')


        scheduler = tf.keras.callbacks.ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=3, verbose=0,
                                             mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.)

        ##########train model ###############
        H = model.fit(x=balancedData, y = balancedLabels,
                       validation_data=(valid_balancedData, valid_balancedLabels),
                    epochs = EPOCHS, batch_size=batch_size, verbose=1,
                    callbacks=[mcp_save,scheduler]) # removed addons.LossHistory(jsonpath) for compatibility with TensorFlow 2.2.0, needs to be re-added at some point

        print("Max value: ", max(H.history['accuracy']), " at epoch", np.argmax(H.history['accuracy']) + 1)

        predictions = model.predict(x=test_Data)
        predicted = predictions>=threshold
        wacc =  balanced_accuracy_score(test_Labels,predicted)
        f1 =  f1_score(test_Labels,predicted)
        acc =  accuracy_score(test_Labels,predicted)
        recall = recall_score(test_Labels,predicted)

        print("Weighted Accuracy:", wacc)
        print("Test Accuracy:", acc)
        print("F1-score:", f1)
        print("Recall Accuracy:", recall)

        perf["model"].append("ActiModel")
        perf["data"].append(name)
        perf["win(sec)"].append(winmin*60)
        perf["wacc"].append(wacc)
        perf["f1"].append(f1)
        perf["acc"].append(acc)
        perf["recall"].append(recall)
        model_ls.append(model)
        hist_ls.append(H)
    perf_df = pd.DataFrame(perf)
    print(perf_df)
    return perf_df, model_ls, hist_ls





from collections import defaultdict
# Names of the individual datasets that are summarised below.
name_ls = ["wenkanw",'adam',"lawler","shaurya"]
# Summary table for those datasets (get_dataset_info is defined elsewhere;
# presumably it returns one row per name plus a "total" row -- TODO confirm).
dataset_info = get_dataset_info(names= name_ls)
# no_eat/eat ratio taken from the "total" row; used as the default `weight`
# argument of weight_accuracy.
balance_ratio = dataset_info[dataset_info["dataset"]=="total"]['Balance Ratio(no_eat/eat)'].values[0]

def weight_accuracy(y_true, y_pred,weight= balance_ratio):
    """
    Accuracy in which every positive (label 1) sample counts `weight` times.

    y_true, y_pred: arrays of 0/1 labels supporting element-wise `==` and `&`
    weight: weight of the positive class (defaults to the module-level
        balance_ratio)

    Prints the confusion counts and returns
    (weight*TP + TN) / (weight*(TP+FN) + (TN+FP)).
    """
    truly_pos, truly_neg = (y_true==1), (y_true==0)
    called_pos, called_neg = (y_pred==1), (y_pred==0)

    TP = sum(truly_pos & called_pos)
    FN = sum(truly_pos & called_neg)
    TN = sum(truly_neg & called_neg)
    FP = sum(truly_neg & called_pos)
    print("TP: ",TP, "FP: ",FP, "TN: ",TN, "FN: ",FN)

    weighted_correct = weight*TP + TN
    weighted_total = weight*(TP+FN) + (TN+FP)
    return weighted_correct/weighted_total

def test_models_time_metric(winmin=1, stridesec = 5,names= ["wenkanw"],random_seed=1000, split_day=False, test_balance=False, test_CAD=False):
    """
    Test time metrics
    """
    perf = defaultdict(list)
    meal_info = defaultdict(list)
    # Don't test on CAD test set if it is not enabled
    if not test_CAD and "CAD" in names:
        names.remove("CAD")
        
    for name in names:
        person = name
        if split_day:
            meal_data = Person_MealsDataset(person_name= person, file_name = "test_files", winmin = winmin,stridesec = stridesec)

            # balance test set
            testset_labels = meal_data.labels
            if test_balance:
                test_indices = balance_data_indices(testset_labels,data_indices=[i for i in range(len(meal_data))] ,mode="under", shuffle=True,random_state = random_seed,replace= False)
            else:
                test_indices = [i for i in range(len(meal_data))]
            # get numpy dataset
            test_Data, test_Labels = meal_data.get_subset(test_indices)
        else:            
            meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            samples,labels =  meal_data.data_indices, meal_data.labels
            # split train set and test set
            train_indices, test_indices = split_train_test_indices(X= [i for i in range(len(labels))],
                                                                            y = labels, test_size = 0.2,
                                                                           random_seed = random_seed)
            
            if test_balance:
                testset_labels = labels[test_indices]
                test_indices = balance_data_indices(testset_labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)
            else:
                test_indices = test_indices
            testset_labels = labels[test_indices]
            print("Testing on : ", sum(testset_labels==1),"positive samples, ",sum(testset_labels==0)," negative samples" )
            test_Data, test_Labels = meal_data.get_subset(test_indices)
        if name !="CAD":
            meal_counts, min_counts,hour_counts, day_counts,total_hours = meal_data.get_mealdataset_info(person_name=name, file_ls_doc="all_files_list.txt")
        else:
            # data from paper
            meal_counts, min_counts,hour_counts, day_counts,total_hours = 1063, 250*60,250, 354, 4680
        meal_info["dataset"].append(name)
        meal_info["Days"].append(day_counts)
        meal_info["Meal_Hours"].append(round(hour_counts,1)) 
        meal_info["Meal_Counts"].append(meal_counts) 
        meal_info["Total_Hours"].append(total_hours) 
        
        perf["dataset"].append(name)
        perf["win(sec)"].append(winmin*60)
        
            
        
        
        from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, f1_score, balanced_accuracy_score, precision_score
        group_model_W  = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')
        
        if name != "CAD":
            if split_day:
                individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_split_day_M_F_6.000000Min.h5')
            else:    
                individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_M_F_6.000000Min.h5')
        models = {"suffix":['Individual-Model','GroupModel'],  "model":[individual_model,group_model_W]}
        
        for i in range(len(models["suffix"])):
            suffix = models["suffix"][i]
            model = models["model"][i]
            # if the dataset is CAD group dataset and model is individual model
            # we don't need to make prediction on that data
            if name == "CAD" and suffix =='Individual-Model':
                acc = None
                auc = None
                recall = None
            else:

                predictions = model.predict(x=test_Data).squeeze(1)
                threshold = 0.5
                prediction = (predictions>=threshold).astype(int)
                wacc =  balanced_accuracy_score(test_Labels,prediction)
                acc =  accuracy_score(test_Labels,prediction)
                recall = recall_score(test_Labels,prediction)
                f1 = f1_score(test_Labels,prediction)
                precision = precision_score(test_Labels,prediction)
                print("Test label: ",test_Labels)
                print("Predictions:",prediction)
                
                # weighted accuracy 2 is computed by (weight*TP +TN)/(weight*(TP+FN) + (TN+FP))
                wacc2 = weight_accuracy(test_Labels,prediction)
            
            
            print("Weighted Accuracy:", wacc)
            print("Weighted Accuracy2:", wacc2)
            print("Recall:", recall)
            print("Precision:", precision)
            print("F1:", f1)
            print("Test Accuracy:", acc)
            
            perf["WAcc: "+suffix].append(wacc)
            perf["WAcc2: "+suffix].append(wacc2)
            perf["Recall: "+suffix].append(recall)
            perf["Precision: "+suffix].append(precision)
            perf["F1: "+suffix].append(f1)
            perf["Acc: "+suffix].append(acc)

    meal_info = pd.DataFrame(meal_info)
    perf_df = pd.DataFrame(perf)
    return meal_info, perf_df


def print_time_metrics(result, round_decimal = 3):
    """
    Format the time-metric table and add an "average performance" row.

    result: DataFrame of metrics with string column names (e.g. the perf_df
        returned by test_models_time_metric)
    round_decimal: number of decimals kept in the numeric columns

    The returned DataFrame holds the columns whose name contains "dataset",
    "win(sec)", "WAcc", "F1", "Precision" or "Recall" (in that order),
    followed by the plain "Acc" columns.  A last row contains the rounded
    column means, with "average performance" / "-" in the "dataset" /
    "win(sec)" columns.
    """
    column_groups = ["dataset","win(sec)","WAcc", "F1","Precision","Recall"]
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')

    def rounded(column):
        # Only round real numeric columns; object columns (names, None) pass through.
        if column.dtype in numerics:
            return np.round(column, round_decimal)
        return column

    perf_df = pd.DataFrame()
    for group in column_groups:
        for key in result.keys():
            if group in key:
                perf_df[key] = rounded(result[key])
    # "Acc" is a substring of "WAcc", so the weighted columns are excluded here.
    for key in result.keys():
        if "WAcc" not in key and "Acc" in key:
            perf_df[key] = rounded(result[key])

    # Build the summary row as a plain dict: DataFrame.append no longer exists
    # in pandas >= 2.0 and chained `.at` assignment does not reliably write back.
    summary = {key: None for key in perf_df.keys()}
    summary["dataset"] = "average performance"
    summary["win(sec)"] = "-"
    for key in perf_df.keys():
        if key.lower() != "dataset" and key.lower() != "win(sec)":
            summary[key] = np.round(perf_df[key].mean(), round_decimal)

    return pd.concat([perf_df, pd.DataFrame([summary])], ignore_index=True)



def hysteresis_threshold(model, data,start_threshold=0.8, end_threshold=0.4, winmin = 6,
                        stepsec=5, episode_min = 1.,t_pause = 900):
    """
    Segment each day of `data` into eating episodes with a two-threshold
    (hysteresis) detector applied to the model's per-window probabilities.

    model: tensorflow model; model(samples) must return a tensor of shape
        (n_windows, 1) that offers .numpy()
    data:  This dataset must be the self-defined class of Person_MealsDataset in dataset.py / a pytorch dataset without using shuffle.
    Keep the order of dataset after extracting window samples!  You can also define your own dataset using class object to create the interface
    (it needs `data_indices`, whose first column is the day, and `get_subset`)

    start_threshold: the high threshold of the beginning of segmentation

    end_threshold: the end threshold of the end of segmentation

    winmin: size of a window sample in unit of minute (not used by the
        segmentation itself; kept for interface compatibility)

    stepsec: stride to move the window in unit of second / the number of second between two adjacent window samples

    episode_min: the minimum length of eating episode in unit of minute. If end of segmentation -start of segmentation < episode_min,
        then the episode will not be counted

    t_pause: seconds the probability has to stay below start_threshold after
        an episode ended before the episode is closed

    Returns a DataFrame with one row per day, ordered by ascending day, with
    the columns day, stepsec, segment_start, segment_end (window indices,
    end exclusive), proba, predictions, labels and segment_count.
    """
    IDLE, EATING, PAUSED = 0, 1, 2
    result_ls = []

    day_column = data.data_indices[:,0]
    # np.unique yields the days in ascending order.  Iterating a plain set has
    # no guaranteed order, while get_episode_metrics pairs the rows of this
    # result with the ground truth by position.
    for day in np.unique(day_column):
        # Select and Extract the data and labels of the corresponding day from the whole dataset
        sample_indices= np.where(day_column==day)[0]
        result = {'day':day,"stepsec": stepsec,'segment_start':[], 'segment_end':[],'proba':[],'predictions':np.zeros([len(sample_indices)]),'labels':[],"segment_count":0}

        # get the numpy array of samples and labels
        samples, labels = data.get_subset(sample_indices)
        # One forward pass and one tensor->numpy conversion for the whole day.
        probas = model(samples).numpy()[:, 0]
        state = IDLE
        start = 0
        end = 0
        pause_counter = 0
        # one day data
        print("Day: ",day)
        for i in range(len(sample_indices)):
            proba = probas[i]
            result['proba'].append(proba)
            result['labels'].append(labels[i])

            if state == IDLE and proba > start_threshold:
                state = EATING
                start = i
            elif state == EATING and proba <end_threshold:
                state = PAUSED
                end = i+1
                pause_counter = 0
            elif state == PAUSED:
                if proba > start_threshold:
                    # eating resumed before the pause ran out: same episode
                    state = EATING
                else:
                    pause_counter += stepsec
                    if pause_counter >= t_pause:
                        # convert time to second and check threshold
                        if (end-start)*stepsec >= episode_min*60:
                            # save data
                            result['segment_start'].append(start)
                            result['segment_end'].append(end)
                            result['segment_count'] += 1
                            result['predictions'][start:end] = 1
                        end = 0
                        state = IDLE
        # The day ran out while an episode was still open.  It is only kept if
        # an end was already seen (no episode_min check here); an episode that
        # never dropped below end_threshold is discarded.
        # NOTE(review): confirm that dropping such open episodes is intended.
        if state != IDLE and end != 0:
            result['segment_start'].append(start)
            result['segment_end'].append(end)
            result['predictions'][start:end] = 1
            result['segment_count'] += 1

        result_ls.append(result)
        print("Segmentation Completed. ")

    return pd.DataFrame(result_ls)




def get_episode_metrics(result, meal_data, sample_rate=15):
    """
    Compute episode-level detection metrics.

    result: result from hysteresis threshold function, one row per day with
        the columns 'segment_start', 'segment_end' (window indices) and
        'stepsec'
    meal_data: meal dataset of Person_MealData; provides get_GT_segment()
        (per-day lists of ground-truth start / end data-point indices) and
        get_mealdataset_info()
    sample_rate: data points per second used to convert window indices into
        data-point indices (default 15, the factor that was hard-coded
        before; presumably the sensor rate in Hz)

    A ground-truth meal is a true positive when at least one detected segment
    overlaps it, otherwise a false negative; a detected segment overlapping
    no meal is a false positive.

    Returns a one-row DataFrame with the columns TPR, FP/TP, TP, FP, FN.
    TPR is None when there is no ground-truth meal, FP/TP is None when TP is 0.
    """
    perf = {"TPR":[],"FP/TP":[],"TP":[], "FP":[],"FN":[]}
    TP = 0
    FP = 0
    FN = 0
    # get segmentation of ground truth labeled by user
    start_ls, end_ls = meal_data.get_GT_segment()
    meal_counts = meal_data.get_mealdataset_info()[0]

    # iterate every day
    for i in range(len(result)):
        day_result = result.iloc[i]
        event_start, event_end = start_ls[i], end_ls[i]
        # convert detected segments from window index to index of data point
        points_per_step = day_result['stepsec'] * sample_rate
        detect_start = [s * points_per_step for s in day_result['segment_start']]
        detect_end = [e * points_per_step for e in day_result['segment_end']]

        meal_found = np.zeros(len(event_start), dtype=bool)      # default: every meal missed (FN)
        detection_ok = np.zeros(len(detect_start), dtype=bool)   # default: every detection wrong (FP)
        for m, (e_s, e_e) in enumerate(zip(event_start, event_end)):
            # e_s: event start,  e_e: event end
            # d_s: detection start,  d_e: detection end
            for d, (d_s, d_e) in enumerate(zip(detect_start, detect_end)):
                if (e_s>=d_s and e_s <= d_e) or (d_s>= e_s and d_s<= e_e):
                    meal_found[m] = True
                    detection_ok[d] = True
        TP += int(np.count_nonzero(meal_found))
        FN += int(np.count_nonzero(~meal_found))
        FP += int(np.count_nonzero(~detection_ok))

    print("total_meal:",meal_counts, "TP: ", TP, "FP: ", FP, "FN: ", FN)
    # Guard both ratios against an empty denominator instead of raising.
    perf['TPR'].append(TP/(TP+FN) if (TP+FN) > 0 else None)
    perf['FP/TP'].append(FP/TP if TP > 0 else None)
    perf["TP"].append(TP)
    perf["FP"].append(FP)
    perf["FN"].append(FN)

    return pd.DataFrame(perf)


from collections import defaultdict

def test_models_episode_metric(winmin=6, stridesec = 5,names= ["wenkanw"],random_seed=1000, test_balance=False, test_CAD=False,test_alldata=False):
    perf = defaultdict(list)
    meal_info = defaultdict(list)
    threshold = {'wenkanw':[0.8, 0.4], 'adam':[0.8,0.4],'lawler':[0.8,0.4], 'shaurya':[0.8,0.4]}
    # Don't test on CAD test set if it is not enabled
    if not test_CAD and "CAD" in names:
        names.remove("CAD")
        
    for name in names:
        
        high_th, low_th = threshold[name][0], threshold[name][1]
        
        person = name
        # test episode metrics that split dataset by days
        if not test_alldata:
            meal_data = Person_MealsDataset(person_name= person, file_name = "test_files", winmin = winmin,stridesec = stridesec)
        else:
            meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
        # balance test set    
        if test_balance:
            testset_labels = meal_data.labels
            test_indices = balance_data_indices(testset_labels,data_indices=[i for i in range(len(meal_data))] ,mode="under", shuffle=True,random_state = random_seed,replace= False)
        else:
            test_indices = [i for i in range(len(meal_data))]
        # get numpy dataset
        #test_Data, test_Labels = meal_data.get_subset(test_indices)
        
        
        meal_counts, min_counts,hour_counts, day_counts,total_hours = meal_data.get_mealdataset_info(person_name=name)
                
        
        perf["Days"].append(day_counts)
        perf["Meal_Hours"].append(round(hour_counts,1)) 
        perf["Meal_Counts"].append(meal_counts) 
        perf["dataset"].append(name)
        perf["win(sec)"].append(winmin*60)
        
            
        from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, f1_score, balanced_accuracy_score, precision_score
        group_model_W  = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')
        
        individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_split_day_M_F_6.000000Min.h5')
            
        models = {"suffix":['Individual-Model','GroupModel'],  "model":[individual_model,group_model_W]}
        
        for i in range(len(models["suffix"])):
            suffix = models["suffix"][i]
            model = models["model"][i]
            # if the dataset is CAD group dataset and model is individual model
            # we don't need to make prediction on that data
            
            result = hysteresis_threshold(model, meal_data,start_threshold=high_th, end_threshold=low_th, winmin = 6, stepsec=5, episode_min = 1.)
            episode_perf_df = get_episode_metrics(result,meal_data)
            perf["TPR: "+suffix].append(episode_perf_df["TPR"].iloc[0])
            perf["FP/TP: "+suffix].append(episode_perf_df["FP/TP"].iloc[0])
            perf["TP: "+suffix].append(episode_perf_df["TP"].iloc[0])
            perf["FP: "+suffix].append(episode_perf_df["FP"].iloc[0])
            perf["FN: "+suffix].append(episode_perf_df["FN"].iloc[0])
            
            print(episode_perf_df)

    perf_df = pd.DataFrame(perf)
    return perf_df





Overwriting tf_utils.py


In [8]:
%%writefile packages.py

import sys
import numpy as np
from numpy.random import seed
### imports
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Turn off TensorFlow logging
import tensorflow as tf

import numpy as np
import pandas as pd
import torch
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

import random
#from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
import loadfile
import addons

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Reshape, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Input, add
# from tensorflow.keras.utils import np_utils
from tensorflow.keras.regularizers import l1
from tensorflow.keras.models import load_model, save_model, Model


from dataset import create_train_test_file_list, Person_MealsDataset, balance_data_indices
from utils import *
from model import *


Overwriting packages.py


In [13]:
%%writefile train_model.py

from packages import *
from dataset import *
from utils import *
from model import *
def train_models(model, win_ls = [],EPOCHS = 10,stridesec = 5,name = "wenkanw",model_name="acti_6min" ,
                 random_seed= 1000, split_day=False,test_balanced=False,
                create_file_ls = False):
    """
    Train `model` once per window size in `win_ls` and evaluate it on the test set.

    model: compiled tf.keras model; the same instance is fitted for every
        window size and appended to the returned model list
    win_ls: window sizes in minutes
    EPOCHS: number of training epochs per window size
    stridesec: stride between two windows in seconds
    name: person / dataset name; selects the data and the output folder
        "../models/<name>_models/"
    model_name: prefix of the saved model file
    random_seed: seed for numpy, random and tensorflow and for the
        split / balancing helpers
    split_day: if True train on "train_files" and test on "test_files";
        otherwise split the windows of "all_files_list" 80/20
    test_balanced: if True, under-sample the test set as well
    create_file_ls: if True, (re)create the train/test file lists first

    Returns (perf_df, model_ls, hist_ls): a DataFrame with one row of test
    metrics per window size, the trained models and the Keras History objects.
    """
    from numpy.random import seed
    from datetime  import datetime
    seed(random_seed)
    random.seed(random_seed)
    batch_size = 128
    outfile = sys.stdout
    perf = {"model":[],"data":[],"win(sec)":[], "wacc":[],"f1":[],"recall":[],"acc":[]}
    model_ls = []
    hist_ls = []
    for winsize in win_ls:
        # Re-seed per window size so every run starts from the same state.
        tf.random.set_seed(random_seed)
        seed(random_seed)

        winmin = winsize
        winlength = int(winmin * 60 * 15)
        step = int(stridesec * 15)
        start_time = datetime.now()
        # Echo the PBS job information into the log.
        for cmd in ("echo -n 'PBS: node is '; cat $PBS_NODEFILE",
                    "echo PBS: job identifier is $PBS_JOBID",
                    "echo PBS: job name is $PBS_JOBNAME"):
            os.system(cmd)
        print("*****************************************************************\n", file=outfile, flush=True)
        print("Execution Started at " + start_time.strftime("%m/%d/%Y, %H:%M:%S"), file=outfile, flush=True)
        print("WindowLength: {:.2f} min ({:d} datum)\tSlide: {:d} ({:d} datum)\tEpochs:{:d}\n".format(winmin, winlength, stridesec, step, EPOCHS), file=outfile, flush=True)

        if split_day:
            pathtemp = "../models/" + name+"_models" +"/"+model_name+"_split_day_M_F_"
        else:
            pathtemp = "../models/" + name+"_models" +"/"+model_name+"_M_F_"
        modelpath = pathtemp + "{:f}Min.h5".format(winmin)

        print("Model to Save: ",modelpath)
        print()
        ########### Load the dataset################
        if create_file_ls:
            create_train_test_file_list(file_name= "all_files_list.txt",person_name =name,
                         out_path = "../data-file-indices/",root_path= "../",
                         test_ratio = 0.2, print_flag = True, shuffle=True, random_state=random_seed)

        if split_day:
            meal_data_train = Person_MealsDataset(person_name= name, file_name = "train_files", winmin = winmin,stridesec = stridesec)
            meal_data_test = Person_MealsDataset(person_name= name, file_name = "test_files", winmin = winmin,stridesec = stridesec)

            train_indices, valid_indices = split_train_test_indices(X= list(range(len(meal_data_train.labels))),
                                                                    y = meal_data_train.labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            # balanced train set
            train_indices = balance_data_indices(meal_data_train.labels[train_indices],data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            test_indices = list(range(len(meal_data_test)))
            if test_balanced:
                # balance test set
                test_indices = balance_data_indices(meal_data_test.labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # get numpy dataset
            balancedData, balancedLabels = meal_data_train.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data_train.get_subset(valid_indices)
            test_Data, test_Labels = meal_data_test.get_subset(test_indices)

        else:
            meal_data = Person_MealsDataset(person_name= name, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            labels = meal_data.labels
            # split train set and test set
            train_indices, test_indices = split_train_test_indices(X= list(range(len(labels))),
                                                                    y = labels, test_size = 0.2,
                                                                   random_seed = random_seed)
            # balance train set
            train_indices_balanced = balance_data_indices(labels[train_indices],data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            if test_balanced:
                # balance test set
                test_indices = balance_data_indices(labels[test_indices],data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # split validation set from the balanced training indices
            train_indices, valid_indices = split_train_test_indices(X= train_indices_balanced,
                                                                    y = labels[train_indices_balanced], test_size = 0.2,
                                                                   random_seed = random_seed)

            # Get numpy dataset: balanced trainset, validation set, test set.
            # Keras consumes these arrays directly, so no torch DataLoader is needed.
            balancedData, balancedLabels = meal_data.get_subset(train_indices)
            valid_balancedData, valid_balancedLabels = meal_data.get_subset(valid_indices)
            test_Data, test_Labels = meal_data.get_subset(test_indices)

        # training settings
        # NOTE(review): monitor='accuracy' selects the checkpoint by training
        # accuracy although validation data is available -- confirm intended.
        mcp_save = tf.keras.callbacks.ModelCheckpoint(modelpath, save_best_only=True, monitor='accuracy')

        scheduler = tf.keras.callbacks.ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=3, verbose=0,
                                             mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.)

        ##########train model ###############
        # removed addons.LossHistory(jsonpath) for compatibility with TensorFlow 2.2.0, needs to be re-added at some point
        H = model.fit(x=balancedData, y = balancedLabels,
                       validation_data=(valid_balancedData, valid_balancedLabels),
                    epochs = EPOCHS, batch_size=batch_size, verbose=1,
                    callbacks=[mcp_save,scheduler])

        print("Max value: ", max(H.history['accuracy']), " at epoch", np.argmax(H.history['accuracy']) + 1)

        from sklearn.metrics import accuracy_score, recall_score, balanced_accuracy_score, f1_score
        # Evaluated with the weights after the last epoch (not the checkpoint).
        predictions = model.predict(x=test_Data)
        threshold = 0.5
        predicted_labels = predictions>=threshold
        wacc =  balanced_accuracy_score(test_Labels,predicted_labels)
        f1 =  f1_score(test_Labels,predicted_labels)
        acc =  accuracy_score(test_Labels,predicted_labels)
        recall = recall_score(test_Labels,predicted_labels)

        print("Weighted Accuracy:", wacc)
        print("Test Accuracy:", acc)
        print("F1-score:", f1)
        print("Recall Accuracy:", recall)

        perf["model"].append("ActiModel")
        perf["data"].append(name)
        perf["win(sec)"].append(winmin*60)
        perf["wacc"].append(wacc)
        perf["f1"].append(f1)
        perf["acc"].append(acc)
        perf["recall"].append(recall)
        model_ls.append(model)
        hist_ls.append(H)
    perf_df = pd.DataFrame(perf)
    print(perf_df)
    return perf_df, model_ls, hist_ls




Overwriting train_model.py


In [2]:
%%writefile hysteresis_threshold.py
from packages import *
import os

def save_proba(result,path="proba.csv"):
    """
    Save predicted probability to csv files

    result: DataFrame with one row per day and the columns "day", "proba"
        and "labels", where "proba" / "labels" hold the per-window sequences
        of that day
    path: csv file to write

    The file has one line per window with the columns day, proba, labels.
    """
    frames = []
    for i, day in enumerate(result["day"].unique().tolist()):
        probas = result["proba"].iloc[i]
        frames.append(pd.DataFrame({"day": [day]*len(probas),
                                    "proba": probas,
                                    "labels": result["labels"].iloc[i]}))
    # A single concat replaces the repeated DataFrame.append, which was
    # removed in pandas 2.0 and copied the growing table on every call.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=["day","proba","labels"])
    df.to_csv(path, index=False)
    print("File "+path+" Saved")
    
def load_proba(path):
    """
    Read back a probability csv file (columns "day", "proba", "labels").

    Returns two dicts keyed by day, in order of first appearance in the
    file: the probability sequence and the label sequence of that day, both
    as numpy arrays.
    """
    table = pd.read_csv(path)
    day_column = table['day']

    proba_by_day, labels_by_day = {}, {}
    for day in day_column.unique():
        rows_of_day = table[day_column==day]
        proba_by_day[day] = rows_of_day["proba"].values
        labels_by_day[day] = rows_of_day["labels"].values
    return proba_by_day, labels_by_day
    
def hysteresis_threshold(model, data,days_ls = [], start_threshold=0.8, end_threshold=0.4, winmin = 6,
                        stepsec=5, episode_min = 1.,t_pause = 900,load_proba_flag = True,
                         path ="../results/possibility_results/", file_name= None):
    """
    Segment each day of `data` into eating episodes with a two-threshold
    (hysteresis) detector; the per-window probabilities are cached in a csv file.

    model: tensorflow model
    data:  This dataset must be the self-defined class of Person_MealsDataset in dataset.py / a pytorch dataset without using shuffle.
    Keep the order of dataset after extracting window samples!  You can also define your own dataset using class object to create the interface

    days_ls: days to process, in the given order; if empty, all days found in
        data.data_indices[:,0] are processed in ascending order

    start_threshold: the high threshold of the beginning of segmentation

    end_threshold: the end threshold of the end of segmentation

    winmin: size of a window sample in unit of minute (only used to build the
        default cache file name)

    stepsec: stride to move the window in unit of second / the number of second between two adjacent window samples

    episode_min: the minimum length of eating episode in unit of minute. If end of segmentation -start of segmentation < episode_min,
        then the episode will not be counted
    t_pause: seconds the probability has to stay below start_threshold after
        an episode ended before the episode is closed
    file_name: csv file that stores the predicted probability of the model;
        defaults to "<path><person_name>_<winmin>min_<stepsec>slide_proba.csv"
    path: path to load / save predicted probability

    load_proba_flag: if enabled, load saved probability to do hysteresis threshold

    Returns a DataFrame with one row per day and the columns day, stepsec,
    segment_start, segment_end (window indices, end exclusive), proba,
    predictions, labels and segment_count.
    """
    import time

    IDLE, EATING, PAUSED = 0, 1, 2
    result_ls = []
    proba_ls,labels_ls = {}, {}
    if file_name is None:
        file_name =path+"{}_{}min_{}slide_proba.csv".format(data.person_name, winmin,stepsec)

    day_column = data.data_indices[:,0]
    if days_ls:
        days = days_ls
    else:
        # Ascending order instead of arbitrary set order, so the rows of the
        # result line up with per-day ground truth that is matched by position.
        days = list(np.unique(day_column))

    if load_proba_flag and  os.path.isfile(file_name):
        # load generated probability if we already generate it
        proba_ls,labels_ls = load_proba(file_name)
    else:
        # generate probability for hysteresis threshold if we have not done yet
        for day in days:
            # Select and Extract the data and labels of the corresponding day from the whole dataset
            sample_indices= np.where(day_column==day)[0]

            # get the numpy array of samples and labels
            start_time = time.time()
            samples, labels = data.get_subset(sample_indices)
            # reshape(-1) instead of squeeze(): a day with a single window
            # must stay a 1-D sequence, not collapse to a 0-d scalar.
            probas = model(samples).numpy().reshape(-1)
            print("--- Day %d: %s seconds ---" % (day, time.time() - start_time))
            proba_ls[day] = probas
            labels_ls[day] = labels
        df = {}
        df["day"] = list(days)
        df["proba"]= [proba_ls[k] for k in days]
        df['labels'] = [labels_ls[k] for k in days]
        df = pd.DataFrame(df)
        save_proba(df,path=file_name)

    pbar = tqdm(days, total=len(days))
    for day in pbar:
        # Select and Extract the data and labels of the corresponding day from the whole dataset
        sample_indices= np.where(day_column==day)[0]

        probas = proba_ls[day]
        labels = labels_ls[day]
        result = {'day':day,"stepsec": stepsec,'segment_start':[], 'segment_end':[],'proba':[],'predictions':np.zeros([len(sample_indices)]),'labels':[],"segment_count":0}
        state = IDLE
        start = 0
        end = 0
        pause_counter = 0
        # one day data
        for i in range(len(sample_indices)):
            proba = probas[i]
            result['proba'].append(proba)
            result['labels'].append(labels[i])

            if state == IDLE and proba > start_threshold:
                state = EATING
                start = i
            elif state == EATING and proba <end_threshold:
                state = PAUSED
                end = i+1
                pause_counter = 0
            elif state == PAUSED:
                if proba > start_threshold:
                    # eating resumed before the pause ran out: same episode
                    state = EATING
                else:
                    pause_counter += stepsec
                    if pause_counter >= t_pause:
                        # convert time to second and check threshold
                        if (end-start)*stepsec >= episode_min*60:
                            # save data
                            result['segment_start'].append(start)
                            result['segment_end'].append(end)
                            result['segment_count'] += 1
                            result['predictions'][start:end] = 1
                        end = 0
                        state = IDLE
        # The day ran out while an episode was still open.  It is only kept if
        # an end was already seen (no episode_min check here); an episode that
        # never dropped below end_threshold is discarded.
        # NOTE(review): confirm that dropping such open episodes is intended.
        if state != IDLE and end != 0:
            result['segment_start'].append(start)
            result['segment_end'].append(end)
            result['predictions'][start:end] = 1
            result['segment_count'] += 1
        result_ls.append(result)
    print("Segmentation Completed. ")
    result_ls = pd.DataFrame(result_ls)

    return result_ls



def get_episode_metrics(result, meal_data,days_ls= None):
    """
    Obtain and format the episode metric results.

    For every day, each ground-truth meal is compared with each detected
    segment. A ground-truth meal that overlaps at least one detection counts
    as TP, otherwise as FN; a detection that overlaps no meal counts as FP.
    Counts are summed over all days.

    result: per-day result of the hysteresis threshold function; row i must
            provide 'segment_start', 'segment_end' and 'stepsec'
    meal_data: meal dataset of Person_MealData; must provide get_GT_segment()
               and get_mealdataset_info()
    days_ls: optional list of day indices; when given (and non-empty) only
             these days of the ground truth are used, and row i of `result`
             is matched with days_ls[i]

    return: one-row DataFrame with columns "TPR", "FP/TP", "TP", "FP", "FN"
            ("FP/TP" is None when TP is 0, "TPR" is 0 when there is no meal)
    """
    # get segmentation of ground truth labeled by user
    start_ls, end_ls = meal_data.get_GT_segment()
    if days_ls:
        start_ls = [start_ls[day] for day in days_ls]
        end_ls =  [end_ls[day] for day in days_ls]

    # only the meal count is reported (in the summary print below)
    meal_counts, *_ = meal_data.get_mealdataset_info()

    TP, FP, FN = 0, 0, 0
    # iterate every day
    for i, (event_start, event_end) in enumerate(zip(start_ls, end_ls)):
        # look the row up once instead of once per (meal, detection) pair
        day_result = result.iloc[i]
        stepsec = day_result['stepsec']
        # convert detected segments from window steps to data point indices
        # (the factor 15 is presumably the sampling rate in Hz -- TODO confirm)
        detect_start = [d * stepsec * 15 for d in day_result['segment_start']]
        detect_end = [d * stepsec * 15 for d in day_result['segment_end']]

        meal_hit = np.zeros(len(event_start), dtype=bool)     # False -> missed meal (FN)
        detect_hit = np.zeros(len(detect_start), dtype=bool)  # False -> false detection (FP)
        # e_s: event start,  e_e: event end
        # d_s: detection start,  d_e: detection end
        for m, (e_s, e_e) in enumerate(zip(event_start, event_end)):
            for d, (d_s, d_e) in enumerate(zip(detect_start, detect_end)):
                # the two intervals overlap when either one starts inside the other
                if (d_s <= e_s <= d_e) or (e_s <= d_s <= e_e):
                    meal_hit[m] = True
                    detect_hit[d] = True

        TP += int(meal_hit.sum())
        FN += int((~meal_hit).sum())
        FP += int((~detect_hit).sum())

    print("total_meal:",meal_counts, "TP: ", TP, "FP: ", FP, "FN: ", FN)
    perf = {
        "TPR": [TP/(TP+FN) if (TP+FN)>0 else 0],
        "FP/TP": [FP/TP if TP != 0 else None],
        "TP": [TP],
        "FP": [FP],
        "FN": [FN],
    }
    return pd.DataFrame(perf)


Overwriting hysteresis_threshold.py


# Utilities to measure and print time/episode metrics

In [3]:
%%writefile metrics.py

from packages import *
from collections import defaultdict

def weight_accuracy(y_true, y_pred,weight, print_flag=True):
    """
    Weighted accuracy of binary predictions, where every positive sample
    counts `weight` times as much as a negative one:

        (weight*TP + TN) / (weight*(TP+FN) + (TN+FP))

    y_true: array-like of ground-truth labels (0 or 1)
    y_pred: array-like of predicted labels (0 or 1), same length as y_true
    weight: weight applied to the positive class
    print_flag: print the confusion-matrix counts when True

    return: the weighted accuracy (nan when the inputs are empty)
    """
    # Coerce to arrays: with plain lists `y_true == 1` is a single bool and
    # the element-wise masks below could not be built.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    true_pos, true_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    TP = np.sum(true_pos & pred_pos)
    FN = np.sum(true_pos & pred_neg)
    TN = np.sum(true_neg & pred_neg)
    FP = np.sum(true_neg & pred_pos)
    if print_flag:
        print("TP: ",TP, "FP: ",FP, "TN: ",TN, "FN: ",FN)
    return (weight*TP + TN)/(weight*(TP+FN) + (TN+FP))

def test_models_time_metric(balance_ratio,winmin=1, stridesec = 5,names= ["wenkanw"],random_seed=1000, split_day=False, test_balance=False, test_CAD=False):
    """
    Evaluate the individual model and the pre-trained group model of every
    person with time (per-window) metrics.

    balance_ratio: weight of the positive class used for "WAcc2"; either one
                   number used for every person or a dict mapping a person
                   name to its weight
    winmin: window length in minutes used to build the dataset
    stridesec: window stride in seconds used to build the dataset
    names: names of the persons / datasets to test (the list is not modified)
    random_seed: seed for the train/test split and for test-set balancing
    split_day: if True, test on the "test_files" days with the split-day
               individual model; otherwise test on a 20% split of all windows
    test_balance: if True, under-sample the test set to balance the classes
    test_CAD: if False, the "CAD" dataset is skipped even when listed in names

    return: (meal_info, perf_df) DataFrames with one row per tested dataset
    """
    from sklearn.metrics import accuracy_score, recall_score, f1_score, balanced_accuracy_score, precision_score

    perf = defaultdict(list)
    meal_info = defaultdict(list)
    # Don't test on CAD test set if it is not enabled.
    # Filter into a new list so the caller's list (and the default) stay untouched.
    names = [name for name in names if test_CAD or name != "CAD"]

    # the group model is the same for every person: load it once, on first use
    group_model_W = None

    for name in names:
        if split_day:
            meal_data = Person_MealsDataset(person_name= name, file_name = "test_files", winmin = winmin,stridesec = stridesec)
            all_indices = [i for i in range(len(meal_data))]
            if test_balance:
                # balance test set
                test_indices = balance_data_indices(meal_data.labels,data_indices=all_indices ,mode="under", shuffle=True,random_state = random_seed,replace= False)
            else:
                test_indices = all_indices
            # get numpy dataset
            test_Data, test_Labels = meal_data.get_subset(test_indices)
        else:
            meal_data = Person_MealsDataset(person_name= name, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
            labels = meal_data.labels
            # split train set and test set (only the test part is needed here)
            _, test_indices = split_train_test_indices(X= [i for i in range(len(labels))],
                                                       y = labels, test_size = 0.2,
                                                       random_seed = random_seed)
            if test_balance:
                test_indices = balance_data_indices(labels[test_indices],data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)
            testset_labels = labels[test_indices]
            print("Testing on : ", sum(testset_labels==1),"positive samples, ",sum(testset_labels==0)," negative samples" )
            test_Data, test_Labels = meal_data.get_subset(test_indices)

        if name !="CAD":
            meal_counts, min_counts,hour_counts, day_counts,total_hours = meal_data.get_mealdataset_info(person_name=name, file_ls_doc="all_files_list.txt")
        else:
            # data from paper
            meal_counts, min_counts,hour_counts, day_counts,total_hours = 1063, 250*60,250, 354, 4680
        meal_info["dataset"].append(name)
        meal_info["Days"].append(day_counts)
        meal_info["Meal_Hours"].append(round(hour_counts,1))
        meal_info["Meal_Counts"].append(meal_counts)
        meal_info["Total_Hours"].append(total_hours)

        perf["dataset"].append(name)
        perf["win(sec)"].append(winmin*60)

        if group_model_W is None:
            group_model_W  = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')

        # the CAD group dataset has no individual model
        individual_model = None
        if name != "CAD":
            if split_day:
                individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_split_day_M_F_6.000000Min.h5')
            else:
                individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_M_F_6.000000Min.h5')

        # accept both a single weight and a per-person dict of weights
        pos_weight = balance_ratio[name] if isinstance(balance_ratio, dict) else balance_ratio

        for suffix, model in (('Individual-Model', individual_model), ('GroupModel', group_model_W)):
            # Reset every metric so a skipped evaluation records None instead of
            # the values left over from the previous model / person.
            wacc = wacc2 = recall = precision = f1 = acc = None

            # if the dataset is CAD group dataset and model is individual model
            # we don't need to make prediction on that data
            if not (name == "CAD" and suffix =='Individual-Model'):
                predictions = model.predict(x=test_Data).squeeze(1)
                threshold = 0.5
                prediction = (predictions>=threshold).astype(int)
                wacc =  balanced_accuracy_score(test_Labels,prediction)
                acc =  accuracy_score(test_Labels,prediction)
                recall = recall_score(test_Labels,prediction)
                f1 = f1_score(test_Labels,prediction)
                precision = precision_score(test_Labels,prediction)
                print("Test label: ",test_Labels)
                print("Predictions:",prediction)

                # weighted accuracy 2 is computed by (weight*TP +TN)/(weight*(TP+FN) + (TN+FP))
                wacc2 = weight_accuracy(test_Labels,prediction, weight=pos_weight)

            print("Weighted Accuracy:", wacc)
            print("Weighted Accuracy2:", wacc2)
            print("Recall:", recall)
            print("Precision:", precision)
            print("F1:", f1)
            print("Test Accuracy:", acc)

            perf["WAcc: "+suffix].append(wacc)
            perf["WAcc2: "+suffix].append(wacc2)
            perf["Recall: "+suffix].append(recall)
            perf["Precision: "+suffix].append(precision)
            perf["F1: "+suffix].append(f1)
            perf["Acc: "+suffix].append(acc)

    meal_info = pd.DataFrame(meal_info)
    perf_df = pd.DataFrame(perf)
    return meal_info, perf_df



def print_time_metrics(result, old_result = None,round_decimal = 3,):
    """
    Format the time-metric table from test_models_time_metric and append an
    "average performance" row.

    Columns are ordered as: dataset, win(sec), WAcc*, F1*, Precision*,
    Recall*, and finally the plain accuracy ("Acc") columns. Numeric values
    are rounded to `round_decimal` decimals.

    result: DataFrame of time metrics; must contain a "dataset" column when
            old_result is given
    old_result: optional table previously returned by this function; its rows
                for the datasets present in `result` and its old
                "average performance" row are replaced by the new data
    round_decimal: number of decimals kept

    return: the formatted DataFrame, last row being the column-wise mean
    """
    perf_df = pd.DataFrame()
    mykeys = ["dataset","win(sec)","WAcc", "F1","Precision","Recall"]
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for k in mykeys:
        for key in result.keys():
            if k in key:
                column = result[key]
                if column.dtype in numerics:
                    column = np.round(column, round_decimal)
                perf_df[key] = column
    # plain accuracy columns go last ("WAcc" columns were handled above)
    for key in result.keys():
        if "WAcc" not in key and "Acc" in key:
            perf_df[key] = result[key].values.round(round_decimal)

    if old_result is not None:
        # remove average performance from old results and append new data to table
        drop_vals = result["dataset"].values.tolist()
        drop_vals.append("average performance")
        stale = old_result['dataset'].isin(drop_vals)
        idx = [i for i, is_stale in enumerate(stale) if is_stale]
        print("index",idx)
        # select by boolean mask: works whatever index labels old_result carries
        new_result = old_result[~stale.values]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        perf_df = pd.concat([new_result, perf_df], ignore_index=True)

    # build the summary row in one go instead of writing cell by cell through
    # chained indexing, which is not guaranteed to modify the frame
    mean_row = {"dataset":"average performance","win(sec)":"-"}
    for key in perf_df.keys():
        if key.lower() != "dataset" and  key.lower() != "win(sec)":
            mean_row[key] = np.round(perf_df[key].mean(), round_decimal)

    perf_df = pd.concat([perf_df, pd.DataFrame([mean_row])], ignore_index=True)

    return perf_df








##########################
# Episode metric
##########################

# Default hysteresis thresholds per person, stored as [first, second] = [0.8, 0.4].
# test_models_episode_metric reads element 0 as the start (high) threshold
# and element 1 as the end (low) threshold. Every person gets its own list.
hythreshold = dict()
for n in ('wenkanw', 'adam', 'lawler', 'shaurya'):
    hythreshold.update({n: [0.8, 0.4]})
def test_models_episode_metric(winmin=6, stridesec = 5,names= ["wenkanw"],random_seed=1000,
                               test_balance=False, test_CAD=False,
                               test_alldata=False,threshold= hythreshold,
                               load_proba_flag=True, use_group_threshold = 0,
                              proba_path ="../results/possibility_results/"):
    """
    Evaluate the individual (split-day) model and the pre-trained group model
    of every person with episode metrics (TPR, FP/TP, TP, FP, FN).

    winmin: window length in minutes used to build the dataset
    stridesec: window stride in seconds used to build the dataset
    names: names of the persons to test (the list is not modified)
    random_seed, test_balance: accepted for interface compatibility; episode
            detection always runs on the whole dataset, so they have no
            effect on the result
    test_CAD: if False, "CAD" is skipped even when listed in names
    test_alldata: if True use "all_files_list", otherwise "test_files"
    threshold: dict person -> [start_threshold, end_threshold] of the
               individual models
    load_proba_flag: forwarded to hysteresis_threshold
    use_group_threshold: 0 -> always use `threshold`;
                         1 -> group thresholds for the group model only;
                         anything else -> group thresholds for both models
    proba_path: base folder of the probability files; "<person>/" is appended
                for each person and the prefix "group_" for the group model

    return: DataFrame with one row per person
    """
    perf = defaultdict(list)

    # Don't test on CAD test set if it is not enabled.
    # Filter into a new list so the caller's list (and the default) stay untouched.
    names = [name for name in names if test_CAD or name != "CAD"]
    group_threshold = {'wenkanw':[0.8, 0.4], 'adam':[0.8,0.4],'lawler':[0.8,0.4], 'shaurya':[0.8,0.4]}

    # the group model is the same for every person: load it once, on first use
    group_model_W = None

    for name in names:
        person = name
        # test episode metrics that split dataset by days
        dataset_file = "all_files_list" if test_alldata else "test_files"
        meal_data = Person_MealsDataset(person_name= person, file_name = dataset_file, winmin = winmin,stridesec = stridesec)

        meal_counts, min_counts,hour_counts, day_counts,total_hours = meal_data.get_mealdataset_info(person_name=name)

        perf["Days"].append(day_counts)
        perf["Meal_Hours"].append(round(hour_counts,1))
        perf["Meal_Counts"].append(meal_counts)
        perf["dataset"].append(name)
        perf["win(sec)"].append(winmin*60)

        if group_model_W is None:
            group_model_W  = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')
        individual_model = tf.keras.models.load_model('../models/'+ name+ '_models/acti_6min_split_day_M_F_6.000000Min.h5')

        # Build the folder from the unchanged base path. Appending to
        # proba_path itself would nest each person's folder inside the
        # previous person's folder.
        person_proba_path = proba_path + person+"/"

        for suffix, model in (('Individual-Model', individual_model), ('GroupModel', group_model_W)):
            is_group_model = suffix == "GroupModel"

            if use_group_threshold==0:
                chosen = threshold[name]
            elif use_group_threshold==1:
                chosen = group_threshold[name] if is_group_model else threshold[name]
            else:
                chosen = group_threshold[name]
            high_th, low_th = chosen[0], chosen[1]

            path = person_proba_path +"group_" if is_group_model else person_proba_path

            # NOTE(review): winmin/stepsec are fixed to 6/5 here rather than
            # taken from this function's winmin/stridesec -- confirm intended.
            result = hysteresis_threshold(model, meal_data,start_threshold=high_th, end_threshold=low_th,
                                          winmin = 6, stepsec=5, episode_min = 1.,
                                         load_proba_flag=load_proba_flag, path =path)
            episode_perf_df = get_episode_metrics(result,meal_data)
            for metric in ("TPR", "FP/TP", "TP", "FP", "FN"):
                perf[metric+": "+suffix].append(episode_perf_df[metric].iloc[0])

            print(episode_perf_df)

    perf_df = pd.DataFrame(perf)
    return perf_df




def print_episode_metrics(result,old_result= None, round_decimal=3):
    """
    Print-ready version of the episode metrics from test_models_episode_metric:
    columns in a fixed order, numeric values rounded, and an
    "average performance" row appended.

    result: DataFrame of episode metrics; must contain the dataset
            description columns and the TPR / FP/TP / TP / FP / FN columns of
            both the individual and the group model
    old_result: optional table previously returned by this function; its rows
                for the datasets present in `result` and its old
                "average performance" row are replaced by the new data
    round_decimal: number of decimals kept

    return: the formatted DataFrame (the input `result` is not modified)
    """
    info_cols = ['dataset','Days' ,'Meal_Hours',"Meal_Counts","win(sec)"]
    metric_cols = ["TPR: Individual-Model","TPR: GroupModel","FP/TP: Individual-Model","FP/TP: GroupModel",
                   "TP: Individual-Model","TP: GroupModel",
                   "FP: Individual-Model","FP: GroupModel",
                   "FN: Individual-Model","FN: GroupModel"]
    # explicit copy: the rounding below must neither touch `result` nor depend
    # on chained assignment into a column slice
    result_df = result[info_cols + metric_cols].copy()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for key in result_df.columns:
        if result_df[key].dtype in numerics:
            result_df[key] = result_df[key].round(round_decimal)

    if old_result is not None:
        # remove average performance from old results and append new data to table
        drop_vals = result["dataset"].values.tolist()
        drop_vals.append("average performance")
        stale = old_result['dataset'].isin(drop_vals)
        idx = [i for i, is_stale in enumerate(stale) if is_stale]
        print("index",idx)
        # select by boolean mask: works whatever index labels old_result carries
        new_result = old_result[~stale.values]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        result_df = pd.concat([new_result, result_df], ignore_index=True)

    # summary row: "-" for the descriptive columns, mean for every metric
    mean_row = {"dataset":"average performance","win(sec)":"-",'Days':"-" ,
                'Meal_Hours':"-","Meal_Counts":"-"}
    for key in result_df.keys():
        if key not in info_cols:
            mean_row[key] = np.round(result_df[key].mean(), round_decimal)

    result_df = pd.concat([result_df, pd.DataFrame([mean_row])], ignore_index=True)
    return result_df

Overwriting metrics.py


# Cross validation

In [6]:
%%writefile cross_validation.py

from packages import *
from hysteresis_threshold import *
from dataset import *
from collections import defaultdict
from metrics import *

def cross_validation_metrics(model_arch,balance_ratio,names= ["wenkanw"], fold_num = 5, winmin=6, stridesec = 5,model_name= "acti_model",epochs = 20,
                                 random_seed=1000, split_day=False, test_balance=False, re_train = False,load_data=True,
                                 test_CAD=False, metrics =['time','episode'], ind_threshold= None,load_proba_flag=True):
    """
    Day-wise K-fold cross validation of the individual models, compared with
    the pre-trained group model.

    For every person the days are split into `fold_num` consecutive folds. For
    each fold an individual model is trained on the (class-balanced) training
    days, or loaded from disk when it already exists, and both the individual
    and the group model are evaluated on the held-out days.

    model_arch: callable building an un-compiled Keras model, called as
                model_arch(input_shape=(win_size, 6))
    balance_ratio: dict person -> weight of the positive class for "WAcc2"
    names: names of the persons to test (the list is not modified)
    fold_num: number of cross-validation folds
    winmin: window length in minutes
    stridesec: window stride in seconds
    model_name: used in the file name of the per-fold models
    epochs: number of training epochs
    random_seed: seed for balancing and for the train/validation split
    split_day: not referenced by this function
    test_balance: if True, under-sample the test set to balance the classes
    re_train: if True, train even when a saved model of the fold exists
    load_data: if False and 'time' is not requested, the window data is not
               loaded (empty subsets are used instead)
    test_CAD: if False, "CAD" is skipped even when listed in names
    metrics: which metrics to compute, any of 'time' and 'episode'
    ind_threshold: optional dict person -> [start, end] hysteresis thresholds
                   of the individual models; the group thresholds [0.8, 0.4]
                   are used when it is not given
    load_proba_flag: forwarded to hysteresis_threshold

    return: (meal_info, time_perf, episode_perf) DataFrames with one row per
            person; time metrics are averaged over the folds, episode counts
            are summed over the folds
    """
    import os
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score, recall_score, f1_score, balanced_accuracy_score, precision_score
    batch_size = 128
    time_perf = defaultdict(list)
    episode_perf = defaultdict(list)
    meal_info = defaultdict(list)
    # Per-fold scores of the current person. The lists are emptied (not
    # re-created) after every person so that their keys persist.
    time_individual_perf = defaultdict(list)
    episode_individual_perf = defaultdict(list)
    time_group_perf = defaultdict(list)
    episode_group_perf = defaultdict(list)

    group_threshold = [0.8,0.4]
    model_suffixes = ['Individual-Model','GroupModel']

    # Folds are consecutive blocks of days. random_state must not be passed
    # together with shuffle=False (recent scikit-learn rejects it).
    kf = KFold(n_splits=fold_num, shuffle=False)
    # Don't test on CAD test set if it is not enabled.
    # Filter into a new list so the caller's list (and the default) stay untouched.
    names = [name for name in names if test_CAD or name != "CAD"]

    # the group model is the same for every person and fold: load it once
    group_model_W = None

    for name in names:
        #load individual whole dataset
        person = name
        meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = winmin,stridesec = stridesec)
        labels = meal_data.labels
        # first column of data_indices is used as the day of every sample
        sample_days = meal_data.data_indices[:,0]

        if name !="CAD":
            meal_counts, min_counts,hour_counts, day_counts,total_hours = meal_data.get_mealdataset_info(person_name=name, file_ls_doc="all_files_list.txt")
        else:
            # data from paper
            meal_counts, min_counts,hour_counts, day_counts,total_hours = 1063, 250*60,250, 354, 4680
        meal_info["dataset"].append(name)
        meal_info["Days"].append(day_counts)
        meal_info["Meal_Hours"].append(round(hour_counts,1))
        meal_info["Meal_Counts"].append(meal_counts)
        meal_info["Total_Hours"].append(total_hours)

        time_perf["dataset"].append(name)
        time_perf["win(sec)"].append(winmin*60)

        episode_perf["dataset"].append(name)
        episode_perf["win(sec)"].append(winmin*60)

        days = np.unique(sample_days)

        # K-fold cross validation
        for fold, (day_train_idx, day_test_idx) in enumerate(kf.split(days)):
            print("Fold: %d"%(fold),"Train on days: ",day_train_idx, "Test on days: ",day_test_idx)
            day_train_idx = day_train_idx.tolist()
            day_test_idx = day_test_idx.tolist()
            # partition dataset by days
            # NOTE(review): the fold indices are positions in `days` and are
            # compared with the day values directly -- this assumes the days
            # are numbered 0..N-1; confirm against Person_MealsDataset.
            in_train = np.isin(sample_days, day_train_idx)
            train_indices = np.flatnonzero(in_train).tolist()
            test_indices = np.flatnonzero(~in_train).tolist()

            # balance train set
            trainset_labels = labels[train_indices]
            train_indices_balanced = balance_data_indices(trainset_labels,data_indices= train_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            if test_balance:
                #balance test set
                testset_labels = labels[test_indices]
                test_indices = balance_data_indices(testset_labels,data_indices= test_indices,mode="under", shuffle=True,random_state = random_seed,replace= False)

            # split validation set
            balanced_trainset_labels = labels[train_indices_balanced]
            train_indices, valid_indices = split_train_test_indices(X= train_indices_balanced,
                                                                    y = balanced_trainset_labels, test_size = 0.2,
                                                                   random_seed = random_seed)

            if not load_data and "time" not in metrics:
                balancedData, balancedLabels = meal_data.get_subset([])
                valid_balancedData, valid_balancedLabels = meal_data.get_subset([])
                test_Data, test_Labels = meal_data.get_subset([])
            else:
                # Get numpy dataset: balanced trainset, validation set, test set
                balancedData, balancedLabels = meal_data.get_subset(train_indices)
                valid_balancedData, valid_balancedLabels = meal_data.get_subset(valid_indices)
                test_Data, test_Labels = meal_data.get_subset(test_indices)

                print("Train on : ", sum(balancedLabels==1),"positive samples, ",sum(balancedLabels==0)," negative samples" )
                print("Testing on : ", sum(valid_balancedLabels==1),"positive samples, ",sum(valid_balancedLabels==0)," negative samples" )
                print("Testing on : ", sum(test_Labels==1),"positive samples, ",sum(test_Labels==0)," negative samples" )

            #train models
            pathtemp = "../models/" + name+"_models" +"/"+"cv_fold_"+str(fold) +"_"+model_name+"_M_F_"
            modelpath = pathtemp + "{:f}Min.h5".format(winmin)

            # if model doesn't exist or re_train is enabled, then re-train
            # otherwise, just load model
            if not os.path.isfile(modelpath) or re_train:
                #training settings
                win_size = 15*winmin*60
                individual_model = model_arch(input_shape =(win_size,6) )
                individual_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

                mcp_save = tf.keras.callbacks.ModelCheckpoint(modelpath, save_best_only=True, monitor='val_accuracy')
                scheduler = tf.keras.callbacks.ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=3, verbose=0,
                                                     mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.)
                ##########train model ###############
                H = individual_model.fit(x=balancedData, y = balancedLabels,
                               validation_data=(valid_balancedData, valid_balancedLabels),
                            epochs = epochs, batch_size=batch_size, verbose=1,
                            callbacks=[mcp_save,scheduler]) # removed addons.LossHistory(jsonpath) for compatibility with TensorFlow 2.2.0, needs to be re-added at some point

                print("Max value: ", max(H.history['accuracy']), " at epoch", np.argmax(H.history['accuracy']) + 1)
                print("Model saved to path: ",modelpath)
            else:
                individual_model  = tf.keras.models.load_model(modelpath)

            # obtain individual model and pre-trained group model
            if group_model_W is None:
                group_model_W  = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')
            fold_models = [individual_model, group_model_W]

            ##### Test models####
            ############ Time metrics ##########
            if 'time' in metrics:
                for suffix, model in zip(model_suffixes, fold_models):
                    # if the dataset is CAD group dataset and model is individual model
                    # we don't need to make prediction on that data
                    if name == "CAD" and suffix =='Individual-Model':
                        continue

                    predictions = model.predict(x=test_Data).squeeze(1)
                    threshold = 0.5
                    prediction = (predictions>=threshold).astype(int)
                    fold_scores = {
                        "WAcc: ": balanced_accuracy_score(test_Labels,prediction),
                        # weighted accuracy 2 is computed by (weight*TP +TN)/(weight*(TP+FN) + (TN+FP))
                        "WAcc2: ": weight_accuracy(test_Labels,prediction,weight = balance_ratio[name],print_flag=False),
                        "Recall: ": recall_score(test_Labels,prediction),
                        "Precision: ": precision_score(test_Labels,prediction),
                        "F1: ": f1_score(test_Labels,prediction),
                        "Acc: ": accuracy_score(test_Labels,prediction),
                    }

                    # store performance for one fold
                    store = time_group_perf if suffix == "GroupModel" else time_individual_perf
                    for prefix, score in fold_scores.items():
                        store[prefix+suffix].append(score)

            ######## episode metric ############
            if "episode" in metrics:
                proba_path ="../results/possibility_results/"+person +"/cv_fold_"+str(fold)+"_"
                for suffix, model in zip(model_suffixes, fold_models):
                    is_group_model = suffix == "GroupModel"
                    result_path = proba_path + "group_" if is_group_model else proba_path
                    if is_group_model or not ind_threshold:
                        high_th, low_th  = group_threshold[0],group_threshold[1]
                    else:
                        high_th, low_th  = ind_threshold[name][0],ind_threshold[name][1]

                    result = hysteresis_threshold(model, meal_data,days_ls = day_test_idx,start_threshold=high_th, end_threshold=low_th,
                                                  winmin = winmin, stepsec=stridesec, episode_min = 1.,
                                                 load_proba_flag=load_proba_flag, path =result_path)
                    episode_perf_df = get_episode_metrics(result,meal_data,days_ls = day_test_idx)

                    store = episode_group_perf if is_group_model else episode_individual_perf
                    for count in ("TP", "FP", "FN"):
                        store[count+": "+suffix].append(episode_perf_df[count].iloc[0])

        # aggregate the folds of this person: mean of the time metrics ...
        if 'time' in metrics:
            for fold_store in (time_group_perf, time_individual_perf):
                for key in fold_store.keys():
                    time_perf[key].append(  np.mean(fold_store[key]))
                    fold_store[key].clear()

        # ... and sum of the episode counts
        if 'episode' in metrics:
            for fold_store in (episode_group_perf, episode_individual_perf):
                for key in fold_store.keys():
                    episode_perf[key].append(  np.sum(fold_store[key]))
                    fold_store[key].clear()

    meal_info = pd.DataFrame(meal_info)
    episode_perf = pd.DataFrame(episode_perf)
    time_perf = pd.DataFrame(time_perf)
    # Compute TPR, FP/TP for all models. The count columns only exist when
    # episode metrics were computed, so skip the ratios otherwise.
    for suffix in model_suffixes:
        if 'TP: '+suffix not in episode_perf.columns:
            continue
        episode_perf["TPR: "+suffix] = episode_perf['TP: '+suffix]/(episode_perf['TP: '+suffix] + episode_perf['FN: '+suffix])
        episode_perf['FP/TP: '+suffix] = episode_perf['FP: '+suffix]/episode_perf['TP: '+suffix]

    return meal_info, time_perf,episode_perf





def test_threshold_cv(datasets ,ts_ls=[],te_ls=[], fold_num=5,round_num = 3,path_name = "../results/possibility_results/"):
    """
    Test the hysteresis threshold values for individual models based on generated possibility in csv files from hysteresis_threshold function

    For every person and every (start, end) threshold pair the episode counts
    of the cross-validation folds are summed up.

    datasets:  a dictionary of datasets generated from create_dataset() in dataset.py
    ts_ls: start (high) thresholds to try
    te_ls: end (low) thresholds to try
    fold_num: number of cross-validation folds
    round_num: number of decimals kept for TPR and FP/TP
    path_name: base folder of the per-fold probability files

    return: DataFrame with one row per (person, Ts, Te) and the columns
            dataset, Ts, Te, TP, FN, FP, TPR, FP/TP
    """
    from sklearn.model_selection import KFold
    # nothing to evaluate without at least one (Ts, Te) pair
    if len(ts_ls) == 0 or len(te_ls) == 0:
        return pd.DataFrame()

    # Folds are consecutive blocks of days. random_state must not be passed
    # together with shuffle=False (recent scikit-learn rejects it).
    kf = KFold(n_splits=fold_num, shuffle=False)
    rows = []

    for person in datasets.keys():
        data = datasets[person]
        days = np.unique(data.data_indices[:,0])
        # folds and models depend on the person only, not on the thresholds:
        # build them once instead of once per (Ts, Te) pair
        test_folds = [day_test_idx.tolist() for _, day_test_idx in kf.split(days)]
        fold_models = [
            tf.keras.models.load_model('../models/{}_models/cv_fold_{}_acti_model_M_F_6.000000Min.h5'.format(person, fold))
            for fold in range(len(test_folds))
        ]
        for ts in ts_ls:
            for te in te_ls:
                TP,FN, FP = 0,0,0
                # K-fold cross validation
                for fold, day_test_idx in enumerate(test_folds):
                    result_path =path_name +person+"/cv_fold_"+str(fold)+"_"
                    ht_result = hysteresis_threshold(fold_models[fold], data,days_ls= day_test_idx,start_threshold=ts, end_threshold=te,
                                  winmin = 6, stepsec=5, episode_min = 1., load_proba_flag=True,path =result_path)
                    episode_perf = get_episode_metrics(ht_result,data,days_ls= day_test_idx)
                    TP += episode_perf["TP"].iloc[0]
                    FP += episode_perf["FP"].iloc[0]
                    FN += episode_perf["FN"].iloc[0]

                # Without any true positive the ratio is infinite (or undefined
                # when there is no false positive either); keep it numeric so
                # the table can still be filtered and sorted.
                if TP > 0:
                    fp_tp = FP/TP
                else:
                    fp_tp = float("inf") if FP > 0 else float("nan")

                df = pd.DataFrame({
                    "dataset": [person],
                    "Ts": [ts],
                    "Te": [te],
                    "TP": [TP],
                    "FN": [FN],
                    "FP": [FP],
                    "TPR": [round(TP/(TP+FN) if (TP+FN)>0 else 0,round_num)],
                    "FP/TP": [round(fp_tp, round_num)],
                })
                print(df)
                rows.append(df)

    # DataFrame.append was removed in pandas 2.0; concatenate all rows at once
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

def _select_extreme_row(frame, column, largest=True):
    """
    Return, as a one-row DataFrame, the row of `frame` whose `column` value is
    the largest (largest=True) or the smallest (largest=False).

    A one-row DataFrame (instead of a Series) keeps the column dtypes and the
    original index label of the selected row.
    """
    values = frame[column]
    # argmax/argmin are positional, so they must be applied to the same frame
    # that is indexed with iloc below.
    position = values.argmax() if largest else values.argmin()
    return frame.iloc[[position]]


def find_optimal_threshold(threshold_results,mode="min_fp", min_tpr= 0.85, max_fp=1.):
    """
    To find the optimal threshold for each individual model, based on threshold_results from  function test_threshold_cv()

    threshold_results: output dataframe from test_threshold_cv() function. It must contain the
        columns "dataset", "Ts", "Te", "TP", "FP", "FN", "TPR" and "FP/TP".
        NOTE: a "ratio" column ( TP / (TP + FN + FP) ) is added to this dataframe in place.
    mode:
        "min_fp"  : among rows with TPR > min_tpr pick the row with the smallest FP/TP;
                    if no row qualifies, fall back to the row with the largest TPR.
        "max_tpr" : among rows with FP/TP < max_fp pick the row with the largest TPR;
                    if no row qualifies, fall back to the row with the smallest FP/TP.
        any other value : among rows with TPR > min_tpr pick the row with the largest ratio;
                    if no row qualifies, fall back to the row with the largest TPR.
    min_tpr: lower bound (exclusive) on TPR used by the "min_fp" and default modes
    max_fp: upper bound (exclusive) on FP/TP used by the "max_tpr" mode

    return:
        best_threshold: dataframe with one row per dataset and the columns
            ["dataset","Ts","Te","TPR","FP/TP","TP","FP","FN"]
        thresholds: dict mapping dataset name -> [Ts, Te]
    """
    columns = ["dataset","Ts","Te","TPR","FP/TP","TP","FP","FN"]
    threshold_results["ratio"] = threshold_results["TP"].values/ (threshold_results["TP"].values+threshold_results["FN"].values +threshold_results["FP"].values)

    chosen = []
    for person in threshold_results["dataset"].unique():
        person_rows = threshold_results.loc[threshold_results['dataset']==person]
        # Each mode is (constraint filter, objective to optimise, fallback when
        # the constraint leaves no row); the tuples are (column, largest?).
        if mode == "min_fp":
            candidates = person_rows.loc[person_rows["TPR"]>min_tpr]
            objective, fallback = ("FP/TP", False), ("TPR", True)
        elif mode == "max_tpr":
            candidates = person_rows.loc[person_rows["FP/TP"]<max_fp]
            objective, fallback = ("TPR", True), ("FP/TP", False)
        else:
            candidates = person_rows.loc[person_rows["TPR"]>min_tpr]
            objective, fallback = ("ratio", True), ("TPR", True)

        if len(candidates) == 0:
            # The fallback has to search this person's unfiltered rows: the
            # filtered frame is empty here.
            chosen.append(_select_extreme_row(person_rows, *fallback))
        else:
            chosen.append(_select_extreme_row(candidates, *objective))

    if not chosen:
        # No dataset in the input: return empty but well-formed results.
        return pd.DataFrame(columns=columns), {}

    best_threshold = pd.concat(chosen)
    thresholds = {
        name: [ts, te]
        for name, ts, te in zip(best_threshold["dataset"].values,
                                best_threshold["Ts"].values,
                                best_threshold["Te"].values)
    }
    best_threshold = best_threshold[columns]
    return best_threshold , thresholds


    

Overwriting cross_validation.py


# Visualization utils

In [7]:
%%writefile visualization.py
from __future__ import print_function
from packages import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt


def plot_threshold_results(threshold_results, te_val = 0.1, ts_val=0.6,figsize=(12,10),legend_out=True, grid =False):
    """
    plot threshold results generated from test_threshold_cv()  function in cross_validation.py file

    A 2x2 grid is drawn on one figure:
        top row    : TPR and FP/TP against Ts, using the rows whose Te equals te_val
        bottom row : TPR and FP/TP against Te, using the rows whose Ts equals ts_val
    Every panel has one line per value of the "dataset" column.

    threshold_results: dataframe with the columns "dataset", "Ts", "Te", "TPR" and "FP/TP"
    te_val: Te value held fixed for the top row
    ts_val: Ts value held fixed for the bottom row
    figsize: figure size passed to plt.subplots
    legend_out: if True keep a single legend, placed outside the top-right panel;
                otherwise draw a legend in the upper right corner of every panel
    grid: if True toggle the grid on every panel

    return: [ax_ts_tpr, ax_ts_fp], [ax_te_tpr, ax_te_fp]  (the matplotlib axes of the two rows)
    """
    print("Fixed Te, Change Ts")
    fig, ax= plt.subplots(2,2, figsize=figsize)

    # x-range of each row follows the full range of the swept threshold
    for row, column in enumerate(("Ts", "Te")):
        low, high = threshold_results[column].min(), threshold_results[column].max()
        for j in range(2):
            ax[row,j].set_xlim(low, high)

    # Threshold grids are floats (often built with arange/linspace), so an exact
    # == comparison can silently select nothing; compare with a tolerance instead.
    te_mask = np.isclose(threshold_results['Te'].to_numpy(dtype=float), te_val)
    th_df = threshold_results[te_mask]

    fig_ts1 = sns.lineplot(data=th_df, x="Ts", y= "TPR", hue="dataset",ax=ax[0,0])
    fig_ts2 = sns.lineplot(data=th_df, x="Ts", y= "FP/TP", hue="dataset",ax=ax[0,1])
    _ = ax[0,0].set_title("Te = "+str(te_val))
    _ = ax[0,1].set_title("Te = "+str(te_val))

    ts_mask = np.isclose(threshold_results['Ts'].to_numpy(dtype=float), ts_val)
    th_df = threshold_results[ts_mask]
    fig_te1 = sns.lineplot(data=th_df, x="Te", y= "TPR", hue="dataset",ax=ax[1,0])
    fig_te2 = sns.lineplot(data=th_df, x="Te", y= "FP/TP", hue="dataset",ax=ax[1,1])
    _ = ax[1,0].set_title("Ts = "+str(ts_val))
    _ = ax[1,1].set_title("Ts = "+str(ts_val))

    panels = [fig_ts1, fig_ts2, fig_te1, fig_te2]
    if grid:
        for panel in panels:
            panel.grid()
    if not legend_out :
        for panel in panels:
            panel.legend(loc="upper right")
    else:
        # Keep only the legend of the top-right panel. A panel has no legend
        # when nothing was plotted on it, so guard before removing.
        for panel in (fig_te2, fig_ts1, fig_te1):
            legend = panel.get_legend()
            if legend is not None:
                legend.remove()
        fig_ts2.legend(loc=2, bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)

    return [fig_ts1,fig_ts2], [fig_te1,fig_te2]

def save_figs(fig_ts, fig_te, dpi=400,fig_path = "../results/images/",prefix=""):
    """
    Save the figures behind the first two entries of fig_ts and fig_te.

    fig_ts, fig_te: sequences whose items 0 and 1 expose get_figure()
                    (e.g. the two lists returned by plot_threshold_results)
    dpi: resolution passed to savefig
    fig_path: output directory prefix (concatenated as a plain string)
    prefix: text inserted between fig_path and the fixed file stem

    Four files are written, in this order: fig_ts1, fig_ts2, fig_te1, fig_te2.
    """
    outputs = (
        (fig_ts, 0, "fig_ts1"),
        (fig_ts, 1, "fig_ts2"),
        (fig_te, 0, "fig_te1"),
        (fig_te, 1, "fig_te2"),
    )
    for axes_pair, position, stem in outputs:
        figure = axes_pair[position].get_figure()
        figure.savefig(fig_path+prefix+stem, dpi = dpi)


def map_prediction_gt(meal_data, day,possib_result ):
    """
    Expand one day of window-level results to per-sample sequences.

    meal_data: object providing get_GT_segment() (returns per-day start and end
               index lists) and .data (per-day sample sequences)
    day: index of the day; used both as the row position in possib_result and
         as the index into meal_data
    possib_result: dataframe with the columns "proba", "stepsec",
                   "segment_start" and "segment_end"

    return: proba, labels, preds  -- three 1-D arrays as long as meal_data.data[day]
        proba : every window probability repeated over stepsec*15 samples
        labels: 1 inside each ground-truth segment (end index included), else 0
        preds : 1 inside each predicted segment (end index excluded), else 0
    """
    day = int(day)
    window_probs = np.array(possib_result.proba.iloc[day])
    # number of raw samples covered by one window step
    samples_per_step = possib_result["stepsec"].iloc[0] *15
    all_gt_starts, all_gt_ends = meal_data.get_GT_segment()
    gt_starts = all_gt_starts[day]
    gt_ends = all_gt_ends[day]

    proba = np.zeros([len(meal_data.data[day]), ] )
    labels = np.zeros([len(meal_data.data[day]), ] )
    preds = np.zeros([len(meal_data.data[day]), ] )

    # probability sequence: hold each window value for one step
    for k, value in enumerate(window_probs):
        lo = k*samples_per_step
        hi = (k+1)*samples_per_step
        proba[lo:hi] = value

    # ground-truth labels
    for k, first in enumerate(gt_starts):
        labels[first:gt_ends[k]+1] = 1

    # predicted labels from the hysteresis-threshold segments
    pred_starts = possib_result["segment_start"].iloc[day]
    pred_ends = possib_result["segment_end"].iloc[day]
    for k, first_window in enumerate(pred_starts):
        lo = int(first_window * samples_per_step)
        hi = int(pred_ends[k] * samples_per_step)
        preds[lo:hi] =1
    return proba, labels, preds

def map_results(meal_data,possib_result):
    """
    Run map_prediction_gt for every row position of possib_result.

    return: proba_ls, labels_ls, preds_ls -- three lists holding, per day,
            the probability, ground-truth label and prediction sequences
    """
    per_day = [
        map_prediction_gt(meal_data, day_idx, possib_result)
        for day_idx in range(len(possib_result))
    ]
    proba_ls = [triple[0] for triple in per_day]
    labels_ls = [triple[1] for triple in per_day]
    preds_ls = [triple[2] for triple in per_day]
    return  proba_ls, labels_ls, preds_ls

def get_episode_output(names= [], threshold= None, use_group_model= False,load_proba_flag=True):
    """
    Build per-sample probability, label and prediction sequences for each dataset in names.

    names: dataset (person) names to process
    threshold: dict mapping name -> [start threshold, end threshold];
               when empty or None every name gets [0.8, 0.4]
    use_group_model: if True load the CAD group model and use the "group_" result path,
                     otherwise load the model stored under '../models/<name>_models/'
    load_proba_flag: forwarded to hysteresis_threshold

    return: dataframe with the columns "dataset", "proba_ls", "labels_ls", "preds_ls",
            one row per name
    """
    if not threshold:
        threshold = {name: [0.8,0.4] for name in names}

    datasets_col, proba_col, labels_col, preds_col = [], [], [], []
    for person in names:
        datasets_col.append(person)
        meal_data = Person_MealsDataset(person_name= person, file_name = "all_files_list", winmin = 6,stridesec = 5,smooth_flag = 1,
                         normalize_flag = 1)
        pair = threshold[person]
        high_th, low_th = pair[0], pair[1]
        if not use_group_model:
            model = tf.keras.models.load_model('../models/'+ person + '_models/acti_6min_split_day_M_F_6.000000Min.h5')
            result = hysteresis_threshold(model, meal_data,start_threshold=high_th, end_threshold=low_th, winmin = 6, stepsec=5, episode_min = 1.,
                                     load_proba_flag=load_proba_flag)
        else:
            model = tf.keras.models.load_model('../models/CAD_models/acti_6min_M_F_6.000000Min.h5')
            result = hysteresis_threshold(model, meal_data,start_threshold=high_th, end_threshold=low_th, winmin = 6, stepsec=5, episode_min = 1.,
                                     load_proba_flag=load_proba_flag,path="../results/possibility_results/group_")

        day_proba, day_labels, day_preds = map_results(meal_data,result)
        proba_col.append(day_proba)
        labels_col.append(day_labels)
        preds_col.append(day_preds)

    return pd.DataFrame({"dataset":datasets_col,"proba_ls":proba_col,"labels_ls":labels_col,"preds_ls":preds_col})


def generate_possibility(dataset,fold_num=5):
    """
    Collect the cross-validation hysteresis results of one dataset and expand
    them to per-sample sequences.

    dataset: person_meal dataset; must provide data_indices (first column = day id),
             person_name, and whatever hysteresis_threshold / map_results read from it
    fold_num: number of K-fold splits over the days. The probability files are read from
              "../results/possibility_results/<person_name>/cv_fold_<fold>_", so this
              should match the number of folds those files were generated with.

    output: proba_ls, labels_ls, preds_ls, result
        proba_ls, labels_ls, preds_ls: per-day probability, ground-truth label and
            prediction sequences (see map_results)
        result: the per-fold hysteresis_threshold outputs concatenated in fold order
    """
    from sklearn.model_selection import KFold
    days = np.unique(dataset.data_indices[:,0])
    # No random_state: the split is not shuffled, and recent scikit-learn
    # versions raise ValueError when random_state is set with shuffle=False.
    kf = KFold(n_splits=fold_num, shuffle=False)

    partial_results = []
    for fold, (day_train_idx, day_test_idx) in enumerate(kf.split(days)):
        day_test_idx = day_test_idx.tolist()
        proba_path ="../results/possibility_results/{}/cv_fold_{}_".format(dataset.person_name,fold)
        # model=None: the probabilities are loaded from proba_path (load_proba_flag=True)
        partial_result = hysteresis_threshold(None, dataset,days_ls = day_test_idx,start_threshold=0.8, end_threshold=0.4,
                                                  winmin = 6, stepsec=5, episode_min = 1.,
                                                 load_proba_flag=True, path =proba_path)
        partial_results.append(partial_result)

    # DataFrame.append was removed in pandas 2.0; concat gives the same frame.
    result = pd.concat(partial_results, ignore_index=True)
    proba_ls, labels_ls, preds_ls =map_results(dataset,result)
    return proba_ls, labels_ls, preds_ls, result




def visualize_prob(offset,winsize,day, model_result="I",file_name ="possibility_seq" ):
    """
    Plot probability, prediction and ground-truth sequences of one day and save the figure.

    offset: start of the plotted window; multiplied by 15 to get a sample index
            (presumably seconds at 15 samples per second -- confirm against the data loader)
    winsize: length of the plotted window, also multiplied by 15; -1 plots the whole day.
             A window longer than the day, or running past its end, is clipped to the day.
    day: index of the day to plot
    model_result: "I" reads proba_ls / labels_ls / preds_ls, any other value reads
                  proba_ls_g / labels_ls_g / preds_ls_g
    file_name: file name (without directory) of the image saved under "../results/images/"

    Note:
        proba_ls, labels_ls, preds_ls (and the *_g variants) are not parameters: they are
        looked up as global variables of the module in which this function is defined, so
        they must be assigned there before calling.
    """
    fig_path = "../results/images/"
    day = int(day)
    if model_result =="I":
        proba_list, labels_list, preds_list = proba_ls, labels_ls, preds_ls
    else:
        proba_list, labels_list, preds_list = proba_ls_g, labels_ls_g, preds_ls_g
    proba, labels, preds = proba_list[day], labels_list[day], preds_list[day]

    total = len(labels)
    offset = offset *15
    if winsize == -1 or winsize*15 > total:
        # whole day requested, or window longer than the day
        offset = 0
        winsize = total
    else:
        winsize = winsize*15

    # Clip a window that runs past the end of the day so that the x values and
    # the plotted slices always have the same length.
    winsize = max(min(offset+winsize, total) - offset, 0)

    t = np.arange(start = offset, stop= offset+winsize, step=1)
    print("Offset: ",offset, "winszie: ",winsize,"t shape: ",t.shape, "label shape:", labels.shape, preds.shape)
    fig, ax = plt.subplots(3,1,figsize= (20,12))
    df1 = proba[offset:offset+winsize]
    df2 = np.array(preds[offset:offset+winsize])
    df3 = np.array(labels[offset:offset+winsize])
    # x/y are passed by keyword: recent seaborn versions accept them keyword-only
    sns.lineplot(x=t, y=df1 , ax =ax[0],color= 'grey',label="Possibility")
    sns.lineplot(x=t, y=df2 , ax =ax[1],color='g', linewidth=1.5,label="Prediction(Eat)")
    sns.lineplot(x=t, y=df3 , ax =ax[2],color='b', linewidth=1.5, label="Label(Eat)")

    # top panel: probability area, overlaid with ground truth (blue) and prediction (green)
    ax[0].fill_between( t, df1,
                interpolate=True, color='grey')

    ax[0].fill_between(t, df3, where=(df3==1),
                interpolate=True, color='blue')
    ax[0].fill_between( t,df2, where=(df2==1),
                interpolate=True, color='green')
    ax[0].set_ylim(0,1)
    print(offset,len(labels)-winsize )
    if offset >= len(labels)-winsize:
        title_txt ="Day: "+ str(day) + " Whole Day samples: "+str(len(labels)) +" . " + "Sample plotted: "+str(winsize)+". "
    else:
        title_txt = "Day: "+ str(day) + "Samples from "+str(offset) +"~" + str((offset+winsize)) +". "+ "Number of Sample plotted: "+str(winsize)+". "
    ax[0].set_title(title_txt)
    ax[2].set_xlabel("index of sample")
    ax[0].set_ylabel("Possibility")
    ax[1].set_ylabel("Predictions")
    ax[2].set_ylabel("Ground Truth")

    ax[0].legend(["Possibility"],loc='upper left')
    ax[1].legend(["1: Eat, 0:Other"],loc='upper left')
    ax[2].legend(["1: Eat, 0:Other"],loc='upper left')
    fig.savefig(fig_path+file_name, dpi = 80)
    return

Overwriting visualization.py
