# 1.2 Data Transformation and Preprocessing
### Settings: 4 CPUs, 69 GB memory, chunk: 2, GPU model: P100, number of GPUs: 2

In [1]:
import numpy as np
import pandas as pd
import torch
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: shown as this notebook cell's output.
device


device(type='cuda')

In [2]:
%%writefile dataset.py


from data_loader import *
import numpy as np
import pandas as pd
import torch
import sklearn
import os
import random


def create_train_test_file_list(file_name= "all_files_list.txt",person_name = 'wenkanw',
                     out_path = "../data-file-indices/",root_path= "../",
                     test_ratio = 0.2, print_flag = True):
    """
    Split a person's recording files into train/test lists and write them to disk.

    Two modes, selected by ``person_name``:

    * ``"CAD"``: the list is read from ``<out_path>/CAD/batch-unix.txt`` (one
      entry per line, blank lines ignored). Its raw content is copied to
      ``all_files_list.txt`` and the LAST ``int(n * test_ratio)`` entries
      become the test set.
    * anything else: ``<root_path>data/IndividualData`` is walked and every
      file whose name contains ".shm", whose path contains ``person_name``
      and not "test", and that has a sibling "-events.txt" file is kept.
      The sorted list is written to ``file_name``; when ``test_ratio > 0``
      the FIRST ``int(n * test_ratio)`` entries become the test set.

    Args:
        file_name: name of the "all files" list (used in the non-CAD mode only).
        person_name: sub-directory of ``out_path`` to write into, also used as
            a path filter in the non-CAD mode.
        out_path: directory holding one index folder per person.
        root_path: prefix of the ``data/`` directory.
        test_ratio: fraction of files assigned to the test set.
        print_flag: print the resulting lists when True.

    Returns:
        None. Results are written to ``train_files.txt``, ``test_files.txt``
        and the "all files" list inside ``<out_path>/<person_name>/``.
    """
    person_dir = os.path.join(out_path, person_name)

    def _write(name, text):
        # `with` guarantees the handle is closed even if the write fails.
        with open(os.path.join(person_dir, name), "w") as fp:
            fp.write(text)

    if person_name == "CAD":
        with open(os.path.join(person_dir, "batch-unix.txt"), "r") as fp:
            txt = fp.read()
        # save all file list (raw copy of the batch file)
        _write("all_files_list.txt", txt)

        # Drop every blank line instead of `list.remove("")`, which raised
        # ValueError when the file had no trailing newline and left other
        # blank lines in place.
        entries = [line + "\n" for line in txt.split("\n") if line.strip()]
        test_size = int(len(entries) * test_ratio)
        split_at = len(entries) - test_size
        train = "".join(entries[:split_at])
        test = "".join(entries[split_at:])

        _write("test_files.txt", test)
        _write("train_files.txt", train)

        if print_flag:
            print("Train:", split_at)
            print(train)
            print("test: ",test_size)
            print(test)
        return

    data_prefix = root_path + "data/"
    new_files = []
    for dirname, _, filenames in os.walk(root_path + 'data/IndividualData'):
        siblings = set(filenames)  # O(1) lookup of the matching label file
        for filename in filenames:
            path = os.path.join(dirname, filename)
            # keep only this person's .shm recordings that are not test files
            if ".shm" not in filename or person_name not in path or 'test' in path:
                continue
            # a recording is valid only when its label (events) file exists too
            if filename.replace(".shm","-events.txt") in siblings:
                new_files.append(path.replace(data_prefix, "") + "\n")

    new_files.sort()
    # Make sure the destination exists before writing any list.
    os.makedirs(person_dir, exist_ok=True)

    if test_ratio > 0.:
        # split train files and test files
        test_size = int(len(new_files)*test_ratio)
        train = "".join(new_files[test_size:])
        test = "".join(new_files[:test_size])
        _write("train_files.txt", train)
        _write("test_files.txt", test)

        if print_flag:
            print("Train:")
            print(train)
            print("test: ")
            print(test)

    all_files = "".join(new_files)
    _write(file_name, all_files)

    if print_flag:
        print("All files: ")
        print(all_files)
        





class Person_MealsDataset(torch.utils.data.Dataset):
    """
    Windowed time-series dataset for one person.

    Windows are not materialised up front: ``self.data`` holds the per-file
    arrays and ``self.data_indices`` holds one row per window whose first
    three columns are (file index, start, end); column 3 is read as a time
    offset when ``enable_time_feat`` is set. ``self.labels[i]`` is the label
    of window ``i``.

    Args:
        dataset: pre-built dataset object; when given (not None) nothing is
            loaded from disk.
        person_name: sub-directory of ``data_indices_file`` to read from.
        data_indices_file: directory holding the per-person file lists.
        file_name: "train", "test", or the stem of another ``.txt`` list.
        remove_trend, remove_walk, remove_rest, smooth_flag, normalize_flag,
        winmin, stridesec, gtperc, ratio_dataset, debug_flag: forwarded
            unchanged to ``load_train_test_data`` (semantics defined there).
        device: device on which ``get_item`` creates tensors.
        load_splitted_dataset: when True, all windows are materialised into
            a DataFrame right after loading.
        enable_time_feat: append a time column to every window.
    """
    # NOTE(review): `tqdm` and `load_train_test_data` are not imported by name
    # in this module; presumably they come from `from data_loader import *`
    # -- confirm.

    def __init__(self, dataset = None,person_name= "wenkanw",
                 data_indices_file = "../data-file-indices/",
                 file_name = "all_files_list",
                 remove_trend = 0,
                 remove_walk = 0,
                 remove_rest = 0,
                 smooth_flag = 1,
                 normalize_flag = 1,
                 winmin = 6,
                 stridesec = 15,
                 gtperc = 0.5,
                 device = 'cpu',
                 ratio_dataset=1,
                load_splitted_dataset = False,
                 enable_time_feat = False,
                 debug_flag= False
                ):

        if file_name == "train":
            file_name = data_indices_file + person_name +"/"+"train_files.txt"
        elif file_name == "test":
            file_name = data_indices_file + person_name +"/"+"test_files.txt"
        else:
            file_name = data_indices_file + person_name +"/"+ file_name+".txt"

        # Note: file_name is the path of the file that lists the shm files' names
        self.file_name = file_name
        self.dataset = dataset

        self.winmin = winmin
        self.stridesec = stridesec
        self.load_splitted_dataset = load_splitted_dataset
        self.remove_trend = remove_trend
        self.remove_walk = remove_walk
        self.remove_rest = remove_rest
        self.smooth_flag = smooth_flag
        self.normalize_flag = normalize_flag
        # Fixed: a stray trailing comma used to store the 1-tuple `(gtperc,)`.
        self.gtperc = gtperc
        self.ratio_dataset = ratio_dataset
        self.enable_time_feat = enable_time_feat
        self.device = device
        self.debug_flag= debug_flag
        # `is None` instead of truthiness: `not DataFrame` raises ValueError.
        if self.dataset is None:
            self.get_data(person_name)

    def get_data(self, person_name):
        """
        Load the raw data, window indices and labels via ``load_train_test_data``.

        ``person_name`` is accepted for interface compatibility; the file list
        actually used is ``self.file_name``.
        """
        # Returned tuple: files_counts, data, samples_indices, labels_array.
        # Note: the data preprocessing in this function is for global time series dataset.
        # The loader is always called with load_splitted_dataset=False; the
        # optional materialisation happens below instead.
        self.dataset, self.data, self.data_indices, self.labels = load_train_test_data(data_file_list =self.file_name,
                                load_splitted_dataset = False,
                                 ratio_dataset=self.ratio_dataset,
                                 enabled_time_feat = self.enable_time_feat,
                                 winmin = self.winmin, stridesec = self.stridesec,gtperc = self.gtperc,
                                 removerest = self.remove_rest,
                                 removewalk = self.remove_walk, smooth_flag = self.smooth_flag, normalize_flag=self.normalize_flag,
                                 remove_trend = self.remove_trend,
                                 debug_flag=self.debug_flag )

        if self.load_splitted_dataset:
            self.dataset = self.get_dataset()

    def __getitem__(self, index):
        """Return the ``(data, label)`` pair of window ``index`` (data as a tensor)."""
        # Exactly ONE sample is read per call; batching is left to the DataLoader.
        data = self.get_item(index)
        return data['data'],data['label']

    def __len__(self):
        """Number of samples: rows of the materialised dataset, else number of windows."""
        return  len(self.dataset) if self.load_splitted_dataset else len(self.data_indices)

    def get_item(self, index, tensor_type=True):
        """
        Build one sample.

        Args:
            index: row of ``self.data_indices`` to read.
            tensor_type: return the window as a float tensor on ``self.device``
                when True, otherwise as the raw array.

        Returns:
            dict with keys ``"data"`` (the window) and ``"label"``.
        """
        f,start_time, end_time = self.data_indices[index,0], self.data_indices[index,1], self.data_indices[index,2]
        sample = self.data[f][start_time : end_time]
        # Add time feature to data
        if self.enable_time_feat:
            time_offset = self.data_indices[index,3]
            freq = 1.0/15.0  # hard-coded step per row -- presumably 15 Hz sampling, confirm
            # column vector 0..len-1, scaled and shifted by the window start and file offset
            time_feat = np.arange(len(sample), dtype=float).reshape(-1, 1)
            time_feat *= freq
            time_feat += float(start_time)* freq
            time_feat += time_offset
            sample = np.concatenate((sample, time_feat),axis=1)
        label = self.labels[index]
        if tensor_type:
            data = {"data":torch.tensor(sample, dtype =torch.float, device =  self.device ), 'label': label}
        else:
            data = {"data":sample, 'label': label}
        return data

    def get_dataset(self, start_index = None, end_index = None):
        """
        Materialise windows ``[start_index, end_index)`` into a DataFrame.

        Defaults cover the whole dataset. The result (columns ``data`` and
        ``label``) is also stored in ``self.dataset``.
        """
        # `is None` so that an explicit 0 is honoured instead of being
        # treated as "not given".
        start_i = 0 if start_index is None else start_index
        end_i = self.__len__() if end_index is None else end_index

        # Collect rows first and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        rows = [self.get_item(i) for i in tqdm(range(start_i, end_i))]
        self.dataset = pd.DataFrame(rows, columns=['data','label'])
        return self.dataset

    def sample(self, num = 1000,random_state = None):
        """
        Draw ``num`` windows at random (with replacement) for analysis.

        Note: a non-None ``random_state`` reseeds NumPy's GLOBAL generator.

        Returns:
            DataFrame with columns ``data`` (NumPy arrays) and ``label``.
        """
        if random_state is not None:
            np.random.seed(random_state)

        indices = np.random.choice(len(self.labels), num)
        rows = []
        for i in tqdm(indices):
            data = self.get_item(i)
            # .cpu() first: Tensor.numpy() raises for tensors living on a GPU.
            data["data"] = data["data"].cpu().numpy()
            rows.append(data)
        return pd.DataFrame(rows, columns=['data','label'])

    def get_subset(self, indices_ls):
        """
        Stack the windows listed in ``indices_ls`` into NumPy arrays.

        Returns:
            ``(subsetData, subsetLabels)``: the stacked windows and their labels.
        """
        axdata = []
        aydata = []
        for i in indices_ls:
            data = self.get_item(i, tensor_type=False)
            axdata.append(data['data'])
            aydata.append(data['label'])
        subsetData = np.array(axdata, copy=True)
        subsetLabels = np.array(aydata, copy=True)
        return subsetData, subsetLabels
        
        
            
                
def balance_data_indices(labels, sample_num = 4000,mode= "under", replace = False,shuffle=True, random_state = 1000):
    """
    Return sample indices with an equal number of positives and negatives.

    A label counts as positive ("eat") when it is >= 0.5.

    Args:
        labels: iterable of numeric labels.
        sample_num: number of samples per class; used in "mix" mode only.
        mode:
            "under" - keep all positives, draw as many negatives as there
                      are positives
            "over"  - keep all negatives, draw (with replacement) as many
                      positives as there are negatives
            any other value ("mix") - draw ``sample_num`` indices from each class
        replace: sample with replacement in "mix" mode and in "under" mode.
            In "under" mode replacement is also used automatically when there
            are fewer negatives than positives.
        shuffle: shuffle the combined index list in place before returning.
        random_state: seed for NumPy's GLOBAL generator (``np.random.seed``);
            pass None to leave the generator untouched.

    Returns:
        list of balanced indices (positives followed by negatives unless shuffled).
    """
    eat_labels_index = [i for i, e in enumerate(labels) if e >= 0.5]
    not_eat_labels_index = [i for i, e in enumerate(labels) if e < 0.5]
    eat_index = eat_labels_index
    not_eat_index = not_eat_labels_index
    if random_state is not None:
        np.random.seed(random_state)

    if mode == "over":
        # Oversampling needs more draws than there are positives, so it is
        # always done with replacement.
        eat_index = np.random.choice(eat_labels_index,len(not_eat_labels_index)).tolist()
    elif mode == "under":
        # Fixed: negatives used to be drawn with replacement, producing
        # duplicates while discarding distinct negatives that were available.
        n_keep = len(eat_labels_index)
        with_replacement = replace or n_keep > len(not_eat_labels_index)
        not_eat_index = np.random.choice(not_eat_labels_index, n_keep,
                                         replace = with_replacement).tolist()
    else:
        # default as mix
        eat_index = np.random.choice(eat_labels_index,sample_num, replace = replace).tolist()
        not_eat_index = np.random.choice(not_eat_labels_index,sample_num, replace = replace).tolist()

    indices_balanced = eat_index + not_eat_index
    if shuffle:
        np.random.shuffle(indices_balanced)

    return indices_balanced


Overwriting dataset.py


In [14]:
%%writefile utils.py

from tqdm import tqdm
from sklearn.metrics import classification_report,recall_score, precision_score
import torch
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.metrics import classification_report,recall_score, precision_score


def eval_model(model,dataloader,device="cpu"):
    """
    Evaluate a binary classifier that outputs one logit per sample.

    A sample is predicted positive when ``round(sigmoid(logit))`` is 1.
    The model's train/eval mode is NOT changed here; callers are expected to
    call ``model.eval()`` beforehand if they need it.

    Args:
        model: callable mapping a batch of samples to logits.
        dataloader: iterable of ``(samples, labels)`` tensor pairs.
        device: device the batches are moved to.

    Returns:
        ``(acc, recall)``: accuracy in percent and recall in [0, 1].
        Accuracy is 0.0 for an empty loader and recall is 0.0 when the data
        contains no positive label (both used to raise ZeroDivisionError).
    """
    correct = 0.
    total = 0.
    TP = 0
    FN = 0
    # no gradients are needed for evaluation
    with torch.no_grad():
        for samples, labels in dataloader:
            samples = samples.to(device)
            labels = labels.to(device)
            outputs = model(samples).to(device).squeeze()
            if outputs.dim() == 0:
                # squeeze() collapses a batch of one sample to a scalar;
                # restore the batch axis so counting works per sample.
                outputs = outputs.unsqueeze(0)
            preds = (torch.round(torch.sigmoid(outputs)) >= 0.5).to(dtype = torch.float)

            # Vectorised counts instead of a Python loop over tensor elements.
            positives = labels == 1
            TP += torch.sum((preds == 1) & positives).item()
            FN += torch.sum((preds == 0) & positives).item()
            correct += torch.sum((preds == labels)).item()
            total += float(len(labels))

    acc = 100 * correct / total if total > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    return acc, recall
            
            
            
            

def train_model(model,dataloader, optimizer, criterion,lrscheduler,device="cpu" , n_epochs=20,
                earlystopping=True, patience= 3, l1_enabled=True,checkpoint_name ="checkpoint.pt" ):
    """
    Train a binary classifier and keep the model with the best validation accuracy.

    Args:
        model: network producing one logit per sample; must provide
            ``l1_loss(factor)`` when ``l1_enabled`` is True.
        dataloader: pair ``(train_dataloader, valid_dataloader)``.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(logits, float labels)``.
        lrscheduler: scheduler whose ``step`` accepts the validation accuracy.
        device: device the batches are moved to.
        n_epochs: maximum number of epochs.
        earlystopping: stop once validation accuracy has dropped for
            ``patience`` consecutive epochs.
        patience: number of consecutive drops tolerated before stopping.
        l1_enabled: add ``model.l1_loss(0.01)`` to the loss.
        checkpoint_name: path the best model is saved to with ``torch.save``.

    Returns:
        ``(model, best_model, best_val_score, loss_ls, train_acc_ls, valid_acc_ls)``.
        The three lists start with a 0.0 placeholder followed by one entry
        per completed epoch. ``best_model`` is a copy of the model taken when
        the best validation accuracy was reached (``model`` itself if no
        epoch ran).
    """
    # Local import keeps this block self-contained.
    import copy

    loss_ls = [0.0]
    train_acc_ls = [0.0]
    valid_acc_ls = [0.0]
    patience_count = 0
    best_val_score = 0.0
    prev_val_score = 0.0
    best_model = None

    train_dataloader, valid_dataloader = dataloader
    print("Training set batch amounts:", len(train_dataloader))
    print("Test set :", len(valid_dataloader))
    print("Start Training..")

    for e in range(n_epochs):
        running_loss = 0.0
        correct_cnt = 0.0
        total_cnt = 0.0
        TP = 0.
        FN = 0.
        model.train()
        for samples, labels in train_dataloader:
            samples = samples.to(device)
            labels = labels.to(device, dtype=torch.float32)

            optimizer.zero_grad()
            outputs = model(samples).squeeze()
            if outputs.dim() == 0:
                # squeeze() collapses a batch of one sample to a scalar;
                # restore the batch axis so shapes match the labels.
                outputs = outputs.unsqueeze(0)

            # Compute loss
            loss = criterion(outputs, labels)
            if l1_enabled:
                loss = loss + model.l1_loss(0.01).to(device)
            loss.backward()
            optimizer.step()

            # prediction (bookkeeping only, no gradients needed)
            with torch.no_grad():
                preds = (torch.round(torch.sigmoid(outputs)) >= 0.5).to(dtype = torch.float)
                # Vectorised TP / FN counts instead of a per-element Python loop.
                positives = labels == 1.
                TP += torch.sum((preds == 1.) & positives).item()
                FN += torch.sum((preds == 0.) & positives).item()
                correct_cnt += torch.sum((preds == labels)).item()

            running_loss += loss.item()
            total_cnt += float(len(labels))

        # Compute accuracy and loss of one epoch (guarded against empty loaders
        # and epochs without any positive label).
        n_batches = len(train_dataloader)
        epoch_loss = running_loss / n_batches if n_batches > 0 else 0.0
        epoch_acc = 100* correct_cnt/ total_cnt if total_cnt > 0 else 0.0  # in percentage
        train_recall = TP/(TP+FN) if (TP + FN) > 0 else 0.0

        # Validation mode
        model.eval()
        valid_acc, valid_recall= eval_model(model,valid_dataloader,device=device)

        # record loss and accuracy
        valid_acc_ls.append(valid_acc)
        train_acc_ls.append(epoch_acc)
        loss_ls.append(epoch_loss)

        print("Epoch: %d,  Epoch_Loss: %.4f, Train Acc: %.4f %%, Train Recall: %.4f "%(e, epoch_loss,
                                                                                 epoch_acc,train_recall))
        print("Validation Acc:  %.4f %%,  Validation Recall: %.4f "%(valid_acc, valid_recall))

        # Reset train mode
        model.train()
        lrscheduler.step(valid_acc)

        # Early stopping: count consecutive epochs whose validation accuracy dropped.
        if earlystopping:
            if prev_val_score !=0. and valid_acc < prev_val_score :
                patience_count += 1
            else:
                patience_count = 0

            if patience_count >= patience:
                break

            prev_val_score = valid_acc

        # Track the best model regardless of `earlystopping`. Previously the
        # checkpoint was only written when early stopping was enabled, yet it
        # was always loaded afterwards (missing or stale file otherwise).
        if valid_acc > best_val_score or best_val_score == 0.0:
            best_val_score = valid_acc
            torch.save(model,checkpoint_name)
            # In-memory copy returned to the caller; avoids re-reading a fully
            # pickled model, which newer torch.load defaults refuse.
            best_model = copy.deepcopy(model)
            print("Checkpoint Saved")

        print("\n")

    if best_model is None:
        # No epoch ran, so there is nothing better than the model itself.
        best_model = model
    print("Load Best Model.")
    print("Training completed")

    return model, best_model,best_val_score,loss_ls, train_acc_ls, valid_acc_ls
            

def plot_data(train_acc_ls,valid_acc_ls,loss_ls ):
    """
    Draw the accuracy curves and the training-loss curve side by side.

    Left panel: training and validation accuracy per epoch.
    Right panel: training loss on a log scale, skipping the first entry
    of ``loss_ls``.
    """
    epoch_axis = list(range(len(train_acc_ls)))
    fig, (acc_ax, loss_ax) = plt.subplots(1,2,figsize=(20,5))

    # accuracy panel: training curve first, then validation
    for series in (train_acc_ls, valid_acc_ls):
        sns.lineplot(x=epoch_axis, y=series, ax=acc_ax)
    acc_ax.set_xlabel("Epoches")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend(["Training Accuracy", "Validation Accuracy"])

    # loss panel
    sns.lineplot(x=epoch_axis[1:], y=loss_ls[1:], ax=loss_ax)
    loss_ax.set_xlabel("Epoches")
    loss_ax.set_ylabel("Training Loss")
    loss_ax.set(yscale="log")

    plt.show()
    
    
            

Overwriting utils.py


In [6]:
%%writefile model.py

import torch
from torch import nn
from torch.nn import functional as F


class Discriminator_BN_Bias(nn.Module):
    """
    1-D CNN classifier: three strided convolutions (batch norm after the
    second and third), average pooling and two linear layers.

    ``forward`` returns raw scores (no sigmoid/softmax is applied).

    Args:
        ngpu: stored on the instance; not used by this class.
        input_shape: unpacked as ``(in_channels, win_size)``. ``forward``
            permutes its input to channels-first, so inputs are expected as
            ``(batch, win_size, in_channels)``.
        out_fea: number of output scores.
        bias: enable the bias of the convolutions and of ``linear2``.
    """
    def __init__(self, ngpu, input_shape , out_fea = 1, bias=True):
        super(Discriminator_BN_Bias, self).__init__()

        in_channels, win_size = input_shape[0], input_shape[1]
        self.in_channels = in_channels
        self.ngpu = ngpu
        self.out_fea = out_fea
        filter_num = 10
        pool_size = 10

        self.conv0 = nn.Conv1d(in_channels = self.in_channels,
                               out_channels = filter_num,
                               kernel_size= 44,  stride= 2, padding=0, bias=bias)
        self.relu0= nn.ReLU()
        num_fea = (win_size-44)//2 +1  # sequence length after conv0

        self.conv1 = nn.Conv1d(filter_num,filter_num, kernel_size= 20,stride= 2, padding=0, bias=bias)
        self.relu1= nn.ReLU()
        num_fea = (num_fea-20)//2 +1  # ... after conv1
        self.bn1 = nn.BatchNorm1d(filter_num)

        self.conv2 = nn.Conv1d(filter_num,filter_num, kernel_size= 4, stride= 2, padding=0, bias=bias)
        self.relu2= nn.ReLU()
        num_fea = (num_fea-4)//2 +1  # ... after conv2
        self.bn2 = nn.BatchNorm1d(filter_num)

        self.avgpool = nn.AvgPool1d(kernel_size=pool_size)
        self.flatten = nn.Flatten()

        # Build linear1 up front from the computed feature count so that it is
        # part of parameters() / .to(device) / state_dict() before the first
        # forward pass. forward() still rebuilds it if the real input
        # disagrees with `win_size`.
        flat_fea = filter_num * (num_fea // pool_size)
        self.linear1 = self._build_linear1(flat_fea) if flat_fea > 0 else None
        self.relu4 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=200, out_features=out_fea, bias=bias)
        # softmax / sigmoid are kept as attributes but are not applied in forward()
        self.softmax = nn.Softmax(dim=out_fea)
        self.sigmoid = nn.Sigmoid()

        nn.init.normal_(self.conv0.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv1.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv2.weight.data, 0.0, 1.)
        if self.linear2.bias is not None:
            # linear2 has no bias tensor when bias=False
            nn.init.normal_(self.linear2.bias.data, 0.0, 1.)

    @staticmethod
    def _build_linear1(in_features):
        """Create the first dense layer (200 units) with N(0, 1) initialisation."""
        layer = nn.Linear(in_features=in_features, out_features=200, bias=True)
        nn.init.normal_(layer.weight.data, 0.0, 1.)
        nn.init.normal_(layer.bias.data, 0.0, 1.)
        return layer

    def l1_loss(self,factor=0.01):
        """
        Return ``factor`` times the sum of absolute values of all
        convolution parameters (weights and biases).
        """
        # abs().sum() equals a summed L1 loss against zeros, but stays on the
        # parameters' own device (a CPU zeros tensor broke GPU training).
        reg_loss = 0.
        for layer in (self.conv0, self.conv1, self.conv2):
            for p in layer.parameters():
                reg_loss = reg_loss + p.abs().sum()
        return factor * reg_loss

    def forward(self, input):
        """Map ``(batch, win_size, in_channels)`` inputs to ``(batch, out_fea)`` scores."""
        x = input.permute(0,2,1)  # to channels-first for Conv1d
        x = self.conv0(x)
        x = self.relu0(x)
        x = self.conv1(x)
        x=  self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x=  self.bn2(x)
        x = self.relu2(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        if self.linear1 is None or self.linear1.in_features != x.shape[1]:
            # Fallback when the window length given at construction does not
            # match the data. NOTE: a layer created here is unknown to any
            # optimizer built earlier, so it would not be trained.
            self.linear1 = self._build_linear1(x.shape[1]).to(device=x.device, dtype=x.dtype)

        x = self.relu4(self.linear1(x))
        out = self.linear2(x)

        return out



class BasicBlock(nn.Module):
    """
    Residual block: conv-bn-relu-conv-bn, plus the block input, then relu.

    Both convolutions use kernel size 3 with padding 1, so the sequence
    length is preserved.

    Args:
        in_channels: channels of the block input.
        bias: enable the convolutions' bias.
        filter_num: channels produced by both convolutions.
    """
    def __init__(self, in_channels ,bias= True, filter_num = 10):
        super(BasicBlock, self).__init__()
        self.conv0 = nn.Conv1d(in_channels = in_channels,
                               out_channels = filter_num,
                               kernel_size= 3,  stride= 1, padding=1, bias=bias)
        self.bn0 = nn.BatchNorm1d(filter_num)
        self.relu0 = nn.ReLU()

        # Fixed: conv1 consumes conv0's output, which has `filter_num`
        # channels (it used to be declared with `in_channels`).
        self.conv1 = nn.Conv1d(in_channels = filter_num,
                               out_channels = filter_num,
                               kernel_size= 3,  stride= 1, padding=1, bias=bias)
        self.bn1 = nn.BatchNorm1d(filter_num)
        self.relu1 = nn.ReLU()

        # The skip connection needs matching channel counts; project the
        # input with a 1x1 convolution only when they differ, so the block is
        # unchanged (no extra parameters) when in_channels == filter_num.
        if in_channels != filter_num:
            self.shortcut = nn.Conv1d(in_channels, filter_num, kernel_size=1, bias=bias)
        else:
            self.shortcut = None

    def forward(self, x):
        """Apply the block to a ``(batch, in_channels, length)`` tensor."""
        residual = x if self.shortcut is None else self.shortcut(x)
        out = self.conv0(x)
        out = self.bn0(out)
        out = self.relu0(out)
        out = self.conv1(out)
        out = self.bn1(out)

        out = out + residual
        out = self.relu1(out)

        return out

class Discriminator_ResNet(nn.Module):
    """
    1-D CNN classifier: two strided convolutions, two residual
    ``BasicBlock``s, average pooling and two linear layers.

    ``forward`` returns raw scores (no sigmoid/softmax is applied).

    Args:
        ngpu: stored on the instance; not used by this class.
        input_shape: unpacked as ``(in_channels, win_size)``. ``forward``
            permutes its input to channels-first, so inputs are expected as
            ``(batch, win_size, in_channels)``.
        out_fea: number of output scores.
    """
    def __init__(self, ngpu, input_shape , out_fea = 1):
        super(Discriminator_ResNet, self).__init__()

        in_channels, win_size = input_shape[0], input_shape[1]
        self.in_channels = in_channels
        self.ngpu = ngpu
        self.out_fea = out_fea
        filter_num = 10
        pool_size = 10

        self.conv0 = nn.Conv1d(in_channels = self.in_channels,
                               out_channels = filter_num,
                               kernel_size= 44,  stride= 2, padding=0, bias=True)
        self.relu0= nn.ReLU()
        num_fea = (win_size-44)//2 +1  # sequence length after conv0

        self.conv1 = nn.Conv1d(filter_num,filter_num, kernel_size= 20,stride= 2, padding=0, bias=True)
        self.relu1= nn.ReLU()
        num_fea = (num_fea-20)//2 +1  # ... after conv1
        self.bn1 = nn.BatchNorm1d(filter_num)

        # the residual blocks keep both channel count and sequence length
        self.block1 =  BasicBlock( in_channels=filter_num  ,bias= True, filter_num = filter_num)
        self.block2 =  BasicBlock( in_channels=filter_num  ,bias= True, filter_num = filter_num)

        self.avgpool = nn.AvgPool1d(kernel_size=pool_size)
        self.flatten = nn.Flatten()

        # Build linear1 up front from the computed feature count so that it is
        # part of parameters() / .to(device) / state_dict() before the first
        # forward pass. forward() still rebuilds it if the real input
        # disagrees with `win_size`.
        flat_fea = filter_num * (num_fea // pool_size)
        self.linear1 = self._build_linear1(flat_fea) if flat_fea > 0 else None
        self.relu2 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=200, out_features=out_fea, bias=True)
        # softmax / sigmoid are kept as attributes but are not applied in forward()
        self.softmax = nn.Softmax(dim=out_fea)
        self.sigmoid = nn.Sigmoid()

        nn.init.normal_(self.conv0.weight.data, 0.0, 1.)
        nn.init.normal_(self.conv1.weight.data, 0.0, 1.)
        nn.init.normal_(self.linear2.bias.data, 0.0, 1.)

    @staticmethod
    def _build_linear1(in_features):
        """Create the first dense layer (200 units) with N(0, 1) initialisation."""
        layer = nn.Linear(in_features=in_features, out_features=200, bias=True)
        nn.init.normal_(layer.weight.data, 0.0, 1.)
        nn.init.normal_(layer.bias.data, 0.0, 1.)
        return layer

    def l1_loss(self,factor=0.01):
        """
        Return ``factor`` times the sum of absolute values of the parameters
        of ``conv0`` and ``conv1`` (weights and biases).
        """
        # abs().sum() equals a summed L1 loss against zeros, but stays on the
        # parameters' own device (a CPU zeros tensor broke GPU training).
        reg_loss = 0.
        for layer in (self.conv0, self.conv1):
            for p in layer.parameters():
                reg_loss = reg_loss + p.abs().sum()
        return factor * reg_loss

    def forward(self, input):
        """Map ``(batch, win_size, in_channels)`` inputs to ``(batch, out_fea)`` scores."""
        x = input.permute(0,2,1)  # to channels-first for Conv1d
        x = self.conv0(x)
        x = self.relu0(x)
        x = self.conv1(x)
        x=  self.bn1(x)
        x = self.relu1(x)
        x = self.block1(x)
        x = self.block2(x)

        x = self.avgpool(x)
        x = self.flatten(x)
        if self.linear1 is None or self.linear1.in_features != x.shape[1]:
            # Fallback when the window length given at construction does not
            # match the data. NOTE: a layer created here is unknown to any
            # optimizer built earlier, so it would not be trained.
            self.linear1 = self._build_linear1(x.shape[1]).to(device=x.device, dtype=x.dtype)

        x = self.relu2(self.linear1(x))
        out = self.linear2(x)

        return out

Overwriting model.py
