In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from collections import Counter

### Helper Function

In [2]:
#hardcoded here, not use proportional 
def downsample(dataset, proportional = 2):
    '''
    dataset: orginal pickle final
    proportional: the number of majority class/the number of minority class
    '''
    ori_patient_feature = dataset[0] #before cleaning, original data
    ori_label = dataset[1]
    ori_label_time = dataset[2]
    
    down_sample_index = np.random.randint(0,Counter(ori_label)[0]+1,3535)
    downsampled_features = ori_patient_feature[ori_label==0][down_sample_index]
    downsampled_label = ori_label[ori_label==0][down_sample_index]
    downsampled_label_time = ori_label_time[ori_label==0][down_sample_index]
    
    new_features = np.concatenate([downsampled_features, ori_patient_feature[ori_label==1]])
    new_label = np.concatenate([downsampled_label, ori_label[ori_label==1]])
    new_label_time = np.concatenate([downsampled_label_time, ori_label_time[ori_label==1]])
    
    return new_features,new_label,new_label_time

### Helper function
def block_tri(group_size,scale, num_units, mode):
    '''
    group_size: the size of each group
    num_units: group_size x sum(scale)
    mode: the way of connection, original, shift, fully connect
    return: tensor mask
    '''
    mtrx = np.zeros((num_units, num_units))
    if mode == 'original':
        for i in range(int(num_units/group_size)):
            mtrx[i*group_size:(i+1)*group_size, i*group_size:] = 1
    elif mode == 'shift': 
        mtrx = np.zeros((num_units, num_units))
        for i in range(int(num_units/group_size)):
            refer_li = sum([[i]*i for i in scale],[])
            length = refer_li[i]
            sequence = list(np.arange(i, num_units/group_size))
            #print(sequence)
            sequence = [int(j) for j in sequence if (j - i)%length == 0]
            ##print(sequence)
            for index in sequence:
                mtrx[index*group_size:(index+1)*group_size,i*group_size:(i+1)*group_size] = 1
    return mtrx 

#def activate_index()

def activate_index(timestep, num_units, group_size, scale,index_li,batch_size,mode,input_size = 48):
    '''
    timestep: the current timestep in a sequence
    num_units: dimension of hidden layer
    group_size: number of nodes in each group
    scale: the range of update frequency
    index_li: the index of each scale start point. dictionary. scale: position
    input_size: the feature dim for patient
    return: a matrix with 0 and 1. 1 for active rows seperately for linear layer h and i
    '''
    activation_map = np.zeros((batch_size, num_units))
    if mode == 'original':
        for i in scale:
            if timestep%i ==0:
                #print(i)
                index_temp = index_li[i]
                activation_map[:, index_temp*group_size:(index_temp + 1)*group_size] = np.ones((batch_size,group_size))
    elif mode == 'shift':
        for i in scale:
            remain = timestep%i
            if remain == 0:
                index_temp = index_li[i]
                activation_map[:,index_temp*group_size:(index_temp+1)*group_size] = np.ones((batch_size,group_size))
            else:
                index_temp = i - remain + index_li[i]
                activation_map[:, index_temp*group_size:(index_temp+1)*group_size] = np.ones((batch_size,group_size))
    return torch.from_numpy(activation_map).float()
### make it a tensor

def padding_fun(data, labels, label_time):
    '''
    This is going to pad the data in the front.
    data: a batch of patient feature. now, a list (with length as batch_size) of array of T * 48
    labels: a list of zero and one
    return target: padded_data should be batch x 48 x max_length. called in the data loader function (this is different from context window since window has fixed size and can be done before)
           length: original length of each patient records
    '''
    max_length = max(label_time)
    target_data = np.array([np.pad(i,((max_length - len(i),0),(0,0)), 'constant', constant_values = 0) for i in data])
    target_label = np.array([np.pad([labels[i]]*label_time[i],(max_length - label_time[i],0),'constant', constant_values = -1 ) for i in range(len(labels))])
    #flip
    target_label = np.flip(target_label, axis = 1)   
    #padding -1 here, so would be ignored when calculating loss
    #print(data[i].shape)
    target = np.array(data)
    #print(target.shape)#for debug
    return target_data, target_label

In [3]:
class MIMICDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, label_list, label_time_li):
        """
        @param data_list: list of datapoints, each element is a embedding matrix
        """
        self.data_list = data_list
        self.label_list = label_list
        self.label_time_list = label_time_li
        
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        return (self.data_list[key], self.label_list[key], self.label_time_list[key])

#training data in this case should be an array of arrays. for each array, it is an array of size T*48 
def MIMIC_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    label_time_list = []
    for data_point in batch:#batch is a list of items(selected by index), each is imdb_train[i]#which is by above get item
        data_list.append(data_point[0])
        label_list.append(data_point[1])
        label_time_list.append(data_point[2])

    #return a batch of padded data
    new_data, label_list = padding_fun(data_list, label_list, label_time_list) #should be batch x max_len x 48    batch x max_length
    return [torch.from_numpy(new_data).float(),torch.from_numpy(np.array(label_list)).long()]

def reload_data(batch_sz):
    '''
    batch_sz: pass in the batch size
    return: data loader
    ##TODO: no test loader has been said yet
    '''        
    #print(len(training_wds))
    mimic_train = MIMICDataset(patient_feature_train, label_train, label_time_train)
    mimic_val = MIMICDataset(patient_feature_val, label_val, label_time_val)
    
    train_loader = torch.utils.data.DataLoader(dataset=mimic_train,batch_size=batch_sz,collate_fn=MIMIC_collate_func,shuffle=True)
    validation_loader = torch.utils.data.DataLoader(dataset=mimic_val,batch_size=batch_sz,collate_fn=MIMIC_collate_func,shuffle=True)
        
    return train_loader, validation_loader

In [4]:
def evaluation(data_loader, model, mode = 'novel'):
    '''
    to be filled. use loader for easy call for summarizing training acc
    '''
    pre_ones = []
    pre_zeros = []
    label_ones = []
    label_zeros = []
    model.eval()
    
    for data, label in data_loader:
        data, label = Variable(data), Variable(label)
        #label_scores = model(data, label) #I guess should be a mtrx, batch*class_num?
    
        hidden= model.init_hidden()
        if mode == 'novel':
            hidden, output = model(data, hidden)
            output = torch.stack(output, dim=1).view(-1, 2).data.numpy()
        else:
            output = model(data, hidden).data.numpy()
        #now get a list of hidden and a list of outputs
        label = label.transpose(0,1).contiguous().view(-1).data.numpy()
        
        
        #idx
        one_idx = np.where(label == 1)[0]
        zero_idx = np.where(label == 0)[0]
        keep_idx = np.concatenate((one_idx, zero_idx))
        
        label_one = list(label[one_idx])
        label_zero = list(label[zero_idx])
        output_one = list(np.argmax(np.array(output[one_idx]),axis = 1)) #now softmax, turn into class
        output_zero = list(np.argmax(np.array(output[zero_idx]),axis = 1))
        
        
        pre_ones.extend(output_one)
        pre_zeros.extend(output_zero)
        label_ones.extend(label_one)
        label_zeros.extend(label_zero)
        #print(len(pre_ones) == len(label_ones))
        #print(len(pre_zeros) == len(label_zeros))
        
    #target = list(np.array(pre) == np.array(pre))

        #print(one_idx)
    acc1 = sum(np.array(pre_ones) == np.array(label_ones))/len(pre_ones)
    acc0 = sum(np.array(pre_zeros) == np.array(label_zeros))/len(pre_zeros)
    acc =  (sum(np.array(pre_ones) == np.array(label_ones)) + sum(np.array(pre_zeros) == np.array(label_zeros)))/(len(pre_ones) + len(pre_zeros))
    model.train()
    return acc0, acc1,acc

### Configuration

In [63]:
batch_size = 20
num_epochs = 50
number_units = 7

### Data

In [6]:
training = pd.read_pickle('../MIMIC40/train_test_split/train_data.p')
val = pd.read_pickle('../MIMIC40/train_test_split/valid_data.p')

In [64]:
patient_feature_train, label_train, label_time_train = downsample(training, proportional = 2)

In [65]:
patient_feature_val = val[0] #before cleaning, original data
label_val = val[1]
label_time_val = val[2]

In [47]:
Counter(label_train), Counter(label_val)

(Counter({0: 3535, 1: 1765}), Counter({0: 4051, 1: 549}))

In [62]:
len(patient_feature_train)

5300

In [49]:
set(label_train[:3535])

{0}

### small_try

In [50]:
idx = np.random.randint(0, len(patient_feature_train))
patient_feature_train_sub, label_train_sub, label_time_train_sub = patient_feature_train.copy(), label_train.copy(), label_time_train.copy()
patient_feature_train, label_train, label_time_train = patient_feature_train_sub[idx], label_train_sub[idx], label_time_train_sub[idx]

In [35]:
#crop and make it easy to see the change of weight and other stuff
#y axis is 48 and x aixs is timestep
patient_feature_train = [patient_feature_train[i][:i+1, :5] for i in range(len(patient_feature_train))]

In [41]:
len(patient_feature_train), len(label_train), len(label_time_train)

(10, 10, 10)

In [51]:
label_train

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 1])

In [44]:
patient_feature_train

[array([[-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353]]),
 array([[-0.26679295, -0.06901753,  0.35713941,  1.71333182, -0.36760324],
        [-0.26679295, -0.06901753,  0.35713941,  1.71333182, -0.36760324]]),
 array([[-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353]]),
 array([[-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168,  0.37670353]]),
 array([[-0.26679295, -0.06901753,  0.35713941, -0.58368168, -1.11191001],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168, -1.11191001],
        [-0.26679295, -0.06901753,  0.35713941, -0.58368168, -1.11191001],
        [-0.26679

### Model

In [57]:
### Model
#forward
#cell_class, step
class Clock_NN(nn.Module):
    def __init__(self, scale,batch_size, group_size = 1, activation_fun =nn.Tanh, mean = 0, std = 1, input_dim = 48,mode = 'shift'):
        super(Clock_NN, self).__init__()
        '''
        scale: the updating frequency, a list. [1,2,4,8,16,32]
        batch_size: the size of batch
        group_size: the number of nodes in each scale, default is 1.
        activation_function
        mean: the mean of Gaussian distribution for initialize weights for hidden layer
        std: the standard devation of the Gaussian distribution for initialize weights for hidden layer
        input_dim: the feature dimension of each time step
        '''
        self.scale = scale
        self.group_size = group_size
        self.batch_size = batch_size
        self.mode = mode
        if mode == 'original':
            self.num_units = len(self.scale)*self.group_size
            self.index_li = {self.scale[i]: i for i in range(len(self.scale))}
        elif mode == 'shift':
            self.num_units = sum(self.scale)*self.group_size
            self.index_li = {i:i-1 for i in self.scale}
            
        self.class_dim = 2
        self.input_dim = input_dim
        self.linear_h = nn.Linear(self.num_units,self.num_units)
        self.linear_o = nn.Linear(self.num_units,self.class_dim)
        self.linear_i = nn.Linear(self.input_dim, self.num_units)
        self.activation_fun = activation_fun
        self.connect = torch.from_numpy(block_tri(self.group_size, self.scale, self.num_units, self.mode)).float()
        self.time_step = 0
        
        self.initial_weights(mean, std)
        
        #the connectivity, when we disabled the weight, this should not change
        self.linear_h.weight.data = self.linear_h.weight.data*self.connect#here needs transpose since previously left multiplication, activate mtrx doesn't need as rewrite and select cols.
        #self.linear_i.weight.data = self.linear_i.weight.data
        
    def forward(self, sequence, hidden):#depends on what passed for model.train(), to be filled)
        '''
        sequence: batch  x timestep x number_feature matrix
        hidden: should be h0
        '''     
        #sequence = sequence.view(48,-1)when this is only one batch
        hidden_output = []
        length = sequence.size()[1]
        logit = []
        for i in range(1,length+1):
            #print('this is the timestep ' + str(self.time_step))
            self.time_step += 1
            #backwards, want discharge/dead time aligns
            #print(sequence[:,:,-i].size())#would be batch*48
            hidden = self.CW_RNN_Cell(sequence[:,-i,:].contiguous(), hidden)
            hidden_output.append(hidden)#become batch_size x hidden_dim
            out = self.linear_o(hidden)
            logit.append(F.log_softmax(out))
        return hidden_output, logit
            
                
    def CW_RNN_Cell(self, x_input, hidden):
        '''
        x_input: number_feature x batch vector, representing one time stamp
        hidden: output of the last cell, should be hidden_dim(i.e. num_units) x batch
        '''
        #which time bloack to change
        activate = activate_index(self.time_step, self.num_units, self.group_size, self.scale,self.index_li,batch_size,self.mode, self.input_dim)
        activate_re = torch.from_numpy(np.ones((self.batch_size,self.num_units))).float() - activate

        hidden_next = self.linear_h(hidden) + self.linear_i(x_input) #should be batch_size x hidden_dim       
        hidden_next.data = activate*hidden_next.data + activate_re*hidden.data
        hidden_next = self.activation_fun(hidden_next)
        return hidden_next

    def init_hidden(self):
        h0 = Variable(torch.zeros(self.batch_size,self.num_units))
        return h0
        
    def initial_weights(self, mean, std):
        lin_layers = [self.linear_h, self.linear_o, self.linear_i]
        for layer in lin_layers:
            layer.weight.data.normal_(mean, std**2)
            layer.bias.data.fill_(0) 

In [71]:
### Training original
model = Clock_NN([1,2,4], batch_size, group_size = 1, activation_fun = F.relu, mean = 0, std = 0.1, input_dim = 48, mode = 'original')

loss = torch.nn.NLLLoss(ignore_index=-1)  
optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
accuracy_list = []
train_loader, validation_loader = reload_data(batch_size)
for epoch in range(1):
    for step, (data, label) in enumerate(train_loader):
        data, label = Variable(data), Variable(label)
        model.zero_grad()
        hidden= model.init_hidden()
        hidden, output = model(data, hidden)
        #now get a list of hidden and a list of outputs
        label = label.transpose(0,1).contiguous().view(-1) 
        #should be flatten, batch_size x hidden. transpose due to below order, was batch, seq => follow up 2 down. get size batch*seq          
        output = torch.stack(output, dim=1).view(-1, 2) 
        #print(output[-1])
        lossy = loss(output, label)
        lossy.backward()
        model.linear_h.weight.grad.data = model.linear_h.weight.grad.data*model.connect
        optimizer.step()
                #for now debugging, to be removed
    #print('----------------weight--------------------------------------------------------')
    #print(model.linear_h.weight.data)
    #print(model.linear_h.bias.data)
    print("Epoch: {}; Loss: {}".format(epoch, lossy.data[0]))
        #print('accuracy_on_training: {}'.format(evaluation(train_loader))) 
    acc0, acc1, val_acc = evaluation(validation_loader, model)
    print('accuracy_on_validation: {}, the acc for LIVE is {}, the acc for DEAD is {}'.format(val_acc, acc0, acc1)) 
    #accuracy_list.append(val_acc)
    

    '''
    if ((epoch > 5) and ((accuracy_list[-1] < (accuracy_list[-2] - 0.01)) or (accuracy_list[-1] < (accuracy_list[-3] - 0.01)))):
        print("early stop, accuracy = ", accuracy_list[-2])
        break
    '''
#              

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6950
-0.6913
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.FloatTensor of size 2]

Variable containing:
-0.6931
-0.6931
[torch.Floa

KeyboardInterrupt: 

### debug

In [None]:
### Model
#forward
#cell_class, step
class Clock_NN(nn.Module):
    def __init__(self, scale,batch_size, group_size = 1, activation_fun =nn.Tanh, mean = 0, std = 1, input_dim = 48,mode = 'shift'):
        super(Clock_NN, self).__init__()
        '''
        scale: the updating frequency, a list. [1,2,4,8,16,32]
        batch_size: the size of batch
        group_size: the number of nodes in each scale, default is 1.
        activation_function
        mean: the mean of Gaussian distribution for initialize weights for hidden layer
        std: the standard devation of the Gaussian distribution for initialize weights for hidden layer
        input_dim: the feature dimension of each time step
        '''
        self.scale = scale
        self.group_size = group_size
        self.batch_size = batch_size
        self.mode = mode
        if mode == 'original':
            self.num_units = len(self.scale)*self.group_size
            self.index_li = {self.scale[i]: i for i in range(len(self.scale))}
        elif mode == 'shift':
            self.num_units = sum(self.scale)*self.group_size
            self.index_li = {i:i-1 for i in self.scale}
            
        self.class_dim = 2
        self.input_dim = input_dim
        self.linear_h = nn.Linear(self.num_units,self.num_units)
        self.linear_o = nn.Linear(self.num_units,self.class_dim)
        self.linear_i = nn.Linear(self.input_dim, self.num_units)
        self.activation_fun = activation_fun
        self.connect = torch.from_numpy(block_tri(self.group_size, self.scale, self.num_units, self.mode)).float()
        self.time_step = 0
        print('----------------print connectivity-------------------------------------------------------')
        print(self.connect)
        
        
        self.initial_weights(mean, std)
        
        #the connectivity, when we disabled the weight, this should not change
        self.linear_h.weight.data = self.linear_h.weight.data*self.connect#here needs transpose since previously left multiplication, activate mtrx doesn't need as rewrite and select cols.
        #self.linear_i.weight.data = self.linear_i.weight.data
        
    def forward(self, sequence, hidden):#depends on what passed for model.train(), to be filled)
        '''
        sequence: batch  x timestep x number_feature matrix
        hidden: should be h0
        '''     
        #sequence = sequence.view(48,-1)when this is only one batch
        hidden_output = []
        length = sequence.size()[1]
        logit = []
        for i in range(1,length+1):
            print('this is the timestep ' + str(self.time_step))
            self.time_step += 1
            #backwards, want discharge/dead time aligns
            #print(sequence[:,:,-i].size())#would be batch*48
            hidden = self.CW_RNN_Cell(sequence[:,-i,:].contiguous(), hidden)
            hidden_output.append(hidden)#become batch_size x hidden_dim
            out = self.linear_o(hidden)
            print('---------------- linear layer weight-------------------------------')
            print(self.linear_o.weight.data)
            print('---------------- out for prediction linear_o(hidden from next step)-------------------------------')
            print(out[:,0] == out[:,1])#0 means false. test whether it cares about how many zeros are after this
            #print(F.softmax(out))
            print(hidden)
            print(out)
            print(F.log_softmax(out))
            #print(out.size())#should be batch x 2 (for one timestep)
            logit.append(F.log_softmax(out))
        return hidden_output, logit
            
                
    def CW_RNN_Cell(self, x_input, hidden):
        '''
        x_input: number_feature x batch vector, representing one time stamp
        hidden: output of the last cell, should be hidden_dim(i.e. num_units) x batch
        '''
        print('---------------- hidden step -------------------------------------------------------')
        print(hidden)

        print('---------------- x_input -------------------------------------------------------')
        print(x_input)
        
        #which time bloack to change
        activate = activate_index(self.time_step, self.num_units, self.group_size, self.scale,self.index_li,batch_size,self.mode, self.input_dim)
        activate_re = torch.from_numpy(np.ones((self.batch_size,self.num_units))).float() - activate
        
        #print('activate ' + str(activate.size()))
        #print('activate_re ' + str(activate_re.size()))
    
        
        print('----------------weight data for this step------------------------------------------------------')
        #print(hidden)
        print(self.linear_h.weight.data)
        
        print('----------------bias data for this step------------------------------------------------------')
        #print(hidden)
        print(self.linear_h.bias.data)        
        
        #print(self.linear_h.weight.data)
        #*activate.transpose(0,1)
        hidden_next = self.linear_h(hidden) + self.linear_i(x_input) #should be batch_size x hidden_dim
        
        print('----------------hidden_next data for this step------------------------------------------------------')
        #print(hidden)
        print(hidden_next) 
        
        hidden_next.data = activate*hidden_next.data + activate_re*hidden.data
        
        print('----------------hidden_next data for this step------------------------------------------------------')
        #print(hidden)
        print(hidden_next) 
        
        hidden_next = self.activation_fun(hidden_next)
        return hidden_next

    def init_hidden(self):
        h0 = Variable(torch.zeros(self.batch_size,self.num_units))
        return h0
        
    def initial_weights(self, mean, std):
        lin_layers = [self.linear_h, self.linear_o, self.linear_i]
        for layer in lin_layers:
            layer.weight.data.normal_(mean, std**2)
            layer.bias.data.fill_(0) 

In [57]:
### Training
model = Clock_NN([1,2,4], batch_size, group_size = 5, activation_fun = F.relu, mean = 0, std = 0.1, input_dim = 5)

loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
accuracy_list = []
train_loader, validation_loader = reload_data(batch_size)
for epoch in range(num_epochs):
    for step, (data, label) in enumerate(train_loader):        
        data, label = Variable(data), Variable(label)
        model.zero_grad()
        hidden= model.init_hidden()
        hidden, output = model(data, hidden)
        
        #now get a list of hidden and a list of outputs
        label = label.transpose(0,1).contiguous().view(-1) 
        #should be flatten, batch_size x hidden. transpose due to below order, was batch, seq => follow up 2 down. get size batch*seq
        output = torch.stack(output, dim=1).view(-1, 2) 
        #since batch is the first dimension, so dim is 1. now the order is  seq_len,batch, 2, so the first dimension is the first time step over all batch
        #print(output.size())
        #print(label.size())
        lossy = loss(output, label)
        lossy.backward()
        model.linear_h.weight.grad.data = model.linear_h.weight.grad.data*model.connect.transpose(0,1)
        #print(lossy.data[0])
        optimizer.step()
        
    #print("Epoch: {}; Loss: {}".format(epoch, lossy.data[0]))
        #print('accuracy_on_training: {}'.format(evaluation(train_loader))) 
    #acc0, acc1, val_acc = evaluation(validation_loader, model)
    #print('accuracy_on_validation: {}, the acc for LIVE is {}, the acc for DEAD is {}'.format(val_acc, acc0, acc1)) 
    #accuracy_list.append(val_acc)
    #if ((epoch > 5) and ((accuracy_list[-1] < (accuracy_list[-2] - 0.01)) or (accuracy_list[-1] < (accuracy_list[-3] - 0.01)))):
     #   print("early stop, accuracy = ", accuracy_list[-2])
      #  break
              

this is the timestep0
----------------hidden step-------------------------------------------------------
Variable containing:

Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 26 to 34 
    0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0
[torch.FloatTensor of size 2x35]

----------------weight data-------------------------------------------------------

1.00000e-02 *
 0.7191  0.5688 -0.4165  ...  -1.3283 -0.1450  1.1860
-2.0198 -0.3778  0.2340  ...   1.7644  0.0881 -0.3916
 0.0281  1.5014 -0.0142  ...  -1.0727 -0.0737 -0.6148
          ...             ⋱             ...          
-0.0000 -0.0000 -0.0000  ...   0.6179 -0.5646 -1.4

### Baseline

In [37]:
### Model
#forward
#cell_class, step
class LSTM_Model(nn.Module):
    def __init__(self, number_units, batch_size, mean = 0, std = 0.1, input_dim = 48):
        super(LSTM_Model, self).__init__()
        '''
        scale: the updating frequency, a list. [1,2,4,8,16,32]
        batch_size: the size of batch
        group_size: the number of nodes in each scale, default is 1.
        mean: the mean of Gaussian distribution for initialize weights for hidden layer
        std: the standard devation of the Gaussian distribution for initialize weights for hidden layer
        input_dim: the feature dimension of each time step
        '''
        self.hidden_dim = number_units
        self.input_dim = input_dim
        self.batch_size = batch_size

        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim)
        self.hidden2tag = nn.Linear(self.hidden_dim, 2)
        
    def forward(self, sequence, hidden):#depends on what passed for model.train(), to be filled)
        '''
        sequence: batch  x timestep x number_feature matrix
        hidden: should be h0
        ''' 
        sequence = sequence.transpose(0,1) #now sequence, batch, num_feature
        #print(sequence.size())
        #print(hidden[0].size())
        lstm_out, hidden = self.lstm(sequence, hidden)
        #print(lstm_out.size())
        tag_space = self.hidden2tag(lstm_out.view(len(sequence)*self.batch_size, -1))
        tag_scores = F.log_softmax(tag_space)       
        return tag_scores

    def init_hidden(self):
        return (Variable(torch.zeros(1, self.batch_size, self.hidden_dim)),
                Variable(torch.zeros(1, self.batch_size, self.hidden_dim)))


In [38]:
lstm = LSTM_Model(number_units, batch_size)
loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.05)
accuracy_list = []
train_loader, validation_loader = reload_data(batch_size)
for epoch in range(num_epochs):
    for step, (data, label) in enumerate(train_loader):        
        data, label = Variable(data), Variable(label)
        lstm.zero_grad()
        hidden= lstm.init_hidden()
        output = lstm(data, hidden)
        
        label = label.transpose(0,1).contiguous().view(-1) 
        #output = torch.stack(output, dim=1).view(-1, 2) 
        #print(output.size())
        #print(label.size())
        lossy = loss(output, label)
        lossy.backward()
        optimizer.step()
        #print('one_step')
        
    print("Epoch: {}; Loss: {}".format(epoch, lossy.data[0]))
        #print('accuracy_on_training: {}'.format(evaluation(train_loader))) 
    acc0, acc1, val_acc = evaluation(validation_loader, lstm,'lstm')
    print('accuracy_on_validation: {}, the acc for live is {}, the acc for dead is {}'.format(val_acc, acc0, acc1)) 
    accuracy_list.append(val_acc)
    if ((epoch > 5) and ((accuracy_list[-1] < (accuracy_list[-2] - 0.01)) or (accuracy_list[-1] < (accuracy_list[-3] - 0.01)))):
        print("early stop, accuracy = ", accuracy_list[-2])
        break            

Epoch: 0; Loss: 0.6022446155548096
accuracy_on_validation: 0.6940325906553337, the acc for live is 0.6891904162328386, the acc for dead is 0.7178153059404174
Epoch: 1; Loss: 0.6236343383789062
accuracy_on_validation: 0.6620379218427803, the acc for live is 0.6416953194992615, the acc for dead is 0.7619521912350598
Epoch: 2; Loss: 0.5533823370933533
accuracy_on_validation: 0.7540486848061158, the acc for live is 0.7730271919416936, the acc for dead is 0.6608342748409348
Epoch: 3; Loss: 0.5819407105445862
accuracy_on_validation: 0.6684806115777298, the acc for live is 0.6384204218019807, the acc for dead is 0.8161235654397336
Epoch: 4; Loss: 0.5296058058738708
accuracy_on_validation: 0.5877206658954887, the acc for live is 0.5252366885396741, the acc for dead is 0.8946155675804246
Epoch: 5; Loss: 0.490635484457016
accuracy_on_validation: 0.5926444701503797, the acc for live is 0.5316866025811763, the acc for dead is 0.8920437652375572
Epoch: 6; Loss: 0.4815255105495453
accuracy_on_valida

In [39]:
### Model
#forward
#cell_class, step
class GRU_Model(nn.Module):
    def __init__(self, number_units, batch_size, mean = 0, std = 0.1, input_dim = 48):
        super(GRU_Model, self).__init__()
        '''
        scale: the updating frequency, a list. [1,2,4,8,16,32]
        batch_size: the size of batch
        group_size: the number of nodes in each scale, default is 1.
        mean: the mean of Gaussian distribution for initialize weights for hidden layer
        std: the standard devation of the Gaussian distribution for initialize weights for hidden layer
        input_dim: the feature dimension of each time step
        '''
        self.hidden_dim = number_units
        self.input_dim = input_dim
        self.batch_size = batch_size

        self.gru = nn.GRU(self.input_dim, self.hidden_dim)
        self.hidden2tag = nn.Linear(self.hidden_dim, 2)
        
    def forward(self, sequence, hidden):#depends on what passed for model.train(), to be filled)
        '''
        sequence: batch  x timestep x number_feature matrix
        hidden: should be h0
        ''' 
        sequence = sequence.transpose(0,1)
        gru_out, hidden = self.gru(sequence, hidden)
        tag_space = self.hidden2tag(gru_out.view(len(sequence)*self.batch_size, -1))
        tag_scores = F.log_softmax(tag_space)       
        return tag_scores
    
    def init_hidden(self):
        return Variable(torch.zeros(1, self.batch_size, self.hidden_dim))

In [40]:
gru = GRU_Model(number_units, batch_size)
loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  
optimizer = torch.optim.Adam(gru.parameters(), lr=0.05)
accuracy_list = []
train_loader, validation_loader = reload_data(batch_size)

for epoch in range(num_epochs):
    for step, (data, label) in enumerate(train_loader):        
        data, label = Variable(data), Variable(label)
        gru.zero_grad()
        hidden= gru.init_hidden()
        output = gru(data, hidden)
        
        label = label.transpose(0,1).contiguous().view(-1) 
        lossy = loss(output, label)
        lossy.backward()
        optimizer.step()
        #print('one_step')
        
    print("Epoch: {}; Loss: {}".format(epoch, lossy.data[0]))
        #print('accuracy_on_training: {}'.format(evaluation(train_loader))) 
    acc0, acc1, val_acc = evaluation(validation_loader, gru,'gru')
    print('accuracy_on_validation: {}, the acc for live is {}, the acc for dead is {}'.format(val_acc, acc0, acc1)) 
    accuracy_list.append(val_acc)
    if ((epoch > 5) and ((accuracy_list[-1] < (accuracy_list[-2] - 0.01)) or (accuracy_list[-1] < (accuracy_list[-3] - 0.01)))):
        print("early stop, accuracy = ", accuracy_list[-2])
        break            

Epoch: 0; Loss: 0.7124496102333069
accuracy_on_validation: 0.6651435900015088, the acc for live is 0.6404362091091794, the acc for dead is 0.786495807813522
Epoch: 1; Loss: 0.6974120736122131
accuracy_on_validation: 0.502290901775386, the acc for live is 0.414852659870699, the acc for dead is 0.9317506095022894
Epoch: 2; Loss: 0.6752939224243164
accuracy_on_validation: 0.5331564653221345, the acc for live is 0.4545691178963171, the acc for dead is 0.9191443182493905
Epoch: 3; Loss: 0.5863159894943237
accuracy_on_validation: 0.6688779359251622, the acc for live is 0.6424641032470519, the acc for dead is 0.7986115240530416
Epoch: 4; Loss: 0.6798003315925598
accuracy_on_validation: 0.7993612633908364, the acc for live is 0.8632049202159858, the acc for dead is 0.48578819052149613
Epoch: 5; Loss: 0.525550127029419
accuracy_on_validation: 0.7118191419805864, the acc for live is 0.7197087096539868, the acc for dead is 0.673068918356425
Epoch: 6; Loss: 0.543253481388092
accuracy_on_validation