In [43]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.autograd import Function
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from time import time
import matplotlib.pyplot as plt
import pickle as pkl
import pandas as pd
from collections import Counter
from sklearn.metrics import r2_score

% matplotlib inline

In [2]:
# Read the training CSV first, then the test CSV, into DataFrames.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ('path/to/trainset', 'path/to/testset')
)

In [3]:
# Names of the 45 DataFrame columns that are selected as model input
# (used via `frame[observ_cols]` by the training / evaluation helpers).
observ_cols = [
    'gender', 'age', 'elixhauser', 're_admission', 'SOFA',
    'SIRS', 'Weight_kg', 'GCS', 'HR', 'SysBP',
    'MeanBP', 'DiaBP', 'RR', 'SpO2', 'Temp_C',
    'FiO2_1', 'Potassium', 'Sodium', 'Chloride', 'Glucose',
    'BUN', 'Creatinine', 'Magnesium', 'Calcium', 'Ionised_Ca',
    'CO2_mEqL', 'SGOT', 'SGPT', 'Total_bili', 'Albumin',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT',
    'INR', 'Arterial_pH', 'paO2', 'paCO2', 'Arterial_BE',
    'Arterial_lactate', 'HCO3', 'PaO2_FiO2', 'output_total', 'output_4hourly',
]

In [28]:
# sparsity constraint
class L1Penalty(Function):
    """Identity operation whose backward pass injects an L1 sparsity gradient.

    Forward returns ``input`` unchanged. Backward adds
    ``l1weight * sign(input)`` to the incoming gradient, which is the
    (sub)gradient of ``l1weight * |input|.sum()``. The penalty therefore
    affects the parameter updates without appearing in the loss value.

    Use as ``L1Penalty.apply(tensor, l1weight)``.
    """

    @staticmethod
    def forward(ctx, input, l1weight):
        """Stash ``input`` and the penalty weight for backward; pass ``input`` through."""
        ctx.save_for_backward(input)
        ctx.l1weight = l1weight
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output + l1weight * sign(input)`` for ``input`` and ``None`` for ``l1weight``."""
        input, = ctx.saved_tensors
        # The weight lives on ctx: a staticmethod has no `self` to read it from.
        grad_input = grad_output + input.sign() * ctx.l1weight
        # One gradient per forward() argument; the scalar weight is not differentiable.
        return grad_input, None

In [72]:
class SparseAutoEncoder(nn.Module):
    """Autoencoder with two linear encoder layers, two linear decoder layers
    and an L1 penalty applied to the encoded representation.

    Args:
        input_size: width of the input (and of the reconstruction).
        hidden_size: width of the intermediate layers and of the code.
        l1: weight handed to ``L1Penalty`` for the code.
    """

    def __init__(self, input_size, hidden_size, l1=1e-5):
        super().__init__()

        self.input_size, self.hidden_size = input_size, hidden_size

        # Encoder: input -> hidden -> hidden.
        self.encoder1 = nn.Linear(input_size, hidden_size)
        self.encoder2 = nn.Linear(hidden_size, hidden_size)
        # Decoder: hidden -> hidden -> input.
        self.decoder1 = nn.Linear(hidden_size, hidden_size)
        self.decoder2 = nn.Linear(hidden_size, input_size)

        self.l1weight = l1

    def forward(self, x):
        """Return ``(code, reconstruction)`` for input ``x``."""
        hidden = F.relu(self.encoder1(x))

        # The penalty is an identity in the forward direction; it only
        # contributes a sparsity term to the gradient of the code.
        code = L1Penalty.apply(self.encoder2(hidden), self.l1weight)

        # No non-linearity between the two decoder layers; the final
        # sigmoid bounds the reconstruction to (0, 1).
        reconstruction = F.sigmoid(self.decoder2(self.decoder1(code)))

        return code, reconstruction

In [17]:
def do_eval(eval_set, autoencoder, output_embeddings=False):
    """Run the autoencoder over a whole frame and score the reconstruction.

    Args:
        eval_set: DataFrame containing every column listed in ``observ_cols``.
        autoencoder: callable returning ``(encoded, decoded)`` for a float
            tensor of shape (rows, 1, features).
        output_embeddings: when True, skip scoring and return the encoded
            representation instead.

    Returns:
        If ``output_embeddings`` is True, a numpy array of the encoded rows
        (the singleton axis at position 1 removed). Otherwise a tuple
        ``(mse_loss, r2)`` where ``mse_loss`` is a Python float and ``r2`` is
        the variance-weighted R^2 score of the reconstruction.
    """
    enc_X = torch.FloatTensor(eval_set[observ_cols].values)

    # Evaluation never back-propagates, so do not record an autograd graph
    # over the full dataset.
    with torch.no_grad():
        encoded, decoded = autoencoder(enc_X.unsqueeze(1))

    if output_embeddings:
        return encoded.squeeze(1).numpy()

    # Drop the singleton axis before comparing with the 2-D target; otherwise
    # (rows, 1, features) broadcasts against (rows, features) into a
    # (rows, rows, features) difference and the loss is wrong.
    decoded = decoded.squeeze(1)

    eval_enc_loss = torch.nn.MSELoss()(decoded, enc_X)
    eval_enc_acc = r2_score(enc_X.numpy(), decoded.numpy(),
                            multioutput='variance_weighted')

    # .item() extracts the Python number from the 0-dim loss tensor.
    return eval_enc_loss.item(), eval_enc_acc

In [63]:
def train_autoencoder(train_set, test_set, autoencoder, lr=0.001, batch_size=128,
                      num_epoch=50, print_every=500, val=False):
    """Train ``autoencoder`` to reconstruct the ``observ_cols`` features.

    Args:
        train_set: DataFrame with the ``observ_cols`` columns; 10% of its rows
            are held out (fixed ``random_state=42``) as a validation split.
        test_set: DataFrame scored with ``do_eval`` every 10th epoch.
        autoencoder: module returning ``(encoded, decoded)``; updated in place.
        lr: Adam learning rate.
        batch_size: rows per optimisation step. Rows are visited in the same
            order every epoch and a trailing partial batch is dropped.
        num_epoch: number of passes over the training split.
        print_every: log loss / R^2 every this many batches (batch 0 excluded).
        val: when True, score the held-out split with ``do_eval`` after each
            epoch.

    Returns:
        None. Progress is reported with ``print``.
    """
    enc_criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=lr)

    # Split the frame rather than the raw array: the selected rows are the
    # same, but the held-out part keeps its column names so it can be handed
    # to do_eval for validation.
    train_df, val_df = train_test_split(train_set[observ_cols], test_size=0.1, random_state=42)
    train_x = train_df.values
    num_batches = train_x.shape[0] // batch_size

    for epoch in range(1, num_epoch + 1):

        for batch in range(num_batches):

            batch_train_x = train_x[batch * batch_size: (batch + 1) * batch_size]
            enc_X = torch.FloatTensor(batch_train_x)
            _, decoded = autoencoder(enc_X.unsqueeze(1))

            # Remove the singleton axis so prediction and target have the same
            # shape; (B, 1, D) vs (B, D) would otherwise broadcast to (B, B, D).
            decoded = decoded.squeeze(1)
            enc_loss = enc_criterion(decoded, enc_X)

            if batch != 0 and batch % print_every == 0:
                # R^2 is only needed for the log line, so compute it here
                # instead of on every batch.
                enc_acc = r2_score(enc_X.numpy(), decoded.detach().numpy(),
                                   multioutput='variance_weighted')
                print ('epoch:{}/{}, batch:{}/{}, loss:{}, enc_acc:{}'.format(epoch, num_epoch, batch,
                                                                              num_batches, enc_loss.item(),
                                                                              enc_acc))
            optimizer.zero_grad()
            enc_loss.backward()
            optimizer.step()

        if val:
            print ('-----------------------')
            print ('evaluating ...')
            # do_eval already returns the loss as a plain number.
            val_total_loss, val_enc_acc = do_eval(val_df, autoencoder)
            print ('Validating: loss:{}, enc_acc:{}'.format(val_total_loss, val_enc_acc))
            print ('-----------------------')

        if epoch % 10 == 0:
            print ('Testing ...')
            eval_enc_loss, eval_enc_acc = do_eval(test_set, autoencoder)
            print ('eval loss:', eval_enc_loss, 'eval acc:', eval_enc_acc)
            print ('-----------------------')

In [64]:
# Derive the input width from the feature list (45 columns) so the model
# cannot drift out of sync if observ_cols is edited.
autoencoder = SparseAutoEncoder(len(observ_cols), 128)

In [1]:
# 55 epochs in total
# NOTE(review): the total above presumably counts earlier runs of this cell in
# the same kernel - confirm. On a fresh kernel this call trains for 5 epochs.
train_autoencoder(
    train_set=train_set,
    test_set=test_set,
    autoencoder=autoencoder,
    lr=1e-4,
    num_epoch=5,
)

In [68]:
# Encode the training frame; with output_embeddings=True do_eval returns the
# encoded rows as a numpy array instead of the (loss, R^2) pair.
train_embeddings = do_eval(
    eval_set=train_set,
    autoencoder=autoencoder,
    output_embeddings=True,
)

In [69]:
# Encode the test frame; with output_embeddings=True do_eval returns the
# encoded rows as a numpy array instead of the (loss, R^2) pair.
test_embeddings = do_eval(
    eval_set=test_set,
    autoencoder=autoencoder,
    output_embeddings=True,
)

In [75]:
# Use a context manager so the file is flushed and closed even if dump raises.
with open('train_embeddings_sparse_encoded.pkl', 'wb') as train_embeddings_file:
    pkl.dump(train_embeddings, train_embeddings_file)

In [76]:
# Use a context manager so the file is flushed and closed even if dump raises.
with open('test_embeddings_sparse_encoded.pkl', 'wb') as test_embeddings_file:
    pkl.dump(test_embeddings, test_embeddings_file)