# Import Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import gc
import glob
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, average_precision_score

import torch
import torch.nn as nn
from torch.autograd import Variable

import matplotlib.pyplot as plt

# Data Wrangling

In [30]:
##### path to data #####
path = '../data/raw/'

path_train_tdcsfog = glob.glob(path + 'train/tdcsfog/**')
path_train_defog = glob.glob(path + 'train/defog/**')

path_test = glob.glob(path + 'test/**/**')

In [31]:
##### load metadata #####
df_tdcsfog_metadata = pd.read_csv(path + 'tdcsfog_metadata.csv')
df_defog_metadata = pd.read_csv(path + 'defog_metadata.csv')

df_tdcsfog_metadata['Module'] = 'tdcsfog'; df_defog_metadata['Module'] = 'defog'

df_metadata = pd.concat([df_tdcsfog_metadata, df_defog_metadata])

# print(df_metadata.shape)
# df_metadata.head()

In [32]:
##### load subjects data #####
df_subjects = pd.read_csv(path + 'subjects.csv')

# print(df_subjects.shape)
# df_subjects.head()


##### process subjects data #####
df_subjects = df_subjects.fillna(0).groupby('Subject').median()
df_subjects = df_subjects.reset_index()
# df_subjects.rename(columns={'Subject':'Id'}, inplace=True)

df_subjects['s_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(df_subjects[df_subjects.columns[1:]])

df_subjects=df_subjects.rename(columns={'Visit':'s_Visit', 'Age':'s_Age', 'YearsSinceDx':'s_YearsSinceDx', 'UPDRSIII_On':'s_UPDRSIII_On', 
                                        'UPDRSIII_Off':'s_UPDRSIII_Off', 'NFOGQ':'s_NFOGQ'})

# print(df_subjects.shape)
# df_subjects.head()

In [33]:
##### load tasks data #####
df_tasks = pd.read_csv(path + 'tasks.csv')

# print(df_tasks.shape)
# df_tasks.head()


##### process tasks data #####
df_tasks['Duration'] = df_tasks['End'] - df_tasks['Begin']
df_tasks = pd.pivot_table(df_tasks, 
                       values=['Duration'], index=['Id'], columns=['Task'], aggfunc='sum', fill_value=0)
df_tasks.columns = [c[-1] for c in df_tasks.columns]
df_tasks = df_tasks.reset_index()

df_tasks['t_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(df_tasks[df_tasks.columns[1:]])

# print(df_tasks.shape)
# df_tasks.head()

In [34]:
##### merge metadata and subjects data #####
df_metadata_complex = df_metadata.merge(df_subjects, 
                                        how='left', on='Subject').copy()
df_metadata_complex['Medication'] = df_metadata_complex['Medication'].factorize()[0]

# print(df_metadata_complex.shape)
# df_metadata_complex.head()

In [35]:
def reader(file):
    try:
        df = pd.read_csv(file, 
                         usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn', 'Walking'])

        df['Id'] = file.split('/')[-1].split('.')[0]; df['Module'] = pathlib.Path(file).parts[-2]
        
        df['TimeFrac']=(df.index/df.index.max()).values

        df = pd.merge(df, df_tasks[['Id', 't_kmeans']], 
                      how='left', on='Id').fillna(-1)
        df = pd.merge(df, df_metadata_complex[['Id', 'Subject'] + ['Visit', 'Test', 'Medication', 's_kmeans']], 
                      how='left', on='Id').fillna(-1)

        return df
    except: 
        pass

In [8]:
##### load train data #####
df_train_tdcsfog = pd.concat([reader(file=file) for file in tqdm(path_train_tdcsfog)])
df_train_defog = pd.concat([reader(file=file) for file in tqdm(path_train_defog)])

df_train = pd.concat([df_train_tdcsfog, df_train_defog]).fillna(0)

# print(df_train.shape)
# df_train.head()

In [9]:
# del(df_train_tdcsfog); del(df_train_defog)
# gc.collect()

In [10]:
# cols = [c for c in df_train.columns if c not in ['Id', 'Subject', 'Module', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task', 'Event']]
# pcols = ['StartHesitation', 'Turn' , 'Walking']; scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']

print(df_train.shape)
df_train.head()

# Data Preparation

In [11]:
def create_sequences(X, y, window_size):
    
    list_X, list_y = list(), list()
    
    for i in range(len(X)):
        end_ix = i + window_size
        
        if end_ix >= len(X): break

        seq_x, seq_y = X[i:end_ix], y[end_ix - 1]      
        list_X.append(seq_x); list_y.append(seq_y)
            
    
    return np.array(list_X), np.array(list_y) 

In [36]:
def split_data(data, train_size, window_size, outcome, keep_columns):
    
    train_len = int(len(data) * train_size)
    validation_len = len(data) - train_len
    
    train = data[:train_len]; validation = data[train_len:]  
    
    y_train = train[outcome]; y_validation = validation[outcome]
    y_train, y_validation = y_train.to_numpy(), y_validation.to_numpy()
    
    X_train = train[keep_columns]; X_validation = validation[keep_columns]
    
    scaler = MinMaxScaler(feature_range=(-1, 1)) 
    X_train = scaler.fit_transform(X_train); X_validation = scaler.fit_transform(X_validation)

    # train = np.hstack((X_train, y_train)); validation = np.hstack((X_validation, y_validation))

    X_train, y_train = create_sequences(X=X_train, y=y_train, window_size=window_size)
    X_validation, y_validation = create_sequences(X=X_validation, y=y_validation, window_size=window_size)
    
    return X_train, X_validation, y_train, y_validation

In [37]:
def get_data(data, train_size, window_size, id_column, outcome, keep_columns):
    
    list_unique_id = list(data[id_column].unique())
    X_train, X_validation, y_train, y_validation = list(), list(), list(), list()

    for unique_id in tqdm(list_unique_id):
        data_id = data[data[id_column] == unique_id]

        X_train_id, X_validation_id, y_train_id, y_validation_id = split_data(data=data_id, 
                                                                              train_size=train_size, 
                                                                              window_size=window_size, 
                                                                              outcome=outcome, 
                                                                              keep_columns=keep_columns)

        X_train.append(X_train_id); X_validation.append(X_validation_id)
        y_train.append(y_train_id); y_validation.append(y_validation_id)
            
    
    X_train = np.vstack(X_train); X_validation = np.vstack(X_validation)
    y_train = np.vstack(y_train); y_validation = np.vstack(y_validation)  

    return torch.Tensor(X_train), torch.Tensor(X_validation), torch.Tensor(y_train), torch.Tensor(y_validation)

In [14]:
X_train, X_validation, y_train, y_validation = get_data(data=df_train,
                                                        train_size=0.75,
                                                        window_size=1,
                                                        id_column='Id', outcome = ['StartHesitation', 'Turn', 'Walking'], 
                                                        keep_columns=['AccV', 'AccML', 'AccAP']) 

In [15]:
# torch.save(X_train, '../data/processed/X_train.pt'); torch.save(y_train, '../data/processed/y_train.pt')
# torch.save(X_validation, '../data/processed/X_validation.pt'); torch.save(y_validation, '../data/processed/y_validation.pt')

In [16]:
# X_train = torch.load('../data/processed/X_train.pt'); y_train = torch.load('../data/processed/y_train.pt')
# X_validation = torch.load('../data/processed/X_validation.pt'); y_validation = torch.load('../data/processed/y_validation.pt')

In [17]:
# del(df_train)
gc.collect()

132

# Modeling

## Training

In [19]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 
print(device)

cpu


In [20]:
class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length, batch_size):
        super(LSTM, self).__init__()
        
        self.seq_length = seq_length
        self.num_layers = num_layers
        self.input_size = input_size
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        
        
    def forward(self, x):
        
        h_in = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        c_in = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        
        # propagate input through LSTM
        out, (h_out, c_out) = self.lstm(x, (h_in, c_in))
        
        h_out = h_out.view(-1, self.hidden_size)
        output = self.relu(h_out)        
        output = self.fc(h_out)
        
        return output

In [21]:
num_epochs = 50
learning_rate = 0.0001

input_size = 3; hidden_size = 32; num_classes = 3 
num_layers = 1 

In [22]:
lstm = LSTM(num_classes, input_size, hidden_size, num_layers, seq_length=1, batch_size=100000).to(device)

loss = nn.MSELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [23]:
for epoch in tqdm(range(num_epochs)):
    y_output = lstm.forward(X_train.to(device))
    
    optimizer.zero_grad()
    
    rmse_loss = torch.sqrt(loss(y_output, y_train.to(device)))
    rmse_loss.backward()
    
    optimizer.step()

    print('Epoch: %d, loss: %1.5f' % (epoch, rmse_loss.item()))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0, loss: 0.25151
Epoch: 1, loss: 0.25136
Epoch: 2, loss: 0.25120
Epoch: 3, loss: 0.25104
Epoch: 4, loss: 0.25088
Epoch: 5, loss: 0.25073
Epoch: 6, loss: 0.25057
Epoch: 7, loss: 0.25041
Epoch: 8, loss: 0.25026
Epoch: 9, loss: 0.25010


In [24]:
y_validation_proba = lstm(X_validation.to(device)) 

In [25]:
y_validation_proba = y_validation_proba.detach().cpu().numpy()
y_validation = y_validation.detach().cpu().numpy()

In [26]:
print(metrics.average_precision_score(y_validation, y_validation_proba.clip(0.0, 1.0)))

0.039122503150026224


## Prediction

In [45]:
def reader(file):
    try:
        df = pd.read_csv(file)
        df.set_index('Time', drop=True, inplace=True)

        df['Id'] = file.split('/')[-1].split('.')[0]
        df['TimeFrac'] = (df.index/df.index.max()).values

        df = pd.merge(df, df_tasks[['Id', 't_kmeans']], 
                      how='left', on='Id').fillna(-1)
        df = pd.merge(df, df_metadata_complex[['Id', 'Subject'] + ['Visit', 'Test', 'Medication', 's_kmeans']], 
                      how='left', on='Id').fillna(-1)

        return df
    except: 
        pass

    
df_test = pd.concat([reader(file=file) for file in tqdm(path_test)])

In [156]:
df_time = pd.DataFrame(df_test.index, columns=['Time'])
df_subject = pd.DataFrame(df_test.Subject, columns=['Subject'])

df_test = df_test[['AccV', 'AccML', 'AccAP']]

scaler = MinMaxScaler(feature_range=(-1, 1)) 
X_test = scaler.fit_transform(df_test)

In [158]:
def create_sequences(X, window_size):
    
    list_X = []
    
    for i in range(len(X)):
        end_ix = i + window_size
        
        if end_ix > len(X): break

        seq_x = X[i:end_ix]     
        list_X.append(seq_x)
            
    
    return np.array(list_X)


X_test = create_sequences(X=X_test, window_size=1)
X_test = torch.Tensor(X_test)

In [160]:
y_test_proba = lstm(X_test.to(device)) 
# y_test_proba = y_test_proba.clip(0.0, 1.0)

y_test_proba = y_test_proba.detach().cpu().numpy()
y_test_proba = pd.DataFrame(y_test_proba, columns=['StartHesitation', 'Turn', 'Walking'])

In [161]:
df_subject.reset_index(drop=True, inplace=True)
df_time.reset_index(drop=True, inplace=True)

df_time = df_time.astype(str)

In [163]:
Id = df_subject.Subject.str.cat(df_time.Time, sep='_')
y_test_proba.insert(0, 'Id', list(Id.values))

df_submission = y_test_proba

df_submission.to_csv('submission.csv', index=False)