# Load required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from train_utils import batchify_data, run_epoch, train_model, Flatten

In [3]:
np.random.seed(12321)  # for reproducibility
torch.manual_seed(12321)  # for reproducibility

<torch._C.Generator at 0x192c2ede8b0>

# Load Data

In [4]:
path="D:/A-Themed/0-NYU/2020 Fall/kaggle-MoA/1-Dataset/lish-moa/" # Matyas T450s

In [5]:
train_df = pd.read_csv(path+'train_features.csv',index_col=['sig_id'])
train_target_df = pd.read_csv(path+'train_targets_scored.csv',index_col=['sig_id'])

In [6]:
test_df = pd.read_csv(path+'test_features.csv',index_col=['sig_id'])

# Prepocessing

In [7]:
def preprocess(df):
    df = df.copy()
    df['cp_type'] = df['cp_type'].map({'trt_cp':0,'ctl_vehicle':1})
    df['cp_dose'] = df['cp_dose'].map({'D1':0, 'D2':1})
    df['cp_time'] = df['cp_time'].map({24:0,48:1,72:2})
    return df

train = preprocess(train_df)
test = preprocess(test_df)

In [None]:
# (TODO) what is this used for
train_target_df = train_target_df.loc[train['cp_type']==0]
train = train.loc[train['cp_type']==0]
train.drop(columns=['cp_type'], inplace=True)

test = test.loc[test['cp_type']==0]
test.drop(columns=['cp_type'], inplace=True)

In [8]:
train = train.merge(train_target_df, how='inner', left_index=True, right_index=True)

In [9]:
# Split into train and test
train_proportion = 0.75

shuffle_df = train.sample(frac=1, random_state=1)
ratio = int(len(shuffle_df)*train_proportion)
Xtrain = shuffle_df.iloc[:ratio]
Xtest = shuffle_df.iloc[ratio:]
print(f"{Xtrain.shape}, {Xtest.shape}")

(17860, 1081), (5954, 1081)


In [10]:
ytrain = Xtrain.loc[:,'5-alpha_reductase_inhibitor':]
ytest = Xtest.loc[:,'5-alpha_reductase_inhibitor':]
Xtrain = Xtrain.loc[:,:'5-alpha_reductase_inhibitor'].iloc[:,:-1]
Xtest = Xtest.loc[:,:'5-alpha_reductase_inhibitor'].iloc[:,:-1]

In [12]:
ytest.head()

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_978310d04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_8926ec9a9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_5b78f9971,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
id_52de11ba0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_f8cf3c427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Baseline NN model

In [None]:
# batch_size = 64
# nb_classes = 10
# nb_epoch = 30
# num_classes = 10
# img_rows, img_cols = 42, 28 # input image dimensions

In [11]:
X_train = Xtrain.to_numpy()
y_train = ytrain.to_numpy()
X_test = Xtest.to_numpy()
y_test = ytest.to_numpy()


In [12]:
# Split into train and dev
dev_split_index = int(9 * len(X_train) / 10)

X_dev = X_train[dev_split_index:]
y_dev = y_train[dev_split_index:]
X_train = X_train[:dev_split_index]
y_train = y_train[:dev_split_index]


In [18]:

# (TODO) Stucks all the time
permutation = np.array([i for i in range(len(X_train))])
np.random.shuffle(permutation) # needs to be np array instead of pandas dataframe
X_train = [X_train[i] for i in permutation]
y_train = [y_train[i] for i in permutation]

In [19]:
print(X_train)

[[ 1.      2.      1.     ...  0.0796  0.2755  1.919 ]
 [ 0.      0.      0.     ... -0.8075 -0.6839 -0.6308]
 [ 0.      0.      0.     ... -5.888  -3.445  -2.788 ]
 ...
 [ 0.      0.      0.     ...  0.1576  0.4419  0.4157]
 [ 0.      0.      1.     ... -1.074   0.9589 -0.099 ]
 [ 0.      0.      0.     ...  0.027   0.7192 -1.2   ]]


In [13]:
## Model specification TODO
model = nn.Sequential(
            nn.Linear(875, 128),#10 representations
            nn.ReLU(),
        #   torch.nn.LeakyReLU(negative_slope = 0.01),
            nn.Linear(128, 10),
        )
lr=0.1 # 0.1 in baseline
momentum=0 # 0 in baseline

In [20]:
from train_utils import batchify_data, run_epoch, train_model, Flatten

In [14]:
# Split dataset into batches
batch_size = 32 # 32 in baseline
train_batches = batchify_data(X_train, y_train, batch_size)
dev_batches = batchify_data(X_dev, y_dev, batch_size)
test_batches = batchify_data(X_test, y_test, batch_size)


In [23]:
print(train_batches[0]["y"].shape)

torch.Size([32, 206])


In [25]:
out1=model(train_batches[0]['x'])
print(out1.shape)

torch.Size([32, 10])


In [22]:
train_model(train_batches, dev_batches, model, lr=lr, momentum=momentum)

  0%|          | 0/502 [00:00<?, ?it/s]-------------
Epoch 1:




IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 205)

In [None]:
## Evaluate the model on test data
    loss, accuracy = run_epoch(test_batches, model.eval(), None)

    print ("Loss on test set:"  + str(loss) + " Accuracy on test set: " + str(accuracy))