## Training a multi-task learning model on cancer microarray data

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Directory that the train/valid/test CSV files are read from.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH = Path("/data2/yinterian/microarray/")

In [3]:
# Load the training split and preview its first rows; per the displayed header
# the frame ends with a `target` column followed by an `index` column.
train = pd.read_csv(PATH/"train_all_problems.csv")
train.head()

Unnamed: 0,PSME1,CISD1,SPDEF,ATF1,RHEB,IGF1R,FOXO3,GSTM2,RHOA,IL1B,...,CDKN2A,ATP11B,CD320,CDKN1B,MLLT11,CEBPZ,GAPDH,CBR3,target,index
0,0.4,0.3,0.1,0.3,0.9,0.1,0.1,0.6,0.2,0.9,...,0.1,0.6,0.3,0.5,0.8,0.7,0.4,0.3,0,0
1,0.6,0.3,0.1,0.3,0.9,0.2,0.1,0.6,0.3,0.8,...,0.1,0.6,0.9,0.2,0.4,0.9,0.5,0.8,0,0
2,0.4,0.3,0.1,0.3,0.8,0.2,0.1,0.7,0.3,0.7,...,0.1,0.6,0.8,0.2,0.5,0.9,0.3,0.8,0,0
3,0.3,0.4,0.1,0.4,0.7,0.2,0.1,0.6,0.3,0.6,...,0.1,0.5,0.8,0.3,0.7,1.0,0.3,0.6,0,0
4,0.6,0.2,0.1,0.3,0.9,0.1,0.1,0.6,0.3,0.9,...,0.1,0.7,0.2,0.6,0.8,1.0,0.6,0.6,0,0


In [4]:
train.tail()

Unnamed: 0,PSME1,CISD1,SPDEF,ATF1,RHEB,IGF1R,FOXO3,GSTM2,RHOA,IL1B,...,CDKN2A,ATP11B,CD320,CDKN1B,MLLT11,CEBPZ,GAPDH,CBR3,target,index
32061,0.5,0.9,0.6,1.0,0.4,0.7,0.2,0.2,0.8,0.1,...,0.6,0.6,0.3,0.6,1.0,0.9,0.9,0.2,1,551
32062,0.6,0.7,0.5,0.9,0.7,0.3,1.0,0.5,0.5,1.0,...,0.8,0.3,0.1,0.7,1.0,1.0,0.4,1.0,1,551
32063,0.4,0.3,0.6,0.9,0.8,0.2,0.8,0.7,0.4,1.0,...,0.7,0.7,0.9,0.7,0.1,0.9,0.9,0.9,1,551
32064,0.6,0.9,0.5,0.8,0.7,0.5,0.5,0.9,0.7,1.0,...,0.4,0.5,1.0,0.3,1.0,0.8,0.7,0.9,1,551
32065,0.6,0.2,0.7,0.8,0.7,0.3,0.9,0.2,0.9,0.1,...,0.4,0.8,1.0,0.4,1.0,0.2,0.9,0.7,1,551


In [5]:
# Load the validation and test splits from the same directory as the training data.
# `test` is not referenced again in the cells shown here.
valid = pd.read_csv(PATH/"valid_all_problems.csv")
test = pd.read_csv(PATH/"test_all_problems.csv")

In [6]:
train.shape

(32066, 980)

In [7]:
valid.shape, test.shape

((3725, 980), (3725, 980))

In [8]:
def nn_model(h=100, K=552, n_features=978):
    """Build the shared network used for all prediction problems.

    The network is Linear -> ReLU -> BatchNorm1d -> Linear and emits `K` raw
    logits per input row; downstream code picks one logit per row using the
    row's `index` value.

    Args:
        h: width of the hidden layer.
        K: number of output logits.
        n_features: number of input features per row.

    Returns:
        An `nn.Sequential` model (on CPU, in training mode).

    The defaults reproduce the previously hard-coded 978 -> 100 -> 552 layout;
    earlier versions accepted `h` and `K` but silently ignored them.
    """
    return nn.Sequential(
        nn.Linear(n_features, h),
        nn.ReLU(),
        nn.BatchNorm1d(h),
        nn.Linear(h, K))

In [9]:
class MicroarrayData(Dataset):
    """Positional view over one data split stored in a DataFrame.

    The frame is sliced purely by column position: the first 978 columns
    become the feature matrix `X`, column 978 becomes `Y` and column 979
    becomes `index` (per the displayed header these last two are the
    `target` and `index` columns).

    NOTE(review): the displayed frames start with an `Unnamed: 0` column; if
    that is a saved row counter it falls inside the feature slice -- confirm.
    """

    def __init__(self, df):
        n_inputs = 978
        feature_cols = df.iloc[:, :n_inputs]
        label_col = df.iloc[:, n_inputs]
        index_col = df.iloc[:, n_inputs + 1]
        self.X = feature_cols.values
        self.Y = label_col.values
        self.index = index_col.values

    def __len__(self):
        # One label per row, so the label vector's length is the dataset size.
        return len(self.Y)

    def __getitem__(self, index):
        # Return the row as a (features, label, index value) triple.
        sample = (self.X[index], self.Y[index], self.index[index])
        return sample

In [10]:
# Wrap the training and validation frames as datasets that yield
# (features, target, index) triples.
train_ds = MicroarrayData(train)
valid_ds = MicroarrayData(valid)

In [11]:
# Mini-batch loaders: training batches are shuffled, validation order is kept fixed.
batch_size = 50
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

In [12]:
# Pull a single training mini-batch; the next cells run it through a fresh model.
x, y, ind = next(iter(train_dl))

In [13]:
# Forward the sample batch through a freshly initialised (CPU) model.
model = nn_model()
y_hat = model(x.float())
# For each row keep only the logit in column ind[row]; a positive logit means class 1.
row_ids = torch.arange(len(ind))
pred = (y_hat[row_ids, ind] > 0).int()

In [14]:
# Number of correct predictions in the sample batch before any training
# (the recorded output is 21 with batch_size = 50).
(pred == y).sum()

tensor(21)

In [15]:
def valid_loss(model, train_dl):
    """Compute mean loss and accuracy of `model` over a data loader.

    Args:
        model: network producing one logit per column; for every row only the
            logit in column `ind[row]` is scored against the row's target.
        train_dl: iterable of `(x, y, ind)` batches to evaluate on. The name is
            historical (kept for backward compatibility); callers pass the
            validation loader. Earlier versions ignored this argument and read
            the global `valid_dl` instead.

    Returns:
        `(loss, accuracy)` as Python floats: the per-sample mean binary
        cross-entropy (with logits) and the fraction of rows whose thresholded
        logit (> 0) equals the target. Both are NaN for an empty loader.

    Side effects:
        Leaves `model` in eval mode; callers that continue training must call
        `model.train()` again.
    """
    model.eval()
    # Evaluate on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for x, y, ind in train_dl:
            x = x.float().to(device)
            y = y.float().to(device)
            ind = ind.long().to(device)
            y_hat = model(x)
            # Select, for every row, the single logit addressed by `ind`.
            logits = y_hat.gather(1, ind.view(-1, 1)).squeeze(1)
            # Accumulate per-sample sums so a short final batch is not over-weighted.
            loss_sum += F.binary_cross_entropy_with_logits(
                logits, y, reduction="sum").item()
            correct += ((logits > 0).float() == y).sum().item()
            total += y.shape[0]

    if total == 0:
        return float("nan"), float("nan")
    return loss_sum / total, correct / total

In [16]:
def train_epocs(model, train_dl, valid_dl, epochs=10, lr=0.01):
    """Train `model` with Adam and print train/validation metrics once per epoch.

    Args:
        model: network producing one logit per column; for every row only the
            logit in column `ind[row]` contributes to the loss.
        train_dl: iterable of `(x, y, ind)` training batches.
        valid_dl: loader handed to `valid_loss` after every epoch.
        epochs: number of passes over `train_dl`.
        lr: Adam learning rate (weight decay is fixed at 1e-5).

    Returns:
        None. One line with train loss, validation loss and validation
        accuracy is printed per epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    # Train on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    for _ in range(epochs):
        model.train()
        losses = []
        for x, y, ind in train_dl:
            x = x.float().to(device)
            y = y.float().to(device)
            ind = ind.long().to(device)
            y_hat = model(x)
            # Select, for every row, the single logit addressed by `ind`;
            # the default 'mean' reduction matches averaging per-row losses.
            logits = y_hat.gather(1, ind.view(-1, 1)).squeeze(1)
            loss = F.binary_cross_entropy_with_logits(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        # Validate once per epoch, after the batch loop. Validating inside the
        # loop is very slow and, because validation switches the model to eval
        # mode, would leave BatchNorm frozen for the rest of the epoch.
        val_loss, val_acc = valid_loss(model, valid_dl)
        train_loss = np.mean(losses) if losses else float("nan")
        print("train loss %.3f valid loss %.3f valid acc %.3f" %
              (train_loss, val_loss, val_acc))

In [None]:
# Train a fresh model on the GPU for 20 epochs.
# NOTE(review): this cell has no execution count or output, i.e. it was not run in the saved notebook.
model = nn_model().cuda()
train_epocs(model, train_dl, valid_dl, epochs=20, lr=0.01)

In [79]:
# Train a fresh model on the GPU for up to 50 epochs.
# NOTE(review): the recorded output is stale -- the execution count (79) is out of
# sequence, the log lines lack the "valid acc" field that train_epocs prints, and
# the run ended in a KeyboardInterrupt. In that log the validation loss bottoms out
# around 0.42-0.44 and then rises while the train loss keeps falling.
model = nn_model().cuda()
train_epocs(model, train_dl, valid_dl, epochs=50, lr=0.01)

train loss 0.700 valid loss 0.618
train loss 0.606 valid loss 0.594
train loss 0.566 valid loss 0.557
train loss 0.527 valid loss 0.536
train loss 0.496 valid loss 0.511
train loss 0.468 valid loss 0.505
train loss 0.439 valid loss 0.486
train loss 0.414 valid loss 0.473
train loss 0.401 valid loss 0.475
train loss 0.413 valid loss 0.498
train loss 0.404 valid loss 0.477
train loss 0.376 valid loss 0.446
train loss 0.369 valid loss 0.441
train loss 0.340 valid loss 0.436
train loss 0.338 valid loss 0.446
train loss 0.346 valid loss 0.454
train loss 0.325 valid loss 0.424
train loss 0.309 valid loss 0.432
train loss 0.289 valid loss 0.436
train loss 0.289 valid loss 0.435
train loss 0.273 valid loss 0.439
train loss 0.263 valid loss 0.437
train loss 0.259 valid loss 0.442
train loss 0.263 valid loss 0.431
train loss 0.242 valid loss 0.438
train loss 0.239 valid loss 0.461
train loss 0.265 valid loss 0.460
train loss 0.250 valid loss 0.475
train loss 0.341 valid loss 0.568
train loss 0.5

KeyboardInterrupt: 