In [1]:
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt

# Notebook display settings.
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Allow wide DataFrames to render up to 1000 columns before pandas truncates them.
pd.options.display.max_columns = 1000

## Sample Prep

In [2]:
# Load the GDC sample sheet (tab-separated).
samples = pd.read_csv('../data/TCGA/rna-seq_pan/meta/gdc_sample_sheet.2019-12-12.tsv', sep="\t")

# Second dot-separated token of the file name, kept as the file's data type.
samples['data'] = samples['File Name'].str.split(".").str[1]
# Second dash-separated token of the project ID (the part after the first "-").
samples['project'] = samples['Project ID'].str.split("-").str[1]

# The vectorised `.str[1]` yields NaN (instead of raising) when a value has no
# second token, so fail loudly here rather than silently dropping rows later.
assert samples[['data', 'project']].notna().all().all(), \
    "unexpected 'File Name' / 'Project ID' format in sample sheet"

# Number of sheet rows per project.
samples['project'].value_counts()

BRCA    1206
LUAD     588
UCEC     567
KIRC     554
LUSC     543
LGG      524
PRAD     517
COAD     506
THCA     505
SKCM     469
BLCA     431
LIHC     421
STAD     402
OV       379
KIRP     308
CESC     306
PAAD     171
ESCA     171
GBM      166
TGCT     150
PCPG     133
LAML     123
KICH      81
ACC       75
CHOL      41
SARC      10
DLBC       9
READ       7
MESO       1
Name: project, dtype: int64

In [3]:
# Tally how many sample-sheet rows fall under each sample type.
sample_type_column = samples['Sample Type']
sample_type_column.value_counts()

Primary Tumor                                      8166
Solid Tissue Normal                                 636
Metastatic                                          385
Primary Blood Derived Cancer - Peripheral Blood     123
Recurrent Tumor                                      43
Additional - New Primary                             10
Additional Metastatic                                 1
Name: Sample Type, dtype: int64

In [4]:
# Per-project row counts for primary-tumor and solid-tissue-normal samples.
is_tumor = samples['Sample Type'] == 'Primary Tumor'
is_normal = samples['Sample Type'] == 'Solid Tissue Normal'
disease = samples.loc[is_tumor, 'project'].value_counts()
healthy = samples.loc[is_normal, 'project'].value_counts()

# Inner-join on project so only projects with both tumor and normal samples remain.
# The two count Series are renamed explicitly for the merge: the name that
# `value_counts` gives its result differs between pandas versions, so relying on
# the automatic `_x` / `_y` suffixes makes the `project_y` lookup below fragile.
overlap = pd.merge(
    disease.rename('project_x'),
    healthy.rename('project_y'),
    left_index=True,
    right_index=True,
)

# Rank projects by number of normal samples (sorted once, reused for both uses).
overlap_by_normal = overlap.sort_values(by=['project_y'], ascending=False)
overlap_by_normal

# The ten projects with the most normal samples.
proj = overlap_by_normal.head(10).index

Unnamed: 0,project_x,project_y
BRCA,1087,112
KIRC,483,70
LUAD,528,58
THCA,445,53
PRAD,466,50
LIHC,369,49
LUSC,496,47
COAD,465,39
STAD,370,32
KIRP,276,31


Subset healthy samples: keep only the Solid Tissue Normal samples that belong to the ten projects selected above.

In [5]:
# Fixed seed so the shuffled row order (and everything derived from it) is
# reproducible after Restart & Run All.
SHUFFLE_SEED = 0

# All solid-tissue-normal rows, in a reproducibly shuffled order.
normal_shuffled = (
    samples[samples['Sample Type'] == 'Solid Tissue Normal']
    .sample(frac=1, random_state=SHUFFLE_SEED)
)
normal_shuffled.shape

# Keep only the selected projects. The copy is taken after the final filter so
# that `cases` is an independent frame: columns are assigned on it later, and
# assigning on a filtered slice triggers pandas' SettingWithCopyWarning.
cases = normal_shuffled[normal_shuffled['project'].isin(proj)].copy()
cases['project'].value_counts()
cases.shape

(636, 10)

BRCA    112
KIRC     70
LUAD     58
THCA     53
PRAD     50
LIHC     49
LUSC     47
COAD     39
STAD     32
KIRP     31
Name: project, dtype: int64

(541, 10)

## Dataset Prep

In [6]:
from sklearn.model_selection import train_test_split

# Column used as the classification label.
target = 'project'
# Categorical dtype so the class list can be read from `.cat.categories` later.
cases[target] = cases[target].astype('category')

# Seeded so the split is reproducible; stratified because class sizes are
# imbalanced (31 to 112 samples per project), which keeps every project
# represented proportionally in both train and test.
train, test = train_test_split(cases, random_state=0, stratify=cases[target])

In [7]:
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

from trainer import fit
import visualization as vis
import numpy as np

# Always define `cuda`: later cells branch on it (`if cuda`), so assigning it
# only inside the GPU branch raised NameError on CPU-only machines.
cuda = torch.cuda.is_available()
if cuda:
    print("{} GPUs available".format(torch.cuda.device_count()))

# Ten hex colours (the same values as matplotlib's default "tab10" cycle);
# presumably one per class -- not referenced in the visible cells.
colors = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
]

# Class labels, taken from the categories of the categorical target column.
target_categories = train[target].cat.categories
classes = target_categories.values


8 GPUs available


In [8]:
from tcga_datasets import TCGA, SiameseTCGA
# Location of the TCGA RNA-seq data handed to the dataset class.
root_dir = "../data/TCGA/rna-seq_pan/"
# One sample per batch for these plain (non-paired) loaders.
batch_size = 1

# Build the train / test datasets from the corresponding sample-sheet splits.
train_dataset = TCGA(root_dir, samples=train, train=True, target=target)
test_dataset = TCGA(root_dir, samples=test, train=False, target=target)
print('Loaded')

# Worker / pinned-memory options are only passed when running on GPU.
if cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

# Both loaders iterate in dataset order (no shuffling).
loader_options = dict(batch_size=batch_size, shuffle=False, **kwargs)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, **loader_options)

Loaded


## Siamese Network

In [9]:
# Step 1: wrap the datasets for pairwise training and build their loaders.
batch_size = 256
if cuda:
    kwargs = {'num_workers': 10, 'pin_memory': True}
else:
    kwargs = {}

# SiameseTCGA presumably yields pairs of samples plus a same/different target --
# confirm against tcga_datasets.
siamese_train_dataset = SiameseTCGA(train_dataset)
siamese_test_dataset = SiameseTCGA(test_dataset)

# Only the training pairs are shuffled; the test loader keeps dataset order.
siamese_train_loader = torch.utils.data.DataLoader(
    siamese_train_dataset, batch_size=batch_size, shuffle=True, **kwargs
)
siamese_test_loader = torch.utils.data.DataLoader(
    siamese_test_dataset, batch_size=batch_size, shuffle=False, **kwargs
)

# Set up the network and training parameters
from tcga_networks import EmbeddingNet, SiameseNet
from losses import ContrastiveLoss
from metrics import AccumulatedAccuracyMetric

# Step 2: network that embeds a single sample.
embedding_net = EmbeddingNet()

# Step 3: siamese wrapper around the embedding network.
model = SiameseNet(embedding_net)
if cuda:
    # Replicate across the visible GPUs and move the parameters onto the device.
    # NOTE(review): the saved output of this cell is
    # "RuntimeError: CUDA error: out of memory" -- presumably raised here; confirm
    # which device was full before re-running.
    model = nn.DataParallel(model).cuda()

# Step 4: loss, optimiser and learning-rate schedule.
margin = 1.0
lr = 1e-3
n_epochs = 10
# Logging frequency passed to `fit`; presumably counted in batches -- confirm in trainer.
log_interval = 4

loss_fn = ContrastiveLoss(margin)
optimizer = optim.Adam(model.parameters(), lr=lr)
# Scale the learning rate by 0.1 after every 8 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1, last_epoch=-1)

RuntimeError: CUDA error: out of memory

In [None]:
# Train the siamese model; `fit` hands back the training and validation losses.
train_loss, val_loss = fit(
    siamese_train_loader,
    siamese_test_loader,
    model,
    loss_fn,
    optimizer,
    scheduler,
    n_epochs,
    cuda,
    log_interval,
)

In [None]:
def plot_loss_curves(train_loss, val_loss, n_epochs):
    """Plot training and validation loss against epoch index on one labelled axes.

    Parameters
    ----------
    train_loss, val_loss : sequences with one value per epoch (length `n_epochs`).
    n_epochs : int
        Number of epochs; the x axis runs over ``range(0, n_epochs)``.
    """
    epochs = range(0, n_epochs)
    fig, ax = plt.subplots()
    # Same markers/colours as before; labels make the two curves distinguishable.
    ax.plot(epochs, train_loss, 'rx-', label='train')
    ax.plot(epochs, val_loss, 'bx-', label='validation')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss')
    ax.set_title('Siamese network: training vs. validation loss')
    ax.legend()
    plt.show()


plot_loss_curves(train_loss, val_loss, n_epochs)

In [None]:
# Embed the training samples with the current model and plot them by class.
train_extraction = vis.extract_embeddings(train_loader, model)
train_embeddings_cl, train_labels_cl = train_extraction
vis.plot_embeddings(train_embeddings_cl, train_labels_cl, classes)

In [None]:
# Embed the held-out test samples with the same model and plot them by class.
val_extraction = vis.extract_embeddings(test_loader, model)
val_embeddings_baseline, val_labels_baseline = val_extraction
vis.plot_embeddings(val_embeddings_baseline, val_labels_baseline, classes)