In [1]:
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt

# notebook settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 1000)

# Input

In [2]:
samples = pd.read_csv('../data/TCGA/gdc_sample_sheet.2019-12-09.tsv', sep="\t")

# get file type
samples['data'] = [val[1] for i,val in samples['File Name'].str.split(".").items()]

# all cases with adjacent normal tissue
cases = samples[samples['Sample Type']=='Solid Tissue Normal']['Case ID']

# divide, join, subset
case_tumor = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Primary Tumor') & (samples['data']=='FPKM')]
case_norm = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Solid Tissue Normal') & (samples['data']=='FPKM')]
cases = case_norm[case_norm['Case ID'].isin(case_tumor['Case ID'])]['Case ID']
cases.shape

# combine
case_tumor = case_tumor[case_tumor['Case ID'].isin(cases)]
case_norm = case_norm[case_norm['Case ID'].isin(cases)]
cases = pd.concat([case_tumor, case_norm])
# convert target condition to categorical
cases['Sample Type'] = cases['Sample Type'].astype('category')

(57,)

# Model

In [45]:
import torch
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

from trainer import fit
import visualization as vis
import numpy as np
from sklearn.model_selection import train_test_split
cuda = torch.cuda.is_available()
print("Cuda is available: {}".format(cuda))


train, test = train_test_split(cases)
classes = train['Sample Type'].cat.categories.values

train['Sample Type'].value_counts()
test['Sample Type'].value_counts()

Cuda is available: True


Primary Tumor          50
Solid Tissue Normal    43
Name: Sample Type, dtype: int64

Primary Tumor          17
Solid Tissue Normal    14
Name: Sample Type, dtype: int64

In [46]:
from tcga_datasets import TCGA, SiameseTCGA
root_dir = "../data/TCGA"
batch_size = 1

train_dataset = TCGA(root_dir, samples=train, train=True)
test_dataset = TCGA(root_dir, samples=test, train=False)

kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, **kwargs)

In [47]:
# Step 1 set up dataloader
root_dir = "../data/TCGA"
siamese_train_dataset = SiameseTCGA(train_dataset) # Returns pairs of images and target same/different
siamese_test_dataset = SiameseTCGA(test_dataset)
batch_size = 10
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
siamese_train_loader = torch.utils.data.DataLoader(siamese_train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
siamese_test_loader = torch.utils.data.DataLoader(siamese_test_dataset, batch_size=batch_size, shuffle=False, **kwargs)

#### Balanced batch sampler testing

In [49]:
train.shape
test.shape

(93, 9)

(31, 9)

In [9]:
from datasets import BalancedBatchSampler

train_batch_sampler = BalancedBatchSampler(train_dataset.labels, n_classes=2, n_samples=61)
test_batch_sampler = BalancedBatchSampler(test_dataset.labels, n_classes=2, n_samples=25)

In [12]:
train_batch_sampler.n_dataset
train_batch_sampler.count

124

122

In [10]:
for targ in train_batch_sampler:
    targ

[12,
 44,
 15,
 1,
 28,
 3,
 8,
 29,
 20,
 22,
 24,
 26,
 37,
 5,
 36,
 19,
 63,
 57,
 41,
 10,
 14,
 17,
 42,
 45,
 7,
 52,
 31,
 46,
 43,
 59,
 55,
 9,
 39,
 48,
 50,
 53,
 21,
 33,
 6,
 65,
 61,
 23,
 56,
 62,
 27,
 25,
 13,
 66,
 34,
 38,
 30,
 51,
 47,
 18,
 4,
 49,
 32,
 54,
 64,
 58,
 16,
 85,
 81,
 77,
 78,
 109,
 99,
 75,
 123,
 110,
 91,
 98,
 67,
 112,
 73,
 96,
 82,
 105,
 104,
 68,
 114,
 86,
 119,
 111,
 101,
 115,
 97,
 74,
 87,
 83,
 84,
 69,
 102,
 120,
 107,
 90,
 79,
 122,
 76,
 100,
 95,
 113,
 117,
 93,
 88,
 108,
 106,
 71,
 103,
 72,
 118,
 80,
 92,
 89,
 121,
 70,
 94,
 116]