In [124]:
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt

# notebook settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 1000)

## Sample Prep

In [125]:
samples = pd.read_csv('../data/TCGA/gdc_sample_sheet.2019-12-09.tsv', sep="\t")
# get file type
samples['data'] = [val[1] for i,val in samples['File Name'].str.split(".").items()]
samples.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,data
0,c5b283b8-a6ab-4652-b824-18fe1cebe0e3,0bf53fc6-b8fb-4e6d-9297-4129c708f3da.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-50-5935,TCGA-50-5935-01A,Primary Tumor,FPKM
1,27079945-f3da-42b4-8c17-c82d66b7d321,f4a1e772-2a5e-4c0a-803a-1bdb2b376a47.FPKM-UQ.t...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-55-6971,TCGA-55-6971-11A,Solid Tissue Normal,FPKM-UQ
2,da368838-dad0-434d-a2a8-084b362e358e,12d2b9fa-1921-4033-bdb9-7e114c0d7812.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-99-8025,TCGA-99-8025-01A,Primary Tumor,FPKM
3,be46ee72-8958-483f-bd54-38d21ebf7ff6,4d36e127-5fad-4b97-afff-28f4bdbf5f5d.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-55-8301,TCGA-55-8301-01A,Primary Tumor,FPKM
4,c5f764bd-a9d4-4188-a4dc-a5e6ca1183f8,c57c5f57-5426-4662-832d-6f342ebeff04.FPKM-UQ.t...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-73-4677,TCGA-73-4677-01A,Primary Tumor,FPKM-UQ


Samples with RNAseq adjacent normal tissue

In [126]:
samples[samples['Sample Type']=='Solid Tissue Normal']['data'].value_counts()

htseq      59
FPKM-UQ    59
FPKM       59
Name: data, dtype: int64

In [127]:
# all cases with adjacent normal tissue
cases = samples[samples['Sample Type']=='Solid Tissue Normal']['Case ID']

In [128]:
# disparity in cases
samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Primary Tumor') & (samples['data']=='FPKM')]['Case ID'].nunique()
samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Solid Tissue Normal') & (samples['data']=='FPKM')]['Case ID'].nunique()

57

59

In [129]:
# divide, join, subset
case_tumor = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Primary Tumor') & (samples['data']=='FPKM')]
case_norm = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Solid Tissue Normal') & (samples['data']=='FPKM')]
cases = case_norm[case_norm['Case ID'].isin(case_tumor['Case ID'])]['Case ID']
cases.shape

(57,)

In [130]:
case_tumor = case_tumor[case_tumor['Case ID'].isin(cases)]
case_norm = case_norm[case_norm['Case ID'].isin(cases)]
cases = pd.concat([case_tumor, case_norm])
cases['Sample Type'] = cases['Sample Type'].astype('category')

In [131]:
case_tumor.shape
case_norm.shape
cases.shape

(67, 9)

(57, 9)

(124, 9)

## Dataset Prep

In [132]:
import os
from sklearn.model_selection import train_test_split

In [133]:
train, test = train_test_split(cases)
train['Sample Type'].value_counts()
test['Sample Type'].value_counts()

Primary Tumor          49
Solid Tissue Normal    44
Name: Sample Type, dtype: int64

Primary Tumor          18
Solid Tissue Normal    13
Name: Sample Type, dtype: int64

In [134]:
def load_tcga_rna(root_dir, samples):
    alt_dir = os.path.join(root_dir, "https:/api.gdc.cancer.gov/data/")
    df_list = []

    for fid,fname in zip(samples['File ID'], samples['File Name']):

        if os.path.exists(os.path.join(root_dir, fid, fname)):
            df_list.append(pd.read_csv(os.path.join(root_dir, fid, fname), sep="\t", index_col=0, header=None).T)

        elif os.path.exists(os.path.join(alt_dir, fid, fname)):
            df_list.append(pd.read_csv(os.path.join(alt_dir, fid, fname), sep="\t", index_col=0, header=None).T)

        else:
            print("{} not found".format(os.path.join(fid, fname)))
            break

    df = pd.concat(df_list)
    df.index = samples['Sample ID']

    return df

In [135]:
samples['Sample Type'].astype('category').cat.codes.to_numpy()

array([0, 2, 0, ..., 2, 2, 0], dtype=int8)

In [137]:
import torch
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

from trainer import fit
import numpy as np
cuda = torch.cuda.is_available()
print("Cuda is available: {}".format(cuda))

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

fashion_mnist_classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                         'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']
mnist_classes = fashion_mnist_classes

def plot_embeddings(embeddings, targets, xlim=None, ylim=None):
    plt.figure(figsize=(10,10))
    for i in range(10):
        inds = np.where(targets==i)[0]
        plt.scatter(embeddings[inds,0], embeddings[inds,1], alpha=0.5, color=colors[i])
    if xlim:
        plt.xlim(xlim[0], xlim[1])
    if ylim:
        plt.ylim(ylim[0], ylim[1])
    plt.legend(mnist_classes)

def extract_embeddings(dataloader, model):
    with torch.no_grad():
        model.eval()
        embeddings = np.zeros((len(dataloader), 2))
        labels = np.zeros(len(dataloader))
        k = 0
        for images, target in dataloader:
            if cuda:
                images = images.cuda()
            embeddings[k:k+len(images)] = model.get_embedding(images).data.cpu().numpy()
            labels[k:k+len(images)] = target.numpy()
            k += len(images)
    return embeddings, labels

Cuda is available: False


In [142]:
root_dir = "../data/TCGA"
siamese_train_dataset = SiameseTCGA(root_dir, train, True) # Returns pairs of images and target same/different
siamese_test_dataset = SiameseTCGA(root_dir, test, False)
batch_size = 5
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
siamese_train_loader = torch.utils.data.DataLoader(siamese_train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
siamese_test_loader = torch.utils.data.DataLoader(siamese_test_dataset, batch_size=batch_size, shuffle=False, **kwargs)

# Set up the network and training parameters
from tcga_networks import EmbeddingNet, SiameseNet
from losses import ContrastiveLoss
from metrics import AccumulatedAccuracyMetric

# Step 2
embedding_net = EmbeddingNet()
# Step 3
model = SiameseNet(embedding_net)
if cuda:
    model.cuda()
    
# Step 4
margin = 1.
loss_fn = ContrastiveLoss(margin)
lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer.step()
scheduler = lr_scheduler.StepLR(optimizer, 8, gamma=0.1, last_epoch=-1)
n_epochs = 3
# print training metrics every log_interval * batch_size
log_interval = 4

In [143]:
fit(siamese_train_loader, siamese_test_loader, model, loss_fn, optimizer, scheduler, 
    n_epochs, cuda, log_interval, metrics=[AccumulatedAccuracyMetric()])

Epoch: 1/3. Train set: Average loss: 42982919.9062	Accuracy: 53.763440860215056
Epoch: 1/3. Validation set: Average loss: 3902199.3214	Accuracy: 51.61290322580645
Epoch: 2/3. Train set: Average loss: 4134337.1562	Accuracy: 46.236559139784944
Epoch: 2/3. Validation set: Average loss: 760558.2768	Accuracy: 48.38709677419355
Epoch: 3/3. Train set: Average loss: 647847.8635	Accuracy: 39.784946236559136
Epoch: 3/3. Validation set: Average loss: 1383987.4643	Accuracy: 51.61290322580645


In [150]:
train_embeddings_cl, train_labels_cl = extract_embeddings(siamese_train_loader, model)
plot_embeddings(train_embeddings_cl, train_labels_cl)

AttributeError: 'list' object has no attribute 'dim'

In [170]:
for pair, target in siamese_train_loader:
    print(pair, target)
    break

[tensor([[2.0539e-02, 0.0000e+00, 4.0707e+00,  ..., 0.0000e+00, 6.4501e-01,
         0.0000e+00],
        [1.3074e-01, 6.0468e-03, 3.7802e+00,  ..., 0.0000e+00, 7.9834e-01,
         0.0000e+00],
        [3.7691e-01, 1.1207e-02, 3.6705e+00,  ..., 0.0000e+00, 7.0456e-01,
         0.0000e+00],
        [0.0000e+00, 1.2080e-02, 3.5799e+00,  ..., 0.0000e+00, 8.4532e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 2.0890e+00,  ..., 0.0000e+00, 4.1845e+00,
         0.0000e+00]]), tensor([[0.0949, 0.0066, 5.2819,  ..., 0.0000, 1.4907, 0.0000],
        [0.0451, 0.0000, 2.7615,  ..., 0.0000, 3.4349, 0.0000],
        [0.0000, 0.0058, 2.2528,  ..., 0.0000, 1.6044, 0.0000],
        [0.1307, 0.0060, 3.7802,  ..., 0.0000, 0.7983, 0.0000],
        [0.0000, 0.0000, 2.1704,  ..., 0.0000, 0.6467, 0.0000]])] tensor([1, 0, 0, 0, 1])


In [174]:
pair[0].shape

torch.Size([5, 60483])

In [164]:
model.embedding_net

EmbeddingNet(
  (fc): Sequential(
    (0): Linear(in_features=60483, out_features=20000, bias=True)
    (1): PReLU(num_parameters=1)
    (2): Linear(in_features=20000, out_features=2000, bias=True)
    (3): PReLU(num_parameters=1)
    (4): Linear(in_features=2000, out_features=2, bias=True)
  )
)

In [165]:
def extract_embeddings(dataloader, model):
    with torch.no_grad():
        model.eval()
        embeddings = np.zeros((len(dataloader.dataset), 2))
        labels = np.zeros(len(dataloader.dataset))
        k = 0
        for images, target in dataloader:
            if cuda:
                images = images.cuda()
            print(images.shape)
            print(images)
            #print(model.get_embedding(images).shape)    
            #print(model.get_embedding(images))
            break
            embeddings[k:k+len(images)] = model.get_embedding(images).data.cpu().numpy()
            labels[k:k+len(images)] = target.numpy()
            k += len(images)
    return embeddings, labels

In [166]:
train_embeddings_cl, train_labels_cl = extract_embeddings(siamese_train_loader, model)

AttributeError: 'list' object has no attribute 'shape'

In [167]:
for pair, target in siamese_train_dataset:
    print(pair, target)
    break

(tensor([7.5395e-02, 2.6153e-03, 3.3482e+00,  ..., 0.0000e+00, 9.4216e-01,
        0.0000e+00]), tensor([0.0000, 0.0121, 3.5799,  ..., 0.0000, 8.4532, 0.0000])) 0


In [None]:
val_embeddings_cl, val_labels_cl = extract_embeddings(siamese_test_dataset, model)
plot_embeddings(val_embeddings_cl, val_labels_cl)