In [34]:
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt

# notebook settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 1000)

## Sample Prep

In [35]:
samples = pd.read_csv('../data/TCGA/gdc_sample_sheet.2019-12-09.tsv', sep="\t")
# get file type
samples['data'] = [val[1] for i,val in samples['File Name'].str.split(".").items()]
samples.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,data
0,c5b283b8-a6ab-4652-b824-18fe1cebe0e3,0bf53fc6-b8fb-4e6d-9297-4129c708f3da.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-50-5935,TCGA-50-5935-01A,Primary Tumor,FPKM
1,27079945-f3da-42b4-8c17-c82d66b7d321,f4a1e772-2a5e-4c0a-803a-1bdb2b376a47.FPKM-UQ.t...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-55-6971,TCGA-55-6971-11A,Solid Tissue Normal,FPKM-UQ
2,da368838-dad0-434d-a2a8-084b362e358e,12d2b9fa-1921-4033-bdb9-7e114c0d7812.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-99-8025,TCGA-99-8025-01A,Primary Tumor,FPKM
3,be46ee72-8958-483f-bd54-38d21ebf7ff6,4d36e127-5fad-4b97-afff-28f4bdbf5f5d.FPKM.txt.gz,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-55-8301,TCGA-55-8301-01A,Primary Tumor,FPKM
4,c5f764bd-a9d4-4188-a4dc-a5e6ca1183f8,c57c5f57-5426-4662-832d-6f342ebeff04.FPKM-UQ.t...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-73-4677,TCGA-73-4677-01A,Primary Tumor,FPKM-UQ


Samples with RNAseq adjacent normal tissue

In [36]:
samples[samples['Sample Type']=='Solid Tissue Normal']['data'].value_counts()

FPKM-UQ    59
htseq      59
FPKM       59
Name: data, dtype: int64

In [37]:
# all cases with adjacent normal tissue
cases = samples[samples['Sample Type']=='Solid Tissue Normal']['Case ID']

In [38]:
# disparity in cases
samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Primary Tumor') & (samples['data']=='FPKM')]['Case ID'].nunique()
samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Solid Tissue Normal') & (samples['data']=='FPKM')]['Case ID'].nunique()

57

59

In [39]:
# divide, join, subset
case_tumor = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Primary Tumor') & (samples['data']=='FPKM')]
case_norm = samples[(samples['Case ID'].isin(cases)) & (samples['Sample Type']=='Solid Tissue Normal') & (samples['data']=='FPKM')]
cases = case_norm[case_norm['Case ID'].isin(case_tumor['Case ID'])]['Case ID']
cases.shape

(57,)

In [40]:
case_tumor = case_tumor[case_tumor['Case ID'].isin(cases)]
case_norm = case_norm[case_norm['Case ID'].isin(cases)]
cases = pd.concat([case_tumor, case_norm])
cases['Sample Type'] = cases['Sample Type'].astype('category')

In [41]:
case_tumor.shape
case_norm.shape
cases.shape

(67, 9)

(57, 9)

(124, 9)

## Dataset Prep

In [42]:
import os
from sklearn.model_selection import train_test_split

In [43]:
train, test = train_test_split(cases)
train['Sample Type'].value_counts()
test['Sample Type'].value_counts()

Primary Tumor          49
Solid Tissue Normal    44
Name: Sample Type, dtype: int64

Primary Tumor          18
Solid Tissue Normal    13
Name: Sample Type, dtype: int64

In [44]:
def load_tcga_rna(root_dir, samples):
    alt_dir = os.path.join(root_dir, "https:/api.gdc.cancer.gov/data/")
    df_list = []

    for fid,fname in zip(samples['File ID'], samples['File Name']):

        if os.path.exists(os.path.join(root_dir, fid, fname)):
            df_list.append(pd.read_csv(os.path.join(root_dir, fid, fname), sep="\t", index_col=0, header=None).T)

        elif os.path.exists(os.path.join(alt_dir, fid, fname)):
            df_list.append(pd.read_csv(os.path.join(alt_dir, fid, fname), sep="\t", index_col=0, header=None).T)

        else:
            print("{} not found".format(os.path.join(fid, fname)))
            break

    df = pd.concat(df_list)
    df.index = samples['Sample ID']

    return df

In [45]:
from torch.utils.data import Dataset

class SiameseTCGA(Dataset):
    """
    Train: For each sample creates randomly a positive or a negative pair
    Test: Creates fixed pairs for testing
    """

    def __init__(self,  root_dir, samples, train):
        self.root_dir = root_dir
        self.samples = samples
        self.train = train

        if self.train:
            self.train_labels = self.samples['Sample Type'].cat.codes.to_numpy()
            self.train_data = torch.from_numpy(self.load_tcga_rna(self.root_dir, self.samples).values).float()
            self.labels_set = set(self.train_labels)
            self.label_to_indices = {label: np.where(self.train_labels == label)[0]
                                     for label in self.labels_set}
        else:
            # generate fixed pairs for testing
            self.test_labels = self.samples['Sample Type'].cat.codes.to_numpy()
            self.test_data = torch.from_numpy(self.load_tcga_rna(self.root_dir, self.samples).values).float()
            self.labels_set = set(self.test_labels)
            self.label_to_indices = {label: np.where(self.test_labels == label)[0]
                                     for label in self.labels_set}

            random_state = np.random.RandomState(29)

            positive_pairs = [[i,
                               random_state.choice(self.label_to_indices[self.test_labels[i].item()]),
                               1]
                              for i in range(0, len(self.test_data), 2)]

            negative_pairs = [[i,
                               random_state.choice(self.label_to_indices[
                                                       np.random.choice(
                                                           list(self.labels_set - set([self.test_labels[i].item()]))
                                                       )
                                                   ]),
                               0]
                              for i in range(1, len(self.test_data), 2)]
            self.test_pairs = positive_pairs + negative_pairs

    def __getitem__(self, index):
        if self.train:
            target = np.random.randint(0, 2)
            img1, label1 = self.train_data[index], self.train_labels[index].item()
            if target == 1:
                siamese_index = index
                while siamese_index == index:
                    siamese_index = np.random.choice(self.label_to_indices[label1])
            else:
                siamese_label = np.random.choice(list(self.labels_set - set([label1])))
                siamese_index = np.random.choice(self.label_to_indices[siamese_label])
            img2 = self.train_data[siamese_index]
        else:
            img1 = self.test_data[self.test_pairs[index][0]]
            img2 = self.test_data[self.test_pairs[index][1]]
            target = self.test_pairs[index][2]
        
        return (img1, img2), target

    def __len__(self):
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)
    
    def load_tcga_rna(self, root_dir, samples):
        alt_dir = os.path.join(root_dir, "https:/api.gdc.cancer.gov/data/")
        df_list = []

        for fid,fname in zip(samples['File ID'], samples['File Name']):

            if os.path.exists(os.path.join(root_dir, fid, fname)):
                df_list.append(pd.read_csv(os.path.join(root_dir, fid, fname), sep="\t", index_col=0, header=None).T)

            elif os.path.exists(os.path.join(alt_dir, fid, fname)):
                df_list.append(pd.read_csv(os.path.join(alt_dir, fid, fname), sep="\t", index_col=0, header=None).T)

            else:
                print("{} not found".format(os.path.join(fid, fname)))
                break

        df = pd.concat(df_list)
        df.index = samples['Sample ID']

        return df

In [46]:
import torch
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

from trainer import fit
import numpy as np
cuda = torch.cuda.is_available()

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

fashion_mnist_classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                         'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']
mnist_classes = fashion_mnist_classes

def plot_embeddings(embeddings, targets, xlim=None, ylim=None):
    plt.figure(figsize=(10,10))
    for i in range(10):
        inds = np.where(targets==i)[0]
        plt.scatter(embeddings[inds,0], embeddings[inds,1], alpha=0.5, color=colors[i])
    if xlim:
        plt.xlim(xlim[0], xlim[1])
    if ylim:
        plt.ylim(ylim[0], ylim[1])
    plt.legend(mnist_classes)

def extract_embeddings(dataloader, model):
    with torch.no_grad():
        model.eval()
        embeddings = np.zeros((len(dataloader), 2))
        labels = np.zeros(len(dataloader))
        k = 0
        for images, target in dataloader:
            if cuda:
                images = images.cuda()
            embeddings[k:k+len(images)] = model.get_embedding(images).data.cpu().numpy()
            labels[k:k+len(images)] = target.numpy()
            k += len(images)
    return embeddings, labels

In [None]:
root_dir = "../data/TCGA"
siamese_train_dataset = SiameseTCGA(root_dir, train, True) # Returns pairs of images and target same/different
siamese_test_dataset = SiameseTCGA(root_dir, test, False)
batch_size = 10
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
siamese_train_loader = torch.utils.data.DataLoader(siamese_train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
siamese_test_loader = torch.utils.data.DataLoader(siamese_test_dataset, batch_size=batch_size, shuffle=False, **kwargs)

# Set up the network and training parameters
from tcga_networks import EmbeddingNet, SiameseNet
from losses import ContrastiveLoss

# Step 2
embedding_net = EmbeddingNet()
# Step 3
model = SiameseNet(embedding_net)
if cuda:
    model.cuda()
    
# Step 4
margin = 1.
loss_fn = ContrastiveLoss(margin)
lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.StepLR(optimizer, 8, gamma=0.1, last_epoch=-1)
n_epochs = 1
log_interval = 10

In [None]:
fit(siamese_train_loader, siamese_test_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, log_interval)

In [None]:
train_embeddings_cl, train_labels_cl = extract_embeddings(siamese_train_dataset, model)
plot_embeddings(train_embeddings_cl, train_labels_cl)

In [None]:
for pair, target in siamese_train_dataset:
    print(pair, target)

In [None]:
val_embeddings_cl, val_labels_cl = extract_embeddings(siamese_test_dataset, model)
plot_embeddings(val_embeddings_cl, val_labels_cl)