In [10]:
%load_ext autoreload
%autoreload 2

In [11]:
import os
import sys
sys.path.append("..")

import datetime
import pathlib

from collections import OrderedDict 

import numpy as np
import pandas as pd

In [12]:
# Pytorch
import torch
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

# Custom
from dutils import Experiment
from trainer import fit
import visualization as vis
from tcga_datasets import SiameseDataset

# Models
from tcga_networks import EmbeddingNet, SiameseNet
from losses import ContrastiveLoss

# Metrics
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score as ANMI

import dask

In [13]:
def getTCGA(disease):
    """
    Build the paths of the per-disease TCGA count matrices.

    Parameters
    ----------
    disease : str or iterable of str
        Disease code(s) substituted into the file name pattern, e.g.
        ``['BRCA', 'LUAD']``. A single code given as a bare string is treated
        as one disease instead of being iterated character by character.

    Returns
    -------
    list of str
        One ``TCGA_<disease>_counts.tsv.gz`` path per requested disease, in
        input order. The paths are only formatted, not checked for existence.
    """
    path = "/srv/nas/mk2/projects/pan-cancer/TCGA_CCLE_GCP/TCGA/TCGA_{}_counts.tsv.gz"
    # A str is itself iterable; without this guard 'BRCA' would yield four
    # paths (one per letter).
    if isinstance(disease, str):
        disease = [disease]
    return [path.format(d) for d in disease]


def readGCP(files, biotype='protein_coding', mean=True):
    """
    Read count matrices and return one transposed, biotype-filtered frame per file.

    Parameters
    ----------
    files : iterable of str
        Paths to tab-separated count matrices. The first column of each file
        is used as the index and must hold '|'-delimited transcript
        identifiers ending in a trailing '|' (eight fields, see ``fields``
        below). The second '_'-separated token of the file name becomes the
        dictionary key.
    biotype : str
        Value of the BIOTYPE field to keep; rows with any other value are
        dropped.
    mean : bool
        If True, rows sharing the same GENE name are averaged.

    Returns
    -------
    dict
        Maps each file key to a DataFrame whose rows are the columns of the
        input file and whose columns are GENE names.

    Raises
    ------
    KeyError
        If ``biotype`` does not occur in a file's BIOTYPE field.
    """
    fields = ['ENST', 'ENSG', 'OTTHUMG', 'OTTHUMT', 'GENE-NUM', 'GENE', 'BP', 'BIOTYPE']
    data_dict = {}
    for f in files:
        key = os.path.basename(f).split("_")[1]
        data = pd.read_csv(f, sep='\t', index_col=0)
        # transcript metadata; row[:-1] drops the empty string produced by
        # the trailing '|' of every identifier
        meta = pd.DataFrame([row[:-1] for row in data.index.str.split("|")],
                            columns=fields)
        data.index = pd.MultiIndex.from_frame(meta)
        # subset transcripts
        data = data.xs(key=biotype, level='BIOTYPE')
        data = data.droplevel(['ENST', 'ENSG', 'OTTHUMG', 'OTTHUMT', 'GENE-NUM', 'BP'])
        # average gene expression of splice variants; grouping on the index
        # before transposing avoids the deprecated groupby(axis=1)
        if mean:
            data = data.groupby(level='GENE').mean()
        data_dict[key] = data.T
    return data_dict


def uq_norm(df, q=0.75):
    """
    Scale each row of ``df`` by that row's own q-th quantile.

    With the default ``q=0.75`` this is upper quartile normalization: the
    quantile is taken across the columns of every row, and the row is then
    divided by it.
    """
    row_scale = df.quantile(q=q, axis=1)
    return df.divide(row_scale, axis=0)


def process_TCGA(disease=['BRCA', 'LUAD', 'KIRC', 'THCA', 'PRAD', 'SKCM']):
    """
    Load, merge and normalize the TCGA count matrices for ``disease``.

    The per-disease frames returned by ``readGCP`` are concatenated row-wise,
    upper-quartile normalized with ``uq_norm`` and transformed with
    ``log1p``.

    Returns
    -------
    tuple
        ``(tcga, tcga_meta)``: the normalized expression frame and the
        metadata table read from ``TCGA/TCGA_GDC_ID_MAP.tsv``.
    """
    base = "/srv/nas/mk2/projects/pan-cancer/TCGA_CCLE_GCP"
    # Resolve the count files for the requested diseases
    count_files = getTCGA(disease)
    # Metadata table, then the count matrices themselves
    sample_meta = pd.read_csv(os.path.join(base, "TCGA/TCGA_GDC_ID_MAP.tsv"), sep="\t")
    per_disease = readGCP(count_files, mean=True)
    # Stack all diseases into a single frame
    combined = pd.concat(per_disease.values())
    # Upper quartile normalization followed by log(1 + x)
    log_expr = uq_norm(combined).transform(np.log1p)
    return log_expr, sample_meta

In [14]:
def generate_fsets(data, n_features, steps=5, seed=None):
    """
    Draw random column-index subsets of increasing size.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Only ``data.shape[1]`` (the number of columns) is used.
    n_features : int
        Size of the largest subset.
    steps : int
        Number of subsets. Sizes are ``steps`` evenly spaced values from 1 to
        ``n_features``, truncated to integers.
    seed : int or None
        If None (default) the global ``np.random`` state is used, exactly as
        before. Otherwise a dedicated ``RandomState(seed)`` makes the draw
        reproducible without touching the global state.

    Returns
    -------
    list of numpy.ndarray
        One array of distinct column positions per subset size.

    Raises
    ------
    ValueError
        If a requested subset is larger than the number of columns.
    """
    n_cols = data.shape[1]
    sizes = np.linspace(start=1, stop=n_features, num=steps, dtype='int')
    # Sampling without replacement cannot return more indices than exist;
    # say so explicitly instead of relying on numpy's generic message.
    if sizes.size and sizes.max() > n_cols:
        raise ValueError(
            "Cannot draw {} features from data with only {} columns".format(sizes.max(), n_cols))
    rng = np.random if seed is None else np.random.RandomState(seed)
    return [rng.choice(n_cols, size=k, replace=False) for k in sizes]

In [15]:
def init_model(n_features, embedding_dim, margin, lr, device):
    """
    Create a Siamese network on a CUDA device with its optimizer and scheduler.

    ``margin`` is accepted but not used inside this function. The optimizer is
    Adam with learning rate ``lr``; the scheduler is a StepLR with step size 8
    and gamma 0.1.

    Returns
    -------
    tuple
        ``(model, optimizer, scheduler)``.
    """
    model = SiameseNet(EmbeddingNet(n_features, embedding_dim))
    model.cuda(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1, last_epoch=-1)
    return model, optimizer, scheduler

In [16]:
def cluster_eval(test_embeddings, test_labels, n_clusters):
    """
    Score how well k-means clusters of the embeddings agree with the labels.

    Fits ``KMeans`` with ``n_clusters`` clusters on ``test_embeddings`` and
    returns the adjusted mutual information between the predicted cluster
    assignments and ``test_labels``.
    """
    predicted = KMeans(n_clusters=n_clusters).fit_predict(test_embeddings)
    return ANMI(predicted, test_labels)

In [17]:
def feature_training(train_data, train_labels, test_data, test_labels, 
                     feature_idx, exp_dir, 
                     devices, embedding_dim, n_epochs, cuda=True):
    """
    Train and evaluate one Siamese model per device for every feature subset.

    For each entry of ``feature_idx`` the train/test frames are restricted to
    those columns, one model per entry of ``devices`` is built with
    ``init_model`` and trained with ``fit`` through dask, test embeddings are
    extracted, and each model is scored with ``cluster_eval``.

    Parameters
    ----------
    train_data, test_data
        Frames supporting ``.iloc[:, feat]`` column selection.
    train_labels, test_labels
        Labels handed to ``SiameseDataset``.
    feature_idx : iterable
        Collections of column positions; one experiment is run per entry.
    exp_dir : str
        Existing directory that receives the pickled results.
    devices : iterable
        CUDA device identifiers; one model is trained per device.
    embedding_dim : int
        Embedding size passed to ``init_model``.
    n_epochs : int
        Number of epochs passed to ``fit``.
    cuda : bool
        Must be true; ``init_model`` always places the model on a CUDA device.

    Side effects
    ------------
    Writes ``model_<len(feat)>.pkl`` for every feature subset (keys
    ``'feature_idx'`` and one ``'model_<i>'`` entry per device holding that
    model's embeddings, ``fit`` result and ANMI) and a final
    ``model_meta_data.pkl`` with one row per (feature subset, model).

    Raises
    ------
    RuntimeError
        If ``cuda`` is false.
    """
    if not cuda:
        # Previously this path crashed later with an unbound `models` variable.
        raise RuntimeError("feature_training requires CUDA: init_model places every model on a CUDA device")

    # Meta data
    meta_data = {"n_features":[],
                 "model":[],
                 "ANMI":[]}
    # Params
    batch_size = 8
    margin = 1.
    lr = 1e-3
    loader_kwargs = {'num_workers': 10, 'pin_memory': True}
    
    # Feature Index
    for feat in feature_idx:
        print("Number features: {}\n".format(len(feat)))
        exp_data = {'feature_idx':feat}
        # Define data
        siamese_train_dataset = SiameseDataset(data=train_data.iloc[:,feat],
                                               labels=train_labels,
                                               train=True)
        siamese_test_dataset = SiameseDataset(data=test_data.iloc[:,feat],
                                              labels=test_labels,
                                              train=False)
        # Loaders
        siamese_train_loader = torch.utils.data.DataLoader(siamese_train_dataset, batch_size=batch_size, shuffle=True, **loader_kwargs)
        siamese_test_loader = torch.utils.data.DataLoader(siamese_test_dataset, batch_size=batch_size, shuffle=False, **loader_kwargs)
        # Model input width is taken from the dataset's own training matrix
        _, n_features = siamese_train_dataset.train_data.shape
        loss_fn = ContrastiveLoss(margin)
        log_interval = round(len(siamese_train_dataset)/batch_size)
        
        # One (model, optimizer, scheduler) triple per device
        models = [init_model(n_features,
                             embedding_dim,
                             margin=margin, 
                             lr=lr, 
                             device=d) for d in devices]
        # Training
        dlosses = [dask.delayed(fit)(siamese_train_loader, 
                                    siamese_test_loader, 
                                    model, 
                                    loss_fn, 
                                    optimizer, 
                                    scheduler, 
                                    n_epochs, 
                                    cuda, 
                                    log_interval) for model, optimizer, scheduler in models]
        losses = dask.compute(*dlosses)
        print('Done with losses')
        # Test Embeddings
        dembeddings = [dask.delayed(vis.extract_embeddings)(siamese_test_dataset.test_data, 
                                                            siamese_test_dataset.labels, 
                                                            model) for model, _, _ in models]
        embeddings = dask.compute(*dembeddings)
        print('Done with embeddings')
        # Evaluation; each embeddings entry is unpacked as (embeddings, labels)
        danmi_eval = [dask.delayed(cluster_eval)(model_embeddings, 
                                                model_labels, 
                                                len(np.unique(model_labels))) for model_embeddings, model_labels in embeddings]
        # Unpack the list: dask.compute(list) would return a 1-tuple wrapping
        # the whole list instead of one score per model.
        anmi_eval = dask.compute(*danmi_eval)
        print('Done with eval')
        # Store per-model results
        for i, anmi in enumerate(anmi_eval):
            nmodel = 'model_{}'.format(i)
            meta_data['n_features'].append(len(feat))
            meta_data['model'].append(nmodel)
            meta_data['ANMI'].append(anmi)
            exp_data[nmodel] = {'data': embeddings[i],
                                'loss': losses[i],
                                'ANMI': anmi}
        pd.to_pickle(exp_data, os.path.join(exp_dir, "model_{}.pkl".format(len(feat))))
        
    pd.to_pickle(meta_data, os.path.join(exp_dir, "model_meta_data.pkl"))

In [18]:
def main(disease, sample_type, **kwargs):
    """
    Run the feature-subset experiment end to end.

    Loads and normalizes the TCGA data, draws random feature subsets, builds
    the train/test split through ``Experiment``, creates a time-stamped output
    directory, pickles the split and then calls ``feature_training``.

    Parameters
    ----------
    disease : list
        Disease codes; used both to load data and as the 'Disease' level of
        the label hierarchy.
    sample_type : list
        Values for the 'Sample Type' level of the label hierarchy.
    **kwargs
        Required keys: ``n_features``, ``steps``, ``embedding``, ``devices``
        and ``n_epochs``.

    Raises
    ------
    KeyError
        If a required keyword is missing (raised before any data is read).
    FileExistsError
        If the output directory for this minute and configuration exists.
    """
    # Read the settings up front so a missing key fails before the slow load
    n_features = kwargs['n_features']
    steps = kwargs['steps']
    embedding_dim = kwargs['embedding']
    devices = kwargs['devices']
    n_epochs = kwargs['n_epochs']

    # GPUs
    cuda = torch.cuda.is_available()
    print("Cuda is available: {}".format(cuda))
    
    # Read / write / process
    tcga, tcga_meta = process_TCGA(disease)
    # Feature design
    feature_idx = generate_fsets(tcga, n_features=n_features, steps=steps)
    # Experiment design
    hierarchy = OrderedDict({'Disease':disease,
                             'Sample Type':sample_type})
    # Define experiment
    exp = Experiment(meta_data=tcga_meta,
                     hierarchy=hierarchy,
                     index='CGHubAnalysisID',
                     cases='Case ID',
                     min_samples=20)
    # Train / Test split
    exp.train_test_split(cases='Case ID')
    # Return data 
    train_data, train_labels = exp.get_data(tcga, subset="train", dtype=np.float32)
    test_data, test_labels = exp.get_data(tcga, subset="test", dtype=np.float32)
    
    # Output directory: <timestamp>_<n labels>_<embedding>_<n_features>-<steps>
    dtime = datetime.datetime.today().strftime("%Y.%m.%d_%H:%M")
    exp_dir = "/srv/nas/mk2/projects/pan-cancer/experiments/test/{}_{}_{}_{}-{}".format(dtime, 
                                                                                len(exp.labels_dict),
                                                                                embedding_dim,
                                                                                n_features, 
                                                                                steps)
    # exist_ok=False: never write into the directory of an earlier run
    pathlib.Path(exp_dir).mkdir(parents=True, exist_ok=False)
    print('Saving to: \n{}'.format(exp_dir))
    
    # Meta data
    experiments = {'experiment': exp,
                   'train':(train_data, train_labels),
                   'test': (test_data, test_labels)}
    pd.to_pickle(experiments, os.path.join(exp_dir, "experiment_meta_data.pkl"))
    
    # Training; forward the detected CUDA flag instead of relying on the
    # callee's default of True
    feature_training(train_data, train_labels, test_data, test_labels, 
                     feature_idx, exp_dir, 
                     devices, embedding_dim, n_epochs, cuda=cuda)

### Setup

In [19]:
# Root of the TCGA_CCLE_GCP data directory
base = "/srv/nas/mk2/projects/pan-cancer/TCGA_CCLE_GCP"
# Metadata table, read as tab-separated text
tcga_meta = pd.read_csv(os.path.join(base, "TCGA", "TCGA_GDC_ID_MAP.tsv"), sep="\t")

In [21]:
# disease = tcga_meta[tcga_meta['Sample Type']=='Solid Tissue Normal']['Disease'].value_counts()
# disease = list(disease[disease>=20].index)[:5]
# disease

['BRCA', 'KIRC', 'LUAD', 'THCA', 'PRAD']

In [22]:
# Disease codes used for this run
disease = [
    'BRCA',
    'LUAD',
    'KIRC',
    'THCA',
    'PRAD',
    'SKCM',
]

In [23]:
# Values for the 'Sample Type' level of the label hierarchy built in main()
sample_type = ['Primary Tumor', 'Solid Tissue Normal']
# Settings forwarded to main() as keyword arguments
params = dict(
    devices=[3, 4],
    n_features=2000,
    steps=100,
    embedding=2,
    n_epochs=10,
)

In [24]:
# Run the whole experiment: one training pass per feature subset and device
main(disease=disease, sample_type=sample_type, **params)

Cuda is available: True
Saving to: 
/srv/nas/mk2/projects/pan-cancer/experiments/test/2020.03.27_17:32_11_2_2000-100
Number features: 1

Epoch: 1/10. Train set: Average loss: 0.1709
Epoch: 1/10. Validation set: Average loss: 0.1502
Epoch: 1/10. Train set: Average loss: 0.1658
Epoch: 1/10. Validation set: Average loss: 0.1896

Epoch: 2/10. Train set: Average loss: 0.1625
Epoch: 2/10. Validation set: Average loss: 0.1440Epoch: 2/10. Train set: Average loss: 0.1601
Epoch: 2/10. Validation set: Average loss: 0.1966

Epoch: 3/10. Train set: Average loss: 0.1539
Epoch: 3/10. Validation set: Average loss: 0.1581
Epoch: 3/10. Train set: Average loss: 0.1508
Epoch: 3/10. Validation set: Average loss: 0.1432
Epoch: 4/10. Train set: Average loss: 0.1473
Epoch: 4/10. Validation set: Average loss: 0.1480
Epoch: 4/10. Train set: Average loss: 0.1514
Epoch: 4/10. Validation set: Average loss: 0.1448
Epoch: 4/10. Train set: Average loss: 0.1514
Epoch: 4/10. Validation set: Average loss: 0.1448
Epoch: 

KeyboardInterrupt: 