In [1]:
import numpy as np
import pandas as pd

import sklearn.preprocessing
import sklearn.neural_network
import sklearn.metrics
import tqdm.notebook

In [2]:
from plotnine import *
import plotnine
plotnine.options.figure_size = (8, 4)

import matplotlib.pyplot as plt

# 1. Load data

In [209]:
train_data = pd.read_csv('data/filtered/train.tsv.gz', sep='\t')
train_data.head(2)

Unnamed: 0,sample_barcode,source,cancer,C3orf30,TMEM31,FAM57B,ZNF366,NSMCE1,FAM150B,CTSB,...,TRIM55,KRT19,SNORD35B,RASGEF1B,CELA1,PERP,ITGAM,HIBADH,TMEM176A,LAP3
0,TCGA-2W-A8YY-01A-11R-A37O-07,CESC,0,0.0,0.3577,1.7883,6.4378,1175.608,1.073,29436.6953,...,0.0,9517.8827,0.0,254.2918,0.0,7296.495,248.2117,1050.7868,375.8941,2053.2904
1,TCGA-4J-AA1J-01A-21R-A38B-07,CESC,0,0.4272,0.4272,0.5297,12.3879,794.1051,2.563,19197.3516,...,0.4272,54491.6702,0.0,174.7117,0.0,17494.2332,98.6758,811.1918,678.3426,2566.4246


In [4]:
test_data = (
    pd.read_csv('data/filtered/test.tsv.gz', sep='\t')
    .assign(source=lambda df: df['source'].map({'breast': 'BRCA', 'melanoma': 'SKCM'}))
    .dropna(subset=['cancer'])
)
test_data.head(2)

Unnamed: 0,sample_barcode,source,cancer,C3orf30,TMEM31,FAM57B,ZNF366,NSMCE1,FAM150B,CTSB,...,TRIM55,KRT19,SNORD35B,RASGEF1B,CELA1,PERP,ITGAM,HIBADH,TMEM176A,LAP3
0,BC01_02,BRCA,1.0,0.0,0.0,0.0,0.0,10.15,0.0,93.22,...,0.0,321.25,0.0,0.0,0.0,11.5,0.0,63.86,0.0,47.68
1,BC01_03,BRCA,1.0,0.0,0.0,0.0,0.0,73.42,0.0,30.79,...,0.0,531.77,0.0,0.0,0.0,53.5,0.0,19.04,0.0,10.73


# 2. Transform data

Transform test data using the same transformation used for the train data. Don't allow test data to be incorporated into the scaling.

In [5]:
y_train = train_data[['source', 'cancer']].values
y_test = test_data[['source', 'cancer']].values

In [6]:
X_train = train_data.iloc[:, 3:].values
X_test = test_data.iloc[:, 3:].values

scaler = sklearn.preprocessing.StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
np.save('data/filtered/X_train.npy', X_train)
np.save('data/filtered/X_test.npy', X_test)

np.save('data/filtered/y_train.npy', y_train)
np.save('data/filtered/y_test.npy', y_test)

--- 

# Start here

In [54]:
X_train = np.load('data/filtered/X_train.npy')
y_train = np.load('data/filtered/y_train.npy', allow_pickle=True)

X_test = np.load('data/filtered/X_test.npy')
y_test = np.load('data/filtered/y_test.npy', allow_pickle=True)

In [4]:
def train_and_evaluate_mcc(train_X, train_y, test_X, test_y):
    classifier = sklearn.neural_network.MLPClassifier(
        hidden_layer_sizes=(2, 2),
        activation='relu', 
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate='constant',
        learning_rate_init=0.001,
        max_iter=10000,
        shuffle=True,
        random_state=0
    )
    classifier.fit(train_X, train_y)
    preds = classifier.predict(test_X)
    return sklearn.metrics.matthews_corrcoef(test_y, preds)

# 3. Predict cancer

In [5]:
# Here we're predicting cancer vs non-cancer. They used Matthew's correlation for both
# cancer/non-cancer and tissue. 
genes_to_sample = [300,]
n_bootstraps = 100
N, N_genes = X_train.shape

tissues = sorted(set(y_train[:, 0]))

cancer_performance_values = list()

for tissue in tqdm.notebook.tqdm(tissues):
    for n in genes_to_sample:
        for i in tqdm.notebook.tnrange(n_bootstraps):
            # First subset to relevant tissues and sample n random genes
            which_genes = np.random.choice(a=N_genes, size=n, replace=False)
            which_samples = np.where(y_train[:, 0] == tissue)[0]
            X = X_train[which_samples][:, which_genes]
            y = y_train[which_samples, 1].astype(int)
            
            # Split between train and test samples
            ordered_samples = np.random.choice(a=len(which_samples), 
                                               size=len(which_samples), 
                                               replace=False)
            n_train = int(0.75 * len(which_samples))
        
            train_indices = ordered_samples[:n_train]
            test_indices = ordered_samples[n_train:]

            train_X = X[train_indices]
            train_y = y[train_indices]

            test_X = X[test_indices]
            test_y = y[test_indices]

            cancer_performance_values.append((
                n, tissue, i, train_and_evaluate_mcc(train_X, train_y, test_X, test_y)
            ))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))










HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))







In [6]:
cancer_performance_df = pd.DataFrame(
    cancer_performance_values, columns=['n_genes', 'tissue', 'iteration', 'mcc'])

cancer_performance_df.to_csv('cancer_performance.tsv', sep='\t', index=False)

cancer_performance_df.head(2)

Unnamed: 0,n_genes,tissue,iteration,mcc
0,300,BLCA,0,0.478665
1,300,BLCA,1,0.75694


# Predict tissue

In [9]:
# Now we'll predict tissue and dichotomize between cancer and non-cancer
genes_to_sample = [300,]
n_bootstraps = 100
N, N_genes = X_train.shape

tissue_performance_values = list()

for cancer_state in tqdm.notebook.tnrange(2):
    for n in genes_to_sample:
        for i in tqdm.notebook.tnrange(n_bootstraps):
            # First subset to relevant cancer state and sample n random genes
            which_genes = np.random.choice(a=N_genes, size=n, replace=False)
            which_samples = np.where(y_train[:, 1] == cancer_state)[0]
            X = X_train[which_samples][:, which_genes]
            y = y_train[which_samples, 0]

            # Split between train and test samples
            ordered_samples = np.random.choice(a=len(which_samples), 
                                               size=len(which_samples), 
                                               replace=False)
            n_train = int(0.75 * len(which_samples))
        
            train_indices = ordered_samples[:n_train]
            test_indices = ordered_samples[n_train:]

            train_X = X[train_indices]
            train_y = y[train_indices]

            test_X = X[test_indices]
            test_y = y[test_indices]

            tissue_performance_values.append((
                n, cancer_state, i, train_and_evaluate_mcc(train_X, train_y, test_X, test_y)
            ))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))





In [10]:
tissue_performance_df = pd.DataFrame(
    tissue_performance_values, columns=['n_genes', 'cancer_state', 'iteration', 'mcc'])

tissue_performance_df.to_csv('tissue_performance.tsv', sep='\t', index=False)

tissue_performance_df.head(2)

Unnamed: 0,n_genes,cancer_state,iteration,mcc
0,300,0,0,0.36667
1,300,0,1,0.411172


# Pick the 300 genes

In [5]:
import scipy.stats

In [26]:
which_cancer = y_train[:, 1].astype(bool)
X_cancer = X_train[which_cancer]
X_normal = X_train[~which_cancer]

results = list()

for i in range(X_train.shape[1]):
    cancer = X_cancer[:, i]
    normal = X_normal[:, i]
    results.append(scipy.stats.f_oneway(cancer, normal))



In [46]:
genes_df = (
    pd.DataFrame(results)
    .reset_index()
    .sort_values('statistic', ascending=False)
)

top_gene_indices = genes_df.iloc[:300]['index'].values

genes_df.head(2)

Unnamed: 0,index,statistic,pvalue
10821,10821,2629.843866,0.0
13239,13239,2507.06009,0.0


In [55]:
X_300_train = X_train[:, top_gene_indices]
X_300_test = X_test[:, top_gene_indices]

In [64]:
# Here we're predicting cancer vs non-cancer. They used Matthew's correlation for both
# cancer/non-cancer and tissue. 
n_bootstraps = 20

tissues = sorted(set(y_train[:, 0]))

cancer_performance_values = list()

for tissue in tqdm.notebook.tqdm(tissues):
    for i in tqdm.notebook.tnrange(n_bootstraps):
        # First subset to relevant tissues and sample n random genes
        which_samples = np.where(y_train[:, 0] == tissue)[0]
        X = X_300_train[which_samples]
        y = y_train[which_samples, 1].astype(int)

        # Split between train and test samples
        ordered_samples = np.random.choice(a=len(which_samples), 
                                           size=len(which_samples), 
                                           replace=False)
        n_train = int(0.75 * len(which_samples))

        train_indices = ordered_samples[:n_train]
        test_indices = ordered_samples[n_train:]

        train_X = X[train_indices]
        train_y = y[train_indices]

        test_X = X[test_indices]
        test_y = y[test_indices]

        cancer_performance_values.append((
            tissue, i, train_and_evaluate_mcc(train_X, train_y, test_X, test_y)
        ))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))






HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))





In [65]:
cancer_performance_df = pd.DataFrame(
    cancer_performance_values, columns=['tissue', 'iteration', 'mcc'])

cancer_performance_df.to_csv('cancer_performance_300.tsv', sep='\t', index=False)

cancer_performance_df.head(2)

Unnamed: 0,tissue,iteration,mcc
0,BLCA,0,0.836827
1,BLCA,1,0.722781


In [66]:
# Now we'll predict tissue and dichotomize between cancer and non-cancer
n_bootstraps = 20

tissue_performance_values = list()

for cancer_state in tqdm.notebook.tnrange(2):
    for i in tqdm.notebook.tnrange(n_bootstraps):
        # First subset to relevant cancer state and sample n random genes
        which_samples = np.where(y_train[:, 1] == cancer_state)[0]
        X = X_train[which_samples]
        y = y_train[which_samples, 0]

        # Split between train and test samples
        ordered_samples = np.random.choice(a=len(which_samples), 
                                           size=len(which_samples), 
                                           replace=False)
        n_train = int(0.75 * len(which_samples))

        train_indices = ordered_samples[:n_train]
        test_indices = ordered_samples[n_train:]

        train_X = X[train_indices]
        train_y = y[train_indices]

        test_X = X[test_indices]
        test_y = y[test_indices]

        tissue_performance_values.append((
            cancer_state, i, train_and_evaluate_mcc(train_X, train_y, test_X, test_y)
        ))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))





In [67]:
tissue_performance_df = pd.DataFrame(
    tissue_performance_values, columns=['cancer_state', 'iteration', 'mcc'])

tissue_performance_df.to_csv('tissue_performance_300.tsv', sep='\t', index=False)

tissue_performance_df.head(2)

Unnamed: 0,cancer_state,iteration,mcc
0,0,0,0.607763
1,0,1,0.435059


# UMAP

In [68]:
import umap

In [69]:
reducer = umap.UMAP()
embedding = reducer.fit_transform(X_train)

plot_df = (
    pd.DataFrame(np.concatenate([embedding, y_train], axis=1), 
                 columns=['UMAP1', 'UMAP2', 'tissue', 'cancer'])
    .assign(
        UMAP1=lambda df: df['UMAP1'].astype(float),
        UMAP2=lambda df: df['UMAP2'].astype(float),
        cancer=lambda df: df['cancer'].map({0: 'Normal', 1: 'Cancer'})
    )
)

In [100]:
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])
sc_or_bulk = np.concatenate((np.repeat('Bulk', X_train.shape[0]), np.repeat('Single cell', X_test.shape[0])))

reducer2 = umap.UMAP()
embedding2 = reducer2.fit_transform(X)

plot2_df = (
    pd.DataFrame(np.concatenate([embedding2, y], axis=1), 
                 columns=['UMAP1', 'UMAP2', 'tissue', 'cancer'])
    .assign(
        UMAP1=lambda df: df['UMAP1'].astype(float),
        UMAP2=lambda df: df['UMAP2'].astype(float),
        cancer=lambda df: df['cancer'].map({0: 'Normal', 1: 'Cancer'}),
        source=sc_or_bulk
    )
)

In [109]:
full_plot_df = pd.concat([plot_df.assign(plot=1, source='Bulk'), plot2_df.assign(plot=2)])
full_plot_df.to_csv('umap.tsv', sep='\t', index=False)

# Performance on test data

In [140]:
def train_and_evaluate_roc(train_X, train_y, test_X, test_y):
    classifier = sklearn.neural_network.MLPClassifier(
        hidden_layer_sizes=(2, 2),
        activation='relu', 
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate='constant',
        learning_rate_init=0.001,
        max_iter=10000,
        shuffle=True,
        random_state=0
    )
    classifier.fit(train_X, train_y)
    preds = classifier.predict(test_X)
    return (sklearn.metrics.matthews_corrcoef(test_y, preds), 
            sklearn.metrics.roc_auc_score(test_y, classifier.predict_proba(test_X)[:, 0]))

In [141]:
# Here we're predicting cancer vs non-cancer. They used Matthew's correlation for both
# cancer/non-cancer and tissue. 

tissues = sorted(set(y_test[:, 0]))

cancer_performance_values = list()

for tissue in tqdm.notebook.tqdm(tissues):
    # First subset to relevant tissues
    which_samples = np.where(y_train[:, 0] == tissue)[0]
    train_X = X_train[which_samples][:, top_gene_indices]
    train_y = y_train[which_samples, 1].astype(int)
    
    which_samples = np.where(y_test[:, 0] == tissue)[0]
    test_X = X_test[which_samples][:, top_gene_indices]
    test_y = y_test[which_samples, 1].astype(int)

    print(train_y.sum(), test_y.sum())
    cancer_performance_values.append((
        tissue, train_and_evaluate_roc(train_X, train_y, test_X, test_y)
    ))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

112 317




1 1257





In [214]:
y_test

array([['BRCA', 1.0],
       ['BRCA', 1.0],
       ['BRCA', 1.0],
       ...,
       ['SKCM', 0.0],
       ['SKCM', 0.0],
       ['SKCM', 0.0]], dtype=object)

In [166]:
X = X_train[y_train[:, 0] == 'BRCA'][:, top_gene_indices]
y = y_train[y_train[:, 0] == 'BRCA'][:, 1].astype(int)

test_x = X_test[y_test[:, 0] == 'BRCA'][:, top_gene_indices]
test_y = y_test[y_test[:, 0] == 'BRCA'][:, 1].astype(int)

classifier = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu', 
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=10000,
    shuffle=True,
    random_state=0
)
classifier.fit(X, y)
preds = classifier.predict(test_x)
(
    sklearn.metrics.matthews_corrcoef(test_y, preds), 
    sklearn.metrics.roc_auc_score(test_y, classifier.predict_proba(test_x)[:, 0])
)



(0.0, 0.6568683682248352)

In [215]:
X = X_train[y_train[:, 0] == 'SKCM'][:, top_gene_indices]
y = y_train[y_train[:, 0] == 'SKCM'][:, 1].astype(int)

test_x = X_test[y_test[:, 0] == 'SKCM'][:, top_gene_indices]
test_y = y_test[y_test[:, 0] == 'SKCM'][:, 1].astype(int)

classifier = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu', 
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=10000,
    shuffle=True,
    random_state=0
)
classifier.fit(X, y)
preds = classifier.predict(test_x)
(
    sklearn.metrics.matthews_corrcoef(test_y, preds), 
    sklearn.metrics.roc_auc_score(test_y, classifier.predict_proba(test_x)[:, 0])
)



(0.0, 0.5896542018260396)

In [157]:
X = X_train[y_train[:, 1] == 0][:, top_gene_indices]
y = y_train[y_train[:, 1] == 0][:, 0]

test_x = X_test[y_test[:, 1] == 0][:, top_gene_indices]
test_y = y_test[y_test[:, 1] == 0][:, 0]

classifier = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes=(2, 2),
    activation='relu', 
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=10000,
    shuffle=True,
    random_state=0
)
classifier.fit(X, y)
preds = classifier.predict(test_x)
sklearn.metrics.matthews_corrcoef(test_y, preds)

0.3216752480390369

In [212]:
X = X_train[y_train[:, 1] == 1][:, top_gene_indices]
y = y_train[y_train[:, 1] == 1][:, 0]

test_x = X_test[y_test[:, 1] == 1][:, top_gene_indices]
test_y = y_test[y_test[:, 1] == 1][:, 0]

classifier = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes=(2, 2),
    activation='relu', 
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=10000,
    shuffle=True,
    random_state=0
)
classifier.fit(X, y)
preds = classifier.predict(test_x)
sklearn.metrics.matthews_corrcoef(test_y, preds)

0.22580963797744225

# Confusion matrices

In [199]:
# Within cancer state

X300 = X_train[:, top_gene_indices]

within_cancer_preds = None

for cancer_state in tqdm.notebook.tnrange(2):
    X = X300[y_train[:, 1] == cancer_state]
    y = y_train[y_train[:, 1] == cancer_state][:, 0]
    
    for i in tqdm.notebook.tnrange(10):
        # Split between train and test samples
        ordered_samples = np.random.choice(
            a=len(y), size=len(y), replace=False)
        n_train = int(0.75 * len(y))

        train_indices = ordered_samples[:n_train]
        test_indices = ordered_samples[n_train:]

        train_X = X[train_indices]
        train_y = y[train_indices]

        test_X = X[test_indices]
        test_y = y[test_indices]

        classifier = sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=(2, 2),
            activation='relu', 
            solver='adam',
            alpha=0.0001,
            batch_size='auto',
            learning_rate='constant',
            learning_rate_init=0.001,
            max_iter=10000,
            shuffle=True,
            random_state=0
        )
        classifier.fit(train_X, train_y)
        preds = classifier.predict(test_X)
        
        these_preds = (
            pd.DataFrame(np.vstack([preds, test_y]).T, columns=['predicted', 'true'])
            .assign(cancer=cancer_state)
        )
        if within_cancer_preds is None:
            within_cancer_preds = these_preds
        else:
            within_cancer_preds = pd.concat([within_cancer_preds, these_preds])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))





In [200]:
confusion_df = (
    within_cancer_preds
    .value_counts()
    .reset_index()
    .rename(columns={0: 'count'})
)
confusion_df.to_csv('confusion.tsv', sep='\t', index=False)

In [202]:
# Within cancer state

X300 = X_train[:, top_gene_indices]

tissues = sorted(set(y_train[:, 0]))

within_tissue_preds = None

for tissue in tqdm.notebook.tqdm(tissues):
    X = X300[y_train[:, 0] == tissue]
    y = y_train[y_train[:, 0] == tissue][:, 1].astype(int)
    
    for i in tqdm.notebook.tnrange(10):
        # Split between train and test samples
        ordered_samples = np.random.choice(
            a=len(y), size=len(y), replace=False)
        n_train = int(0.75 * len(y))

        train_indices = ordered_samples[:n_train]
        test_indices = ordered_samples[n_train:]

        train_X = X[train_indices]
        train_y = y[train_indices]

        test_X = X[test_indices]
        test_y = y[test_indices]

        classifier = sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=(2, 2),
            activation='relu', 
            solver='adam',
            alpha=0.0001,
            batch_size='auto',
            learning_rate='constant',
            learning_rate_init=0.001,
            max_iter=10000,
            shuffle=True,
            random_state=0
        )
        classifier.fit(train_X, train_y)
        preds = classifier.predict(test_X)
        
        these_preds = (
            pd.DataFrame(np.vstack([preds, test_y]).T, columns=['predicted', 'true'])
            .assign(tissue=tissue)
        )
        if within_tissue_preds is None:
            within_tissue_preds = these_preds
        else:
            within_tissue_preds = pd.concat([within_tissue_preds, these_preds])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))





In [206]:
confusion_tissues_df = (
    within_tissue_preds
    .value_counts()
    .reset_index()
    .rename(columns={0: 'count'})
)
confusion_tissues_df.to_csv('confusion_tissues.tsv', sep='\t', index=False)

In [204]:
within_tissue_preds

Unnamed: 0,predicted,true,tissue
0,1,1,BLCA
1,0,0,BLCA
2,0,0,BLCA
3,0,0,BLCA
4,0,0,BLCA
...,...,...,...
108,0,0,STAD
109,0,0,STAD
110,0,0,STAD
111,0,0,STAD


In [211]:
train_data.head(2)

Unnamed: 0,sample_barcode,source,cancer,C3orf30,TMEM31,FAM57B,ZNF366,NSMCE1,FAM150B,CTSB,...,TRIM55,KRT19,SNORD35B,RASGEF1B,CELA1,PERP,ITGAM,HIBADH,TMEM176A,LAP3
0,TCGA-2W-A8YY-01A-11R-A37O-07,CESC,0,0.0,0.3577,1.7883,6.4378,1175.608,1.073,29436.6953,...,0.0,9517.8827,0.0,254.2918,0.0,7296.495,248.2117,1050.7868,375.8941,2053.2904
1,TCGA-4J-AA1J-01A-21R-A38B-07,CESC,0,0.4272,0.4272,0.5297,12.3879,794.1051,2.563,19197.3516,...,0.4272,54491.6702,0.0,174.7117,0.0,17494.2332,98.6758,811.1918,678.3426,2566.4246
