In [None]:
import sys
sys.path.insert(0, "..")
import TCGAData, numpy as np, matplotlib.pyplot as plt, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import seaborn as sns
import tqdm
import copy

# Dataset location and the class labels passed to the loader.
# NOTE(review): these are absolute, user-specific paths, so the notebook only
# runs on a machine where this directory exists.
root = "/mnt/home/sgolkar/projects/cancer-net/data/brain"
files = "/mnt/home/sgolkar/projects/cancer-net/data/brain/samples.txt"
label_mapping = ["LGG", "GBM"]

# Collect the loader arguments in one place, then build the dataset from them.
dataset_kwargs = dict(
    root=root,
    files=files,
    label_mapping=label_mapping,
    gene_graph="brain.geneSymbol.gz",
)
dataset = TCGAData.TCGADataset(**dataset_kwargs)

In [None]:
# Map every gene that occurs in any sample to a fixed feature-column index.
# The genes are sorted before enumeration so the column layout is the same on
# every run: iterating a bare set of strings depends on per-process hash
# randomization, which would shuffle the columns after each kernel restart.
# NOTE(review): assumes node_id entries are mutually orderable (e.g. gene
# symbol strings) - confirm.
all_genes = {gene for sample_genes in dataset.data.node_id for gene in sample_genes}
gene_dict = {gene: column for column, gene in enumerate(sorted(all_genes))}
num_genes = len(gene_dict)

# Length of one per-gene encoding row taken from sample_data.x.
# NOTE(review): hard-coded; must match the dataset's encoding width - confirm.
ENCODING_DIM = 128

# Per-sample feature matrices, all indexed [sample, gene column]:
#   Xencoding  - full encoding vector of each mutated gene (zeros elsewhere)
#   Xmutated   - 1 where the gene is listed for the sample
#   Xmutnorm   - Euclidean norm of the gene's encoding
#   Xnummut    - number of node_id entries of the sample
#   Xnummutv2  - number of genes whose encoding norm is non-zero
Xencoding = np.zeros((dataset.num_samples, num_genes, ENCODING_DIM))
Xmutated = np.zeros((dataset.num_samples, num_genes))
Xmutnorm = np.zeros((dataset.num_samples, num_genes))
Xnummut = np.zeros((dataset.num_samples, 1))
Xnummutv2 = np.zeros((dataset.num_samples, 1))

for sample_num in range(dataset.num_samples):
    sample_data = dataset.get(sample_num)
    Xnummut[sample_num, 0] = len(sample_data.node_id)
    for encoding, sample_gene in zip(sample_data.x, sample_data.node_id):
        # Look the column up once instead of once per target array.
        column = gene_dict[sample_gene]
        Xencoding[sample_num, column] = encoding
        Xmutated[sample_num, column] = 1
        Xmutnorm[sample_num, column] = np.linalg.norm(encoding)
    Xnummutv2[sample_num, 0] = (Xmutnorm[sample_num] != 0).sum()

# "v2" one-hot: differs from Xmutated only where a listed gene has an
# all-zero encoding (norm 0), which this variant treats as not mutated.
Xmutatedv2 = (Xmutnorm != 0).astype(float)
# Flatten (gene, encoding) into a single feature axis for the linear model.
Xencoding_flat = Xencoding.reshape(len(Xencoding), -1)

In [None]:
# Baselines under comparison: (display name, feature matrix). Every baseline
# uses LogisticRegression with default arguments.
baselines = [
    ("LR on num_mut", Xnummut),
    ("LR on num_mutv2", Xnummutv2),
    ("LR on mut_onehot", Xmutated),
    ("LR on mut_onehotv2", Xmutatedv2),
    ("LR on mut_size", Xmutnorm),
    ("LR on full_encoding", Xencoding_flat),
]
method_dict = {
    label: {"data": features, "fitter": LogisticRegression}
    for label, features in baselines
}

# Long-format accumulator: one (method, split, accuracy) row per evaluation.
results = {"method": [], "split": [], "accuracy": []}

for valid_seed in tqdm.tqdm(range(20), desc="iterations"):
    # Select the train/valid partition for this seed before reading indices.
    dataset.set_valid_seed(valid_seed)

    y_test = dataset.data.y[dataset.test_idx]
    y_train = dataset.data.y[dataset.train_idx]
    y_valid = dataset.data.y[dataset.valid_idx]

    # Insertion order fixes the order in which splits are scored.
    split_indices = {
        "train": dataset.train_idx,
        "valid": dataset.valid_idx,
        "test": dataset.test_idx,
    }

    for method_name, method_spec in method_dict.items():
        X = method_spec["data"]
        # A fresh estimator per (seed, method) so no state carries over.
        fitter = method_spec["fitter"]()
        fitter.fit(X[split_indices["train"]], y_train)

        for split, idx in split_indices.items():
            results["method"].append(method_name)
            results["split"].append(split)
            results["accuracy"].append(fitter.score(X[idx], dataset.data.y[idx]))

In [None]:
# Independent snapshot of the baseline results: rows appended to results_copy
# afterwards never leak back into results.
results_copy = {
    column: copy.deepcopy(values) for column, values in results.items()
}

In [None]:
# Hard-coded accuracies for the two GCN2 variants (2048 and 128 hidden units),
# 20 values per list.
# NOTE(review): presumably pasted from a separate training run, one entry per
# validation seed - confirm provenance.
train_acc_2048 = [
    0.9488888888888889, 0.9533333333333334, 0.9355555555555556, 0.9333333333333333,
    0.92, 0.9511111111111111, 0.9533333333333334, 0.9444444444444444,
    0.9111111111111111, 0.9288888888888889, 0.9377777777777778, 0.9488888888888889,
    0.9022222222222223, 0.9511111111111111, 0.9444444444444444, 0.9422222222222222,
    0.8977777777777778, 0.9244444444444444, 0.9311111111111111, 0.9333333333333333,
]
valid_acc_2048 = [
    0.7272727272727273, 0.7818181818181819, 0.8272727272727273, 0.8272727272727273,
    0.7818181818181819, 0.7363636363636363, 0.8090909090909091, 0.7,
    0.8090909090909091, 0.8545454545454545, 0.7727272727272727, 0.7727272727272727,
    0.7818181818181819, 0.6909090909090909, 0.7636363636363637, 0.7363636363636363,
    0.8, 0.7636363636363637, 0.7363636363636363, 0.7727272727272727,
]
train_acc_128 = [
    0.9288888888888889, 0.9133333333333333, 0.92, 0.8866666666666667,
    0.92, 0.9133333333333333, 0.8755555555555555, 0.9355555555555556,
    0.9111111111111111, 0.9222222222222223, 0.9022222222222223, 0.9088888888888889,
    0.9222222222222223, 0.9022222222222223, 0.9088888888888889, 0.9088888888888889,
    0.9066666666666666, 0.9, 0.9044444444444445, 0.9333333333333333,
]
valid_acc_128 = [
    0.7636363636363637, 0.7272727272727273, 0.8090909090909091, 0.7818181818181819,
    0.7909090909090909, 0.7727272727272727, 0.8272727272727273, 0.7545454545454545,
    0.7818181818181819, 0.7727272727272727, 0.8272727272727273, 0.7181818181818181,
    0.7454545454545455, 0.6454545454545455, 0.8181818181818182, 0.7,
    0.7090909090909091, 0.7818181818181819, 0.7545454545454545, 0.7,
]

In [None]:
# Rebuild results_copy from the baseline results before appending, so this
# cell is idempotent: appending to a results_copy left over from an earlier
# run would add the GCN rows again and distort the plotted distributions.
results_copy = copy.deepcopy(results)

# (method label, train accuracies, valid accuracies) for each GCN2 variant.
gcn_runs = [
    ("GCN2 2048 hidden", train_acc_2048, valid_acc_2048),
    ("GCN2 128 hidden", train_acc_128, valid_acc_128),
]

# All four lists are indexed in lockstep below; fail loudly on a mismatch
# rather than silently dropping entries or hitting an IndexError mid-loop.
num_runs = len(train_acc_2048)
for gcn_label, gcn_train, gcn_valid in gcn_runs:
    if len(gcn_train) != num_runs or len(gcn_valid) != num_runs:
        raise ValueError(
            f"accuracy lists for {gcn_label!r} must each have {num_runs} entries"
        )

# Row order is kept as: 2048 train, 2048 valid, 128 train, 128 valid per run.
for aa in range(num_runs):
    for gcn_label, gcn_train, gcn_valid in gcn_runs:
        for gcn_split, gcn_values in (("train", gcn_train), ("valid", gcn_valid)):
            results_copy["method"].append(gcn_label)
            results_copy["split"].append(gcn_split)
            results_copy["accuracy"].append(gcn_values[aa])

In [None]:
# Split violins of train vs. valid accuracy per method, using results_copy;
# rows whose split is not in hue_order are not drawn.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 6))
comparison_frame = pd.DataFrame(results_copy)
sns.violinplot(
    data=comparison_frame,
    x="accuracy",
    y="method",
    hue="split",
    hue_order=["train", "valid"],
    split=True,
    inner="quartile",
    palette="Set2",
    ax=ax,
)
fig.tight_layout()
fig.savefig("gcn2_vs_baseline.pdf")

In [None]:
# Split violins of valid vs. test accuracy per baseline method, using results.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 6))
baseline_frame = pd.DataFrame(results)
sns.violinplot(
    data=baseline_frame,
    x="accuracy",
    y="method",
    hue="split",
    hue_order=["valid", "test"],
    split=True,
    inner="quartile",
    palette="Set2",
    ax=ax,
);