In [3]:
# Training on brain data using GCN2 architecture

# Hyperparameters etc:

arch = "GCN2"
batch = 10
parall = False
lr = 0.005


import sys

sys.path.insert(0, "..")
import TCGAData
import torch, torch_geometric.transforms as T, torch.nn.functional as F
import matplotlib.pyplot as plt, numpy as np
from sklearn.metrics import roc_auc_score, roc_curve, auc
from torch_geometric.loader import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from arch.net import *
import wandb

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = "cpu"

root = "/mnt/home/sgolkar/projects/cancer-net/data/brain"
files = "/mnt/home/sgolkar/projects/cancer-net/data/brain/samples.txt"
label_mapping = ["LGG", "GBM"]


In [4]:
class EarlyStopping():
    """
    Early stopping to stop the training when the loss does not improve after
    certain epochs.
    """
    def __init__(self, patience=6, min_delta=0.05):
        """
        :param patience: how many epochs to wait before stopping when loss is
               not improving
        :param min_delta: minimum difference between new loss and old loss for
               new loss to be considered as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
    def __call__(self, val_loss):
        if self.best_loss == None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            # reset counter if validation loss improves
            self.counter = 0
        elif self.best_loss - val_loss < self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


In [10]:
class Objective(object):
    def __init__(self,arch,root,batch,epochs,device,hidden,valid_seed,alpha,lr,dropout):
        self.arch=arch
        self.root=root
        self.batch=batch
        self.epochs=epochs
        self.device=device
        self.hidden=hidden
        self.valid_seed=valid_seed
        
        ## hardcoding this false for now
        self.parall=False
        
        self.lr        = lr
        self.alpha     = alpha
        self.dropout   = dropout
        
        root = "/mnt/home/sgolkar/projects/cancer-net/data/brain"
        files = "/mnt/home/sgolkar/projects/cancer-net/data/brain/samples.txt"
        label_mapping = ["LGG", "GBM"]
        pre_transform = T.Compose([T.GCNNorm(), T.ToSparseTensor()])
        dataset = TCGAData.TCGADataset(
            root=root,
            files=files,
            label_mapping=label_mapping,
            gene_graph="brain.geneSymbol.gz",
            pre_transform=pre_transform,
            suffix="sparse",
            valid_seed=valid_seed,
        )

        self.train_loader = DataLoader(
            dataset,
            batch_size=batch,
            sampler=SubsetRandomSampler(dataset.train_idx),
            drop_last=True,
        )
        self.valid_loader = DataLoader(
            dataset,
            batch_size=batch,
            sampler=SubsetRandomSampler(dataset.valid_idx),
            drop_last=True,
        )

        self.model = GCN2Net(
            hidden_channels=self.hidden,
            num_layers=4,
            alpha=self.alpha,
            theta=1.0,
            shared_weights=False,
            dropout=self.dropout,
        ).to(device)

    def train(self, epoch, report=True):
        self.model.train()
        total_loss = 0
        correct = 0
        num_samps = 0
        for data in self.train_loader:
            if not self.parall:
                data = data.to(device)
            self.optimizer.zero_grad()

            output = self.model(data)
            output = output.squeeze()

            if self.parall:
                y = torch.cat([d.y for d in data]).to(output.device)
            else:
                y = data.y

            if len(output.shape) == 1:
                output = output.unsqueeze(0)
            loss = self.criterion(output, y)

            pred = output.max(1)[1]
            correct += pred.eq(y).sum().item()
            total_loss += loss
            loss.backward()
            self.optimizer.step()
            num_samps += len(y)
        if report:
            print(
                "Epoch: {:02d}, Loss: {:.3g}, Train Acc: {:.4f}".format(
                    epoch, total_loss / num_samps, correct / num_samps
                )
            )

        return total_loss / num_samps, correct / num_samps
    
    def valid(self):
        self.model.eval()
        correct = 0

        total_loss = 0
        num_samps = 0
        for data in self.valid_loader:
            if not self.parall:
                data = data.to(device)
            output = self.model(data)
            output = output.squeeze()

            pred = output.max(1)[1]
            if self.parall:
                y = torch.cat([d.y for d in data]).to(output.device)
            else:
                y = data.y
            loss = self.criterion(output, y)
            total_loss += loss.item()

            correct += pred.eq(y).sum().item()
            num_samps += len(y)
        return total_loss / num_samps, correct / num_samps
    
    def __call__(self):
        config = {"learning rate": self.lr,
         "valid_seed":self.valid_seed,
         "hidden":self.hidden,
         "epochs": self.epochs,
         "batch size": self.batch,
         "arch": self.arch,
         "alpha": self.alpha,
         "dropout": self.dropout}
        wandb.login()
        wandb.init(project="brain-GCN2-shuffle-2048", entity="chris-pedersen",config=config)
        wandb.watch(self.model, log_freq=1)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', factor=0.3, patience=7)
        early_stopping=EarlyStopping()
        self.criterion = F.nll_loss
        train_losses = []
        train_acces = []
        valid_acces = []
        valid_losses = []
        for epoch in range(1, self.epochs):
            report = False
            #report = (epoch) % 10 == 0
            train_loss, train_acc = self.train(epoch, report=report)
            valid_loss, valid_acc = self.valid()
            train_losses.append(train_loss.cpu().detach().numpy())
            valid_losses.append(valid_loss)
            train_acces.append(train_acc)
            valid_acces.append(valid_acc)
            wandb.log({"train loss": train_loss,
                       "valid loss": valid_loss,
                       "train accuracy": train_acc,
                       "valid accuracy": valid_acc,
                       "learning rate": self.optimizer.param_groups[0]["lr"]})
            if report:
                print("Valid Loss: {:.3g}, Acc: {:.4f}".format(valid_loss, valid_acc))
            if epoch>60:
                early_stopping(valid_loss)
                if early_stopping.early_stop:
                    wandb.finish()
                    #trial.study.stop()
                    break
        wandb.finish()
        return train_acces[-1], valid_acces[-1]

In [6]:
arch = "GCN2"
batch = 10
parall = False
epochs=200
hidden=128

alpha=0.555
dropout=0.315
lr=0.007

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = "cpu"

root = "/mnt/home/sgolkar/projects/cancer-net/data/brain"

In [7]:
train_acc=[]
valid_acc=[]
for valid_seed in range(20):
    trial=Objective(arch,root,batch,epochs,device,hidden,valid_seed,alpha,lr,dropout)
    train,valid=trial()
    train_acc.append(train)
    valid_acc.append(valid)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchris-pedersen[0m (use `wandb login --relogin` to force relogin)





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▄▄▄▄▄▄▅▄▅▅▆▅▅▆▆▆▆▆▆▆▇▆▆▆▇▆▇▆▇▇▇█▇█▇█
train loss,██▇▆▆▆▅▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▄▂▃▃▃▂▂▃▁▂▂▂▁
valid accuracy,▁▁▄▁▁▂▄▆▃▅▁▄▇▃▆▆▅▄█▄▃▇▆▆▆▇▄▅█▆█▃▇▃▇▃▃▇▇▆
valid loss,▄▃█▆▅█▃▂▂▃▃▃▂▃▃▄▄▃▁▃▃▂▄▃▃▃▄▃▂▄▃▄▃▃▂▄▄▃▂▃

0,1
learning rate,0.007
train accuracy,0.92889
train loss,0.01627
valid accuracy,0.76364
valid loss,0.06268





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▃▄▅▄▅▅▄▅▅▅▆▆▇▇▆▆▆▇▇▆▆▇▇█▇▇▇███▆█▇████
train loss,█▇▇▇▆▆▆▅▅▅▅▅▅▃▄▃▃▄▄▄▃▂▄▃▃▃▂▂▃▂▂▁▂▄▂▂▂▂▁▂
valid accuracy,▁▂▃▅▇▅▂▅▂█▃▁▄▃▁▄▆▄▅▅▂▄▄▅▂▅▆▅▂▄▅▄▄▁▅▄▄▅▇▅
valid loss,▁▁▂▂▃▂▁▅▂▂▄▃▁▂▂▅▁▄▂▂▂▂▄▁▃▂▅▂▅▄▅▂▅▂▂▃▂█▂▂

0,1
learning rate,0.007
train accuracy,0.91333
train loss,0.02151
valid accuracy,0.72727
valid loss,0.06876





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▄▄▄▄▄▅▅▅▅▅▆▆▅▆▆▆▆▇▇▆▆▆▇▇▇▇▇▇▇▇▇▇▆▇███
train loss,█▇▆▆▆▅▅▅▅▅▅▄▄▄▄▅▄▃▃▃▂▃▄▄▃▃▂▂▃▂▂▂▂▂▂▃▃▂▁▁
valid accuracy,▂▃▄▂▁▁▃▄▆▂▄▅▄▅▆▆▆▅▇█▄▄▆▄▄▄▆▅▅▇▃▅▅▆▁▄▄▅█▆
valid loss,▂▂▁▂▂▂▂▂▁▂▂▂▂▂▂▂▃▃▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃█▁▂▁▂

0,1
learning rate,0.007
train accuracy,0.92
train loss,0.02063
valid accuracy,0.80909
valid loss,0.06335





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▄▅▄▄▄▅▅▃▅▅▅▅▅▆▇▆▆▇▇▇▇▇▇▇▇█████▇███▇▇▇
train loss,█▇▆▆▅▆▆▆▅▅▆▅▅▅▄▅▃▃▃▄▃▂▃▂▂▂▂▂▁▁▁▁▂▃▁▂▂▃▂▂
valid accuracy,▁▅▁▂▂▆▅▇▆█▇▅▇▆▆▃▇█▇▆▆▇▇▇██▇▇████▇█▆▇▇██▇
valid loss,▅▃█▇▅▃▃▂▂▃▂▃▂▂▃▅▁▁▂▃▃▄▂▄▄▂▄▂▂▂▁▂▃▁▄▃▁▁▂▂

0,1
learning rate,0.007
train accuracy,0.88667
train loss,0.02521
valid accuracy,0.78182
valid loss,0.04383





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▃▃▃▅▄▅▄▅▅▅▅▅▆▅▆▅▆▆▅▇▇▇▇▇█▇▇▇▇▇▇▇▇█▇▇█
train loss,█▇▇▇▆▆▆▅▆▆▅▅▅▄▄▄▄▃▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▂▁
valid accuracy,▃▃▆▁▄▆▄▄▄▄▄▁▄▃▃▅▅▁▂▇▇▆▇▇▅▆▅▆█▇▇▅▄▄▄▆█▇▇█
valid loss,▂▂▁▃▃▁▁▂▂▂▃▃▂▂▄▃▇██▂▁▃▁▂▂▂▂▃▂▃▃▃▃▃▃▃▅▂▁▃

0,1
learning rate,0.007
train accuracy,0.92
train loss,0.01959
valid accuracy,0.79091
valid loss,0.06535





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▃▃▄▄▄▅▅▅▅▆▆▅▆▅▆▇▅▆▆▆▆▅▇▇▆▆▇▇▇█▇█▇████
train loss,█▇▇▇▇▆▆▆▅▅▆▅▄▅▆▄▄▄▃▄▄▄▃▄▄▃▃▃▃▂▃▃▂▂▁▂▂▁▂▁
valid accuracy,▁▃▆▃▅▅▅▅▆▆▅▇▆▅▇▇▇▇▇▆▆▇█▆▆▆▅▆▇█▆▆▆▇▇▆█▇▇▇
valid loss,▄▄▇█▆▄▄▄▄▄▃▂▃▃▂▃▄▄▄▇▃▁▃▃▂▂▄▂▃▁▂▂▄▂▄▅▁▃▄▅

0,1
learning rate,0.007
train accuracy,0.91333
train loss,0.02052
valid accuracy,0.77273
valid loss,0.0751





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▄▃▄▄▅▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇█▇▇███▇█▇███▇█▇
train loss,█▇▆▇▆▅▆▅▅▄▅▄▄▄▄▄▄▃▄▃▃▃▃▃▃▂▂▂▂▁▂▂▁▂▁▂▁▃▁▂
valid accuracy,▄▄▁▂▂▁▁▃▄▆▆▄▅▆▅▇▅▅▆▆▄▅▅▅▆▆▆▆▆▇▇▆▅▆▇▇█▆█▇
valid loss,▂▁▂▃▄▄█▅▅▂▂▃▂▂▂▁▁▃▂▂▂▂▂▁▂▁▁▂▁▁▂▁▂▂▁▁▁▂▁▁

0,1
learning rate,0.007
train accuracy,0.87556
train loss,0.0288
valid accuracy,0.82727
valid loss,0.04091





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▄▄▄▅▅▅▅▅▅▆▇▅▆▆▆▆▇▆▆▇▆▇▇▆▇█▇█▇▇▇▇█▇▇██
train loss,█▆▆▆▆▆▅▅▅▅▄▅▄▃▄▃▃▄▃▃▃▃▃▃▂▂▂▂▁▂▁▂▂▂▂▁▁▂▁▁
valid accuracy,▁▄▅▄▅▆▇▅▄▆▆▄▃▇▆▇█▅▃▆▆▇▅▇▆█▆▇▆▇▆▆▅▇▇▇▇▇▇▇
valid loss,▃▂▂▃▄▁▁▂▄▄▂▅▂▇▅▂▂▃█▄▂▃▆▃▄▆▃▃▃▃▄▂▃▁▂▂▁▂▂▃

0,1
learning rate,0.007
train accuracy,0.93556
train loss,0.01889
valid accuracy,0.75455
valid loss,0.06985





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▃▄▃▃▄▄▄▅▅▆▆▅▅▆▆▇▆▆▆▆▆▇▇▇▇▇▇██▆▇█▇██▇█
train loss,█▇▆▆▆▆▆▅▆▅▅▄▄▄▄▄▄▃▄▃▃▃▃▃▂▂▃▃▂▂▂▂▃▂▂▂▂▁▂▁
valid accuracy,▂▁▄▆▅▄▆▆▆▇▆▆▆▇███▇▇▆▇▇██▆▆█▆▆█▇█▇▇▇▆▇▇▇▇
valid loss,▂▃▂▁▁▂▁▁▂▁▁▁▂▃▃▂▃▁▁▄▃▃▃▂▂█▂▂▂▄▆▂▂▃▅▄▃▄▃▅

0,1
learning rate,0.007
train accuracy,0.91111
train loss,0.02079
valid accuracy,0.78182
valid loss,0.12706





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▂▄▄▄▄▄▄▅▄▄▄▆▅▆▆▆▆▆▆▆▆▇▇▇▆▆▇▆▇▇▇▇██████
train loss,█▇▆▆▆▅▆▅▅▅▆▅▅▄▄▄▄▄▃▄▃▃▃▃▂▂▃▃▃▂▂▂▂▂▁▁▁▁▁▁
valid accuracy,▂▁▃▄▅▅▅▆▃▅▆▇▅▆▇▅▇▅▅▆▆▅▆▅▇▇▇▇█▆▄▅▇▇▇█▆▇▇▆
valid loss,▃▃▃▅▂▃▃▅▃▄▁▂▄▂▂▁▂▁▂▁▂▃▂▃▄▃▂▅▃▄▄▆▄▄▂▅▂▅█▃

0,1
learning rate,0.007
train accuracy,0.92222
train loss,0.02195
valid accuracy,0.77273
valid loss,0.06277





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▄▅▄▅▅▅▅▅▅▅▆▆▆▅▆▆▆▆▆▇▇▇▇▇▆▇▇▇█▇▇██▇██▇█
train loss,█▇▆▅▆▅▅▅▄▅▅▅▄▄▄▄▄▄▃▄▃▃▂▂▂▂▃▂▂▂▂▂▁▁▁▂▁▁▂▁
valid accuracy,▃▃▁▃▂▄▅▄▄▃▄▄▆▃▆▅▇▃▆▆▅▆▅▇▇▅▆▆▆▆▆▆▅▇▇▇▇▇▇█
valid loss,▂▂▂▃▂▂▂▂▂▂▂▁▁▂█▅▂▅▂▆▇▅▂▃▂▂▂▁▂▄▂▂▂▃▃▁▃▅▃▄

0,1
learning rate,0.007
train accuracy,0.90222
train loss,0.02485
valid accuracy,0.82727
valid loss,0.108





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▅▄▄▄▅▅▄▆▄▆▅▆▆▆▆▇▅▆▇▇▇█▇▇▇▇▅▆▇▇█▇████
train loss,█▇▇▅▅▅▆▅▅▄▅▄▅▄▄▃▃▃▃▂▄▃▂▂▂▂▂▂▂▂▃▃▃▁▁▃▁▁▁▁
valid accuracy,▂▅▂▄▂▄▇▅▆▅▄▄▅▇▄▅▅▆▅▅▅▅▃▇▃▄▅▇▇▅▁▅▄▄▇▆███▆
valid loss,▁▁▂▂▂▂▁▁▁▁▄▃▃▃▄▂▁▃▂▂▂▄▅▅▄▅▂▂▂▂▂▄▂▄▂▂▃█▂▃

0,1
learning rate,0.007
train accuracy,0.90889
train loss,0.02617
valid accuracy,0.71818
valid loss,0.1356





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▄▄▄▅▄▅▅▆▅▅▆▅▆▆▆▆▅▆▅▆▅▆▆▇▆▆▆▇▇▇▇▇▇█▇▆█
train loss,█▇▆▆▆▅▅▆▅▆▄▅▅▄▄▄▄▄▃▄▃▄▄▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▃▁
valid accuracy,▃▅▁▆▆▅▆▅▅▄▄▆▆▇▇▇▅▅▇▆▅▆▆▆▇█▆▆▆▇▇▆█▇▆▆▇█▇▆
valid loss,▂▂▄▂▂▂▄▂▂▃▃▂▂▁▂▂▂█▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▄▆▂▂▁▃▄

0,1
learning rate,0.007
train accuracy,0.92222
train loss,0.02156
valid accuracy,0.74545
valid loss,0.09871





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▄▄▄▅▄▄▅▅▅▅▆▅▆▆▆▆▆▆▇▆▇▇▆▆▇▇▇▇▆▇█▆▇▇▇█▇
train loss,█▇▆▆▅▆▅▅▅▅▄▅▅▃▅▄▄▃▃▄▃▃▃▃▃▃▃▂▃▂▂▃▂▁▃▁▂▂▁▃
valid accuracy,▁▁▃▂▄▃▄▄▄▆▄▄▅▃▆▄▆▆▆▃▄▆▆▆▆▃▄▇▅▆▅▄▄▅▆▆▆█▇▄
valid loss,▂▁▂█▄▆▁▃▁▂▂▂▁▃▁▂▂▁▁▂▂▁▁▂▂▂▂▁▂▂▃▂▂▂▂▂▁▂▂▂

0,1
learning rate,0.007
train accuracy,0.90222
train loss,0.02934
valid accuracy,0.64545
valid loss,0.07978





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▄▄▄▆▄▄▅▅▃▅▆▆▆▆▆▆▆▆▆▇▆▇▇▇▇▇▆▇▇▇▇█▇▇▇█▇
train loss,█▇▆▆▆▆▅▅▆▅▄▆▄▄▄▄▄▄▃▃▃▃▂▄▃▃▂▂▃▃▂▂▃▃▁▁▃▁▂▂
valid accuracy,▁▁▄▄▄▆▅▆▅▅▅▇▇▅▆▆▆▇▇▇▇▆▆▇▇▇▇▆▇▆█▇▇▇█▆▇▇█▇
valid loss,▂▃▄▆█▃▂▄▃▃▂▂▁▂▁▂▂▂▂▂▂▂▄▂▂▂▁▂▄▁▁▁▂▂▁▂▁▂▁▁

0,1
learning rate,0.007
train accuracy,0.90889
train loss,0.02203
valid accuracy,0.81818
valid loss,0.05538





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▄▄▄▄▃▄▆▅▅▆▅▆▅▅▅▆▆▇▇▇▇▇▆▆▇▇▇▇████▇██▇██
train loss,█▇▆▅▆▆▆▅▅▅▅▃▄▄▄▄▄▄▃▃▃▂▃▃▃▃▃▂▃▂▂▂▂▁▂▂▂▂▁▁
valid accuracy,▆▄▁▃▄▃▃▆▄▄▇▆▃▅▆▇▆▅▆▇▆█▅▇▇▆▅▇▆█▄▆▇▆▇▇▆▆▇▆
valid loss,▁▁▇█▃▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▆▂▁▃▁▂▁▁▂▂▅▄▄

0,1
learning rate,0.007
train accuracy,0.90889
train loss,0.02033
valid accuracy,0.7
valid loss,0.15095





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▄▄▃▄▄▄▄▅▅▅▅▆▆▆▆▅▅▆▆▇▇▇▇▇█▇▇█▇██▇█▇▇███
train loss,█▇▆▆▆▅▅▆▅▄▅▄▅▄▄▄▄▄▄▄▄▃▃▃▃▂▂▃▂▂▃▁▁▂▂▂▂▂▁▂
valid accuracy,▁▁▂▄▅▇▅▄▂▄▅▅▆▆▅▄▅▆▅▆▆▄▆▆▆▅▆▆▅▆▇▇▇▇▅█▇▃▆▅
valid loss,▃▂▂▂▁▂▂▃▅▆▃▃▃▅▄▆▂▃▂▅▃▃▄▅▂█▇▆▅▅▃▆▆▄▇▄▄▆▆█

0,1
learning rate,0.007
train accuracy,0.90667
train loss,0.02679
valid accuracy,0.70909
valid loss,0.10624





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▃▃▄▃▃▄▅▄▅▄▅▄▅▅▅▆▅▆▆▆▅▆▆▆▆▆▇▇█▇█▇███▇██▇
train loss,█▇▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▃▃▄▄▃▃▄▃▃▂▂▂▃▁▃▂▁▁▂▁▁▂
valid accuracy,▄▄▂▁▂▆▄▆▃▅▅▆▆▆▅▅▆▆▅▅▆█▆▅▇▆▇▅▇▇▆▃▆▆▇█▆▆▇▇
valid loss,▃▃▄█▆▃▅▃▄▂▃▃▄▃▃▂▂▂▂▃▂▂▂▃▂▂▂▃▃▂▃▄▂▂▂▁▂▂▂▂

0,1
learning rate,0.007
train accuracy,0.9
train loss,0.02782
valid accuracy,0.78182
valid loss,0.04592





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▃▃▄▃▄▅▅▅▅▅▅▆▆▅▅▆▇▇▇▆▆▆▇▆▆▇▇▇▆▇▇▇▇▇▇▇██
train loss,█▇▇▆▆▆▅▅▄▅▆▅▅▄▄▄▄▄▃▃▃▄▄▄▂▃▃▃▂▃▃▂▂▂▂▂▂▂▁▁
valid accuracy,▂▅▁▆▃▆▆▅▆▆▄▄▄▇█▆▆▆▅▇▆▆▇▇█▆▇█▇▅▇▆█▇▆▇▇▅▇▇
valid loss,▂▂█▆▆▂▂▂▂▂▂▃▂▁▁▂▂▂▂▂▁▂▁▁▁▂▂▁▁▃▃▂▁▁▂▂▃▂▃▃

0,1
learning rate,0.007
train accuracy,0.90444
train loss,0.02536
valid accuracy,0.75455
valid loss,0.07519





0,1
learning rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train accuracy,▁▂▄▄▄▄▃▄▃▄▄▅▅▅▅▆▅▅▅▅▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇██▇█
train loss,█▇▆▆▅▅▆▆▆▅▅▅▄▄▅▄▅▅▄▄▃▄▃▃▂▂▃▂▃▃▂▂▂▂▂▃▂▂▂▁
valid accuracy,▄▁▂▂▂▄▅▆▃▂▁▅▅▄▆▆▅▅▆▅▇▆▇▇▇█▇▇▆▇▇▆▇▇▆█▆▇▇▅
valid loss,▂▂▄▂▃▄▃▃▃█▄▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▁▁▂

0,1
learning rate,0.007
train accuracy,0.93333
train loss,0.01759
valid accuracy,0.7
valid loss,0.07121


In [8]:
print(train_acc)

[0.9288888888888889, 0.9133333333333333, 0.92, 0.8866666666666667, 0.92, 0.9133333333333333, 0.8755555555555555, 0.9355555555555556, 0.9111111111111111, 0.9222222222222223, 0.9022222222222223, 0.9088888888888889, 0.9222222222222223, 0.9022222222222223, 0.9088888888888889, 0.9088888888888889, 0.9066666666666666, 0.9, 0.9044444444444445, 0.9333333333333333]


In [9]:
print(valid_acc)

[0.7636363636363637, 0.7272727272727273, 0.8090909090909091, 0.7818181818181819, 0.7909090909090909, 0.7727272727272727, 0.8272727272727273, 0.7545454545454545, 0.7818181818181819, 0.7727272727272727, 0.8272727272727273, 0.7181818181818181, 0.7454545454545455, 0.6454545454545455, 0.8181818181818182, 0.7, 0.7090909090909091, 0.7818181818181819, 0.7545454545454545, 0.7]


In [11]:
hidden=2048

alpha=0.66
dropout=0.216
lr=0.01


In [None]:
train_acc=[]
valid_acc=[]
for valid_seed in range(20):
    trial=Objective(arch,root,batch,epochs,device,hidden,valid_seed,alpha,lr,dropout)
    train,valid=trial()
    train_acc.append(train)
    valid_acc.append(valid)

In [None]:
print(train_acc)

In [None]:
print(valid_acc)

In [None]:
train_acc,valid_acc=test_run()

In [None]:
valid_acc

In [None]:
pre_transform = T.Compose([T.GCNNorm(), T.ToSparseTensor()])
dataset = TCGAData.TCGADataset(
    root=root,
    files=files,
    label_mapping=label_mapping,
    gene_graph="brain.geneSymbol.gz",
    pre_transform=pre_transform,
    suffix="sparse",valid_seed=0
)


train_loader = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.train_idx),
    drop_last=True,
)
valid_loader = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.valid_idx),
    drop_last=True,
)

In [None]:
model = GCN2Net(
    hidden_channels=128,
    num_layers=4,
    alpha=0.5,
    theta=1.0,
    shared_weights=False,
    dropout=0.2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = F.nll_loss


def train(epoch, report=True):
    model.train()

    if epoch == 30:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.5

    if epoch == 60:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.1

    total_loss = 0
    correct = 0
    num_samps = 0
    for data in train_loader:
        if not parall:
            data = data.to(device)
        optimizer.zero_grad()

        output = model(data)
        output = output.squeeze()

        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y

        if len(output.shape) == 1:
            output = output.unsqueeze(0)
        loss = criterion(output, y)

        pred = output.max(1)[1]
        correct += pred.eq(y).sum().item()
        total_loss += loss
        loss.backward()
        optimizer.step()
        num_samps += len(y)
    if report:
        print(
            "Epoch: {:02d}, Loss: {:.3g}, Train Acc: {:.4f}".format(
                epoch, total_loss / num_samps, correct / num_samps
            )
        )

    return total_loss / num_samps, correct / num_samps


def valid():
    model.eval()
    correct = 0

    total_loss = 0
    num_samps = 0
    for data in valid_loader:
        if not parall:
            data = data.to(device)
        output = model(data)
        output = output.squeeze()

        pred = output.max(1)[1]
        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y
        loss = criterion(output, y)
        total_loss += loss.item()

        correct += pred.eq(y).sum().item()
        num_samps += len(y)
    return total_loss / num_samps, correct / num_samps


train_losses = []
train_acces = []
valid_acces = []
valid_losses = []
for epoch in range(1, 101):
    report = (epoch) % 10 == 0
    train_loss, train_acc = train(epoch, report=report)
    valid_loss, valid_acc = valid()
    train_losses.append(train_loss.cpu().detach().numpy())
    valid_losses.append(valid_loss)
    train_acces.append(train_acc)
    valid_acces.append(valid_acc)
    if report:
        print("valid Loss: {:.3g}, Acc: {:.4f}".format(valid_loss, valid_acc))

plt.figure()
plt.plot(train_acces, label="train acc", linewidth=3)
plt.plot(valid_acces, label="valid acc", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()
plt.plot(train_losses, c="tab:blue", label="train loss", linewidth=3)
plt.plot(valid_losses, c="tab:orange", label="valid loss", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.train_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_train, tpr_train, _ = roc_curve(ys, outs[:, 1])
train_auc = auc(fpr_train, tpr_train)

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.valid_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_valid, tpr_valid, _ = roc_curve(ys, outs[:, 1])
valid_auc = auc(fpr_valid, tpr_valid)

plt.plot(
    fpr_train, tpr_train, lw=2, label="ROC curve (area = %0.3f)" % train_auc,
)
plt.plot(
    fpr_valid, tpr_valid, lw=2, label="ROC curve (area = %0.3f)" % valid_auc,
)
plt.plot([0, 1], [0, 1], color="black", lw=1, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
model = GCN2Net(
    hidden_channels=2048,
    num_layers=4,
    alpha=0.5,
    theta=1.0,
    shared_weights=False,
    dropout=0.2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = F.nll_loss


def train(epoch, report=True):
    model.train()

    if epoch == 30:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.5

    if epoch == 60:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.1

    total_loss = 0
    correct = 0
    num_samps = 0
    for data in train_loader:
        if not parall:
            data = data.to(device)
        optimizer.zero_grad()

        output = model(data)
        output = output.squeeze()

        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y

        if len(output.shape) == 1:
            output = output.unsqueeze(0)
        loss = criterion(output, y)

        pred = output.max(1)[1]
        correct += pred.eq(y).sum().item()
        total_loss += loss
        loss.backward()
        optimizer.step()
        num_samps += len(y)
    if report:
        print(
            "Epoch: {:02d}, Loss: {:.3g}, Train Acc: {:.4f}".format(
                epoch, total_loss / num_samps, correct / num_samps
            )
        )

    return total_loss / num_samps, correct / num_samps


def valid():
    model.eval()
    correct = 0

    total_loss = 0
    num_samps = 0
    for data in valid_loader:
        if not parall:
            data = data.to(device)
        output = model(data)
        output = output.squeeze()

        pred = output.max(1)[1]
        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y
        loss = criterion(output, y)
        total_loss += loss.item()

        correct += pred.eq(y).sum().item()
        num_samps += len(y)
    return total_loss / num_samps, correct / num_samps


train_losses = []
train_acces = []
valid_acces = []
valid_losses = []
for epoch in range(1, 101):
    report = (epoch) % 10 == 0
    train_loss, train_acc = train(epoch, report=report)
    valid_loss, valid_acc = valid()
    train_losses.append(train_loss.cpu().detach().numpy())
    valid_losses.append(valid_loss)
    train_acces.append(train_acc)
    valid_acces.append(valid_acc)
    if report:
        print("valid Loss: {:.3g}, Acc: {:.4f}".format(valid_loss, valid_acc))

plt.figure()
plt.plot(train_acces, label="train acc", linewidth=3)
plt.plot(valid_acces, label="valid acc", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()
plt.plot(train_losses, c="tab:blue", label="train loss", linewidth=3)
plt.plot(valid_losses, c="tab:orange", label="valid loss", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.train_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_train, tpr_train, _ = roc_curve(ys, outs[:, 1])
train_auc = auc(fpr_train, tpr_train)

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.valid_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_valid, tpr_valid, _ = roc_curve(ys, outs[:, 1])
valid_auc = auc(fpr_valid, tpr_valid)

plt.plot(
    fpr_train, tpr_train, lw=2, label="ROC curve (area = %0.3f)" % train_auc,
)
plt.plot(
    fpr_valid, tpr_valid, lw=2, label="ROC curve (area = %0.3f)" % valid_auc,
)
plt.plot([0, 1], [0, 1], color="black", lw=1, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Training on brain data using GCN architecture

# Hyperparameters etc:

arch = "GCN"
batch = 10
parall = False
lr = 0.005


import sys

sys.path.insert(0, "..")
import TCGAData
import torch, torch_geometric.transforms as T, torch.nn.functional as F
import matplotlib.pyplot as plt, numpy as np
from sklearn.metrics import roc_auc_score, roc_curve, auc
from torch_geometric.loader import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from arch.net import *

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = "cpu"

root = "/mnt/home/sgolkar/projects/cancer-net/data/brain"
files = "/mnt/home/sgolkar/projects/cancer-net/data/brain/samples.txt"
label_mapping = ["LGG", "GBM"]
dataset = TCGAData.TCGADataset(
    root=root,
    files=files,
    label_mapping=label_mapping,
    gene_graph="brain.geneSymbol.gz",
)

train_loader = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.train_idx),
    drop_last=True,
)
valid_loader = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.valid_idx),
    drop_last=True,
)


model = GCNNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = F.nll_loss


def train(epoch, report=True):
    model.train()

    if epoch == 30:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.5

    if epoch == 60:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * 0.1

    total_loss = 0
    correct = 0
    num_samps = 0
    for data in train_loader:
        if not parall:
            data = data.to(device)
        optimizer.zero_grad()

        output = model(data)
        output = output.squeeze()

        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y

        if len(output.shape) == 1:
            output = output.unsqueeze(0)
        loss = criterion(output, y)

        pred = output.max(1)[1]
        correct += pred.eq(y).sum().item()
        total_loss += loss
        loss.backward()
        optimizer.step()
        num_samps += len(y)
    if report:
        print(
            "Epoch: {:02d}, Loss: {:.3g}, Train Acc: {:.4f}".format(
                epoch, total_loss / num_samps, correct / num_samps
            )
        )

    return total_loss / num_samps, correct / num_samps


def valid():
    model.eval()
    correct = 0

    total_loss = 0
    num_samps = 0
    for data in valid_loader:
        if not parall:
            data = data.to(device)
        output = model(data)
        output = output.squeeze()

        pred = output.max(1)[1]
        if parall:
            y = torch.cat([d.y for d in data]).to(output.device)
        else:
            y = data.y
        loss = criterion(output, y)
        total_loss += loss.item()

        correct += pred.eq(y).sum().item()
        num_samps += len(y)
    return total_loss / num_samps, correct / num_samps


train_losses = []
train_acces = []
valid_acces = []
valid_losses = []
for epoch in range(1, 101):
    report = (epoch) % 10 == 0
    train_loss, train_acc = train(epoch, report=report)
    valid_loss, valid_acc = valid()
    train_losses.append(train_loss.cpu().detach().numpy())
    valid_losses.append(valid_loss)
    train_acces.append(train_acc)
    valid_acces.append(valid_acc)
    if report:
        print("valid Loss: {:.3g}, Acc: {:.4f}".format(valid_loss, valid_acc))

plt.figure()
plt.plot(train_acces, label="train acc", linewidth=3)
plt.plot(valid_acces, label="valid acc", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()
plt.plot(train_losses, c="tab:blue", label="train loss", linewidth=3)
plt.plot(valid_losses, c="tab:orange", label="valid loss", linewidth=3)
plt.legend(prop={"size": 16})
plt.xlabel("epoch", fontsize=16)
plt.grid()
plt.show()

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.train_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_train, tpr_train, _ = roc_curve(ys, outs[:, 1])
train_auc = auc(fpr_train, tpr_train)

loader_auc = DataLoader(
    dataset,
    batch_size=batch,
    sampler=SubsetRandomSampler(dataset.valid_idx),
    drop_last=False,
)

outs = []
ys = []
for tb in loader_auc:
    tbc = tb.cuda()
    outs.append(torch.exp(model(tb)).cpu().data.numpy())
    ys.append(tb.y.cpu().numpy())

outs = np.concatenate(outs)
ys = np.concatenate(ys)

fpr_valid, tpr_valid, _ = roc_curve(ys, outs[:, 1])
valid_auc = auc(fpr_valid, tpr_valid)

plt.plot(
    fpr_train, tpr_train, lw=2, label="ROC curve (area = %0.3f)" % train_auc,
)
plt.plot(
    fpr_valid, tpr_valid, lw=2, label="ROC curve (area = %0.3f)" % valid_auc,
)
plt.plot([0, 1], [0, 1], color="black", lw=1, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Looking at the difference between the files under model and under data (Brain data)

# First, looking at the old files files under model folder.

from matplotlib import pyplot as plt
import h5py
import numpy as np
import os
from os.path import isfile, join

dataroot_model = "/mnt/home/sgolkar/projects/cancer/models/CancerGNN-06-01/data/raw"
files_model = [
    join(dataroot_model, f)
    for f in os.listdir(dataroot_model)
    if isfile(join(dataroot_model, f))
]

cancer_types_model = []
error_samples_model = []
sample_IDs_model = []
nodata_model = []
for file in files_model:
    with h5py.File(file, "r") as f:

        try:
            cancer_types_model.append(f["label"]["sample_meta"]["tumor"][()])
            sample_IDs_model.append(f["label"]["sample_id"][()])
            if "data" not in f.keys():
                nodata_model.append(file)
        except:
            error_samples_model.append(file)
        # num_mutations.append(f['meta']['mutated_gene_list'])

# The reference gene file is the only one that errors out
print("Error files:", error_samples_model)

GBM_num_model = np.sum([el == b"GBM" for el in cancer_types_model])
LGG_num_model = np.sum([el == b"LGG" for el in cancer_types_model])
other_num_model = np.sum([el != b"LGG" and el != b"GBM" for el in cancer_types_model])


print("Num duplicates: {}".format(len(set(sample_IDs_model)) - len(sample_IDs_model)))
print("Num samples without data: {}\n".format(len(nodata_model)))

# Now looking at files under the mutated genes

types = ["GBM", "LGG"]

dataroot = "/mnt/home/sgolkar/projects/cancer-net/data/brain/raw/"

cancer_types = {type: [] for type in types}
num_mutations = {type: [] for type in types}
error_samples = {type: [] for type in types}
sample_IDs = {type: [] for type in types}
nodata = {type: [] for type in types}
for type in types:
    datapath = dataroot + type
    files = [
        join(datapath, f) for f in os.listdir(datapath) if isfile(join(datapath, f))
    ]
    for file in files:
        with h5py.File(file, "r") as f:

            try:
                cancer_types[type].append(f["label"]["sample_meta"]["tumor"][()])
                num_mutations[type].append(len(f["meta"]["mutated_gene_list"][()]))
                sample_IDs[type].append(f["label"]["sample_id"][()])
                if "data" not in f.keys():
                    nodata[type].append(file)
            except:
                error_samples[type].append(file)

# The reference gene file is the only one that errors out
print("Error files: ", error_samples)

# Types of cancer per folder:
print(
    "Cancer types per folder files numbers:",
    {key: len(set(value)) for key, value in cancer_types.items()},
)

print(
    "Duplicates per type:",
    {key: len(set(value)) - len(value) for key, value in sample_IDs.items()},
)

print("Samples without data: ", nodata)


print(
    "Num samples in model not in data: {}".format(
        len(
            set(sample_IDs_model).difference(
                set([el for value in sample_IDs.values() for el in value])
            )
        )
    )
)
print(
    "Num samples in data not in model: {}".format(
        len(
            set([el for value in sample_IDs.values() for el in value]).difference(
                set(sample_IDs_model)
            )
        )
    )
)


[
    plt.hist(value, bins=np.linspace(0, 6000, 150), label=key, alpha=0.6)
    for key, value in num_mutations.items()
]
plt.title("Histogram of # mutations per patient")
plt.legend()
plt.show()

[
    plt.hist(value, bins=np.linspace(0, 150, 50), label=key, alpha=0.6)
    for key, value in num_mutations.items()
]
plt.title("Histogram of # mutations (zoomed)")
plt.legend()
plt.show()


plt.bar(
    ["GBM", "LGG", "other"],
    [GBM_num_model, LGG_num_model, other_num_model],
    label="model folder",
    alpha=0.85,
)
plt.bar(
    cancer_types.keys(),
    [len(el) for el in cancer_types.values()],
    label="data folder",
    alpha=0.85,
)
plt.title("# of patients per cancer type")
plt.legend()
plt.show()

In [None]:
from matplotlib import pyplot as plt
import h5py
import numpy as np


# Loading and demoing the structure of the files

filename = "/mnt/home/sgolkar/projects/cancer-net/data/brain/raw/LGG/TCGA-CS-4942-01A-01D-1468-08.h5"

with h5py.File(filename, "r") as f:

    prom = f["data"]["promoter"][()]
    prot = f["data"]["protein"][()]

plt.imshow(prom)
plt.title("promoter encoding (LGG)")
plt.show()

plt.imshow(prot)
plt.title("protein encoding(LGG)")
plt.show()

# Why are some norms so small?
# Perhaps we can drop the genese were the norm is small for both promoter and the protein?
plt.semilogy(np.abs(prom).sum(1), marker="o", label="promoter")
plt.semilogy(np.abs(prot).sum(1), marker="o", label="protein")
plt.title("Encoding norm (LGG)")
plt.show()


# Another example

filename = "/mnt/home/sgolkar/projects/cancer-net/data/brain/raw/GBM/TCGA-02-0033-01A-01D-1490-08.h5"

with h5py.File(filename, "r") as f:

    prom = f["data"]["promoter"][()]
    prot = f["data"]["protein"][()]

plt.imshow(prom)
plt.title("promoter encoding (GBM)")
plt.show()

plt.imshow(prot)
plt.title("protein encoding(GBM)")
plt.show()

plt.semilogy(np.abs(prom).sum(1), marker="o", label="promoter")
plt.semilogy(np.abs(prot).sum(1), marker="o", label="protein")
plt.title("Encoding norm (GBM)")
plt.show()