## Brain data preprocessing

This notebook contains functions that parse GDC files and convert them into matrices of mutation and copy number variation (CNV) for each gene, for each patient. We then combine these matrices into a single tensor of shape [patient, gene, features] with 3 features per gene, each a one-hot (0/1) encoding: mutation, CNV amplification, CNV deletion.

In [None]:
import os
import numpy as np
from collections import defaultdict
from tqdm import tqdm

def read_cnv_mine(manifest_fp, outdir, strip_version_num=True):
    """Collect the copy_number column of every file listed in a manifest.

    Parameters
    ----------
    manifest_fp : path of a tab-separated manifest with (at least) the columns
        ``id``, ``filename`` and ``submitter_id``.
    outdir : directory holding the files, laid out as ``outdir/<id>/<filename>``.
    strip_version_num : when exactly ``True``, every label of the annotation
        index is cut at its first ``.``.

    Returns
    -------
    res : DataFrame with one row per ``submitter_id`` and one column per label
        of ``annot.index``, filled with the ``copy_number`` values of that file.
    annot : DataFrame with the columns gene_id/chromosome/start/end taken from
        the first file read, indexed by gene name.

    For a manifest without rows there is nothing to read, so an empty
    DataFrame and ``None`` are returned.

    The number of filenames containing "GBM" and "LGG" is printed (two lines).
    """
    manifest = pd.read_table(manifest_fp)
    annot = None
    tot = {}
    GBM=0
    LGG=0
    for _, row in tqdm(manifest.iterrows(), total=manifest.shape[0]):
        ## Read the actual file in the manifest
        fp = os.path.join(outdir, row['id'], row['filename'])
        dat = pd.read_table(fp)
        if annot is None:
            ## set_index returns a new frame (and removes gene_name from the
            ## columns), so no selection of `dat` is modified in place
            annot = dat[['gene_id', 'gene_name', 'chromosome', 'start', 'end']].set_index('gene_name')
            if strip_version_num is True:
                ## NOTE(review): the index holds gene_name values here, so the
                ## split acts on gene names rather than on gene_id - confirm
                ## that this is the intended target
                annot.index = [x.split('.')[0] for x in annot.index]
        if "GBM" in row['filename']:
            GBM+=1
        elif "LGG" in row['filename']:
            LGG+=1
        ## Rows sharing a submitter_id overwrite each other; the last one wins
        tot[row['submitter_id']] = dat['copy_number'].to_list()
    print(GBM)
    print(LGG)
    if annot is None:
        ## Empty manifest: no file was read, so there is no gene annotation
        return pd.DataFrame(), None
    res = pd.DataFrame.from_dict(tot, orient='index', columns=annot.index)
    return res, annot


In [None]:
import os
import numpy as np
from collections import defaultdict
from tqdm import tqdm

In [None]:
import pandas as pd
## Load the CNV and SNV manifests, then tag every row with a patient id made
## of the first three dash-separated fields of its submitter_id
manif_cnv=pd.read_table("/scratch/cp3759/cancer-net/brain/gdc_manifest.2024-02-19.GeneLevelCopyNumber.txtwithSubmitterID.txt")
manif_snv=pd.read_table("/scratch/cp3759/cancer-net/brain/gdc_manifest.2024-02-19.MaskedSomaticMutation.txtwithSubmitterID.txt")
for manifest in (manif_cnv,manif_snv):
    manifest["pid"]=["-".join(barcode.split("-")[:3]) for barcode in manifest.submitter_id]
## Number of patient ids that occur in both manifests
shared_pids=set(manif_cnv.pid) & set(manif_snv.pid)
print(len(shared_pids))

In [None]:
## Get CNV - this is working now
## Walk the CNV manifest and build:
##   tot / cnv_table : short patient id -> copy_number values of that file
##   response        : short patient id -> 1 (filename contains "GBM") or 0 ("LGG")
## NOTE(review): several files can map to the same short id; a later file then
## overwrites the earlier one while GBM/LGG still count every file - confirm
## this is acceptable.

annot = None
tot = {}
GBM=0
LGG=0
response={}
for _, row in tqdm(manif_cnv.iterrows(), total=manif_cnv.shape[0]):

    ## Read the actual file in the manifest
    fp = os.path.join("/scratch/cp3759/cancer-net/brain/cnv", row['id'], row['filename'])
    dat = pd.read_table(fp)
    if annot is None:
        ## Gene annotation is taken from the first file only. set_index returns
        ## a new frame (dropping gene_name from the columns), so no selection
        ## of `dat` is modified in place
        annot = dat[['gene_id', 'gene_name', 'chromosome', 'start', 'end']].set_index('gene_name')
    tag=row["submitter_id"].split("-")
    if tag[0]=="TCGA":
        ## Keep rows whose fourth barcode field starts with a number below 10
        ## (presumably the TCGA sample-type code for tumor samples - confirm)
        if int(tag[3][0:2])<10:
            if "GBM" in row['filename']:
                GBM+=1
                resp=1
            elif "LGG" in row['filename']:
                LGG+=1
                resp=0
            else:
                ## Skip if neither LGG or GBM
                continue
            shortid="-".join(tag[0:3])
            response[shortid]=resp
            tot[shortid] = dat['copy_number'].to_list()
cnv_table = pd.DataFrame.from_dict(tot, orient='index', columns=annot.index)

In [None]:
## Collect every gene symbol that appears in at least one SNV file
full_mut_gene_list=set()
for aa in tqdm(range(len(manif_snv))):
    file_path="/".join((manif_snv.loc[aa,"id"],manif_snv.loc[aa,"filename"]))
    mut_genes=pd.read_table("/scratch/cp3759/cancer-net/brain/snv/%s" % file_path,comment="#", low_memory=False)
    full_mut_gene_list.update(set(mut_genes["Hugo_Symbol"]))

## Load pnet genes (they are the column labels of this table)
pnet_genes=pd.read_csv("/home/cp3759/Projects/cancer-net-fresh/data/prostate/P1000_data_CNA_paper.csv")

## Genes present in both the brain mutation data and pnet - anything in the
## cnv file outside this set gets dropped later on
brain_pnet_intersection=set(pnet_genes.columns) & full_mut_gene_list

In [None]:
## Keep a second name for the full CNV table before genes are dropped.
## This binds the same DataFrame object (no copy is made), so any in-place
## operation on cnv_table is also visible through cnv_cache.
cnv_cache=cnv_table

In [None]:
## Drop genes not in the mut or pnet datasets
cnv_genes=list(cnv_table.columns)
diff_genes=set(cnv_genes).difference(brain_pnet_intersection)

## Processed cnv - only genes that are common to both datasets, with missing
## values set to 0. fillna and drop both return new frames here, so the frame
## that cnv_cache still refers to is left untouched.
cnv_table=cnv_table.fillna(0).drop(columns=list(diff_genes))
## Update cnv genes to reduced set (intersection with pnet)
cnv_genes=list(cnv_table.columns)

In [None]:
## Build one-hot encoding table for brain data:
## one row per short patient id, one column per gene in cnv_genes,
## 1 where the gene is listed in that patient's SNV file and 0 elsewhere.
## NOTE(review): unlike the CNV cell, no sample-type filter is applied here,
## and files sharing a short id overwrite each other - confirm this is intended.

## Column position of every gene, computed once instead of searching the list
## for every mutated gene of every file. setdefault keeps the first position
## of a repeated name, which is what list.index would return.
gene_to_col={}
for col,gene_name in enumerate(cnv_genes):
    gene_to_col.setdefault(gene_name,col)

tot={}
for aa in tqdm(range(len(manif_snv))):
    file_path=manif_snv["id"][aa]+"/"+manif_snv["filename"][aa]
    tag=manif_snv["submitter_id"][aa].split("-")
    if tag[0]=="TCGA":
        shortid="-".join(tag[0:3])
        mut_genes=pd.read_table("/scratch/cp3759/cancer-net/brain/snv/%s" % file_path,comment="#", low_memory=False)
        ## Positions of the mutated genes that are also cnv genes
        mut_indices=[gene_to_col[x] for x in set(mut_genes["Hugo_Symbol"]) if x in gene_to_col]
        mut_entries=np.zeros(len(cnv_genes))
        mut_entries[mut_indices]=1
        tot[shortid]=list(mut_entries)
mutation_table = pd.DataFrame.from_dict(tot, orient='index', columns=cnv_genes)

In [None]:
## Next, find the tumor samples (row labels) shared by both tables
cnv_tumors=cnv_table.index.tolist()
mut_tumors=mutation_table.index.tolist()
common_tumors=set(cnv_tumors) & set(mut_tumors)

In [None]:
## Drop the tumor samples (rows) that are not present in both tables
cnv_only=set(cnv_tumors)-common_tumors
mut_only=set(mut_tumors)-common_tumors
cnv_table=cnv_table.drop(index=list(cnv_only))
mutation_table=mutation_table.drop(index=list(mut_only))
## Reorder the rows of the mutation dataset to follow the cnv dataset
mutation_table=mutation_table.reindex(cnv_table.index.tolist())

In [None]:
## Finally need to generate the response vector:
## a single-row table with one column per tumor sample of cnv_table,
## holding the label stored for it in `response`
response_table=pd.DataFrame.from_dict(
    {sample:[response[sample]] for sample in cnv_table.index}
)

In [None]:
## Write the three tables to disk as CSV
for table,csv_path in (
    (cnv_table,"/home/cp3759/Projects/cancer-net-fresh/data/brain/cnv.csv"),
    (mutation_table,"/home/cp3759/Projects/cancer-net-fresh/data/brain/mut.csv"),
    (response_table,"/home/cp3759/Projects/cancer-net-fresh/data/brain/response.csv"),
):
    table.to_csv(csv_path)

In [None]:
## Debug peek at the assembled tensor. data_vector is only assigned by the
## tensor-building cell further down the notebook, so when running top to
## bottom on a fresh kernel it does not exist yet; skip instead of raising
## NameError.
if "data_vector" in globals():
    print(data_vector)

## Load the saved tables. Convert into a single torch tensor that can be used in a dataset

In [8]:
import pandas as pd
import pickle
import torch
from tqdm import tqdm
import numpy as np

In [9]:


def _gene_values(table, gene):
    """Return the `gene` column of `table` as a 1-D float array, one value per row."""
    values=np.asarray(table[gene].values)
    if values.ndim>1:
        ## A repeated column label makes table[gene] two-dimensional;
        ## the copies hold the same value, so keep the first
        values=values[:,0]
    if values.dtype==object:
        ## Individual cells can themselves be sequences of one repeated value;
        ## keep element zero of those and pass plain scalars through
        values=np.array([v if np.ndim(v)==0 else v[0] for v in values],dtype=np.float64)
    return values.astype(np.float64,copy=False)


def tables_to_data_vector(valid_cnv,valid_mut,genes):
    """ Take in matrices of [tumor sample,genes] for both copy number variation
        and mutation, and encode them as 0/1 features.

        valid_cnv, valid_mut: tables indexable by gene name (``gene in table``
            and ``table[gene]``), with the same number of rows.
        genes: ordered gene names; fixes the gene axis of the result.

        Per gene and sample the three features are
            mutation : 1 if the mutation value is > 0
            cnv amp  : 1 if the cnv value is > 1
            cnv del  : 1 if the cnv value is < -1
        A gene that is not a column of a table gets 0 for every sample, and
        the number of such genes is printed for each table.
        NOTE(review): the +/-1 thresholds are applied to the values as given -
        confirm they match the scale of the copy number data.

        Returns a float tensor of shape [sample, 3, gene] with the feature
        order (mutation, cnv amp, cnv del)."""
    ## Start from zeros so genes missing from a table are already encoded
    cnv_amp=torch.zeros((len(valid_cnv),len(genes)))
    cnv_del=torch.zeros((len(valid_cnv),len(genes)))
    missing_count=0
    for aa,gene in enumerate(tqdm(genes)):
        if gene in valid_cnv:
            ## Compare the whole column at once instead of cell by cell
            values=_gene_values(valid_cnv,gene)
            cnv_amp[:,aa]=torch.as_tensor(values>1,dtype=cnv_amp.dtype)
            cnv_del[:,aa]=torch.as_tensor(values<-1,dtype=cnv_del.dtype)
        else:
            missing_count+=1

    print("%d training set genes missing from cnv set" % missing_count)

    mut_matrix=torch.zeros((len(valid_mut),len(genes)))
    missing_count=0
    for aa,gene in enumerate(tqdm(genes)):
        if gene in valid_mut:
            ## Set to one-hot encoding, whether or not the gene is mutated
            values=_gene_values(valid_mut,gene)
            mut_matrix[:,aa]=torch.as_tensor(values>0,dtype=mut_matrix.dtype)
        else:
            ## If the gene is not present, all entries stay 0
            missing_count+=1

    print("%d training set genes missing from mut set" % missing_count)
    ## stack gives [3, sample, gene]; swap to [sample, 3, gene]
    return torch.swapaxes(torch.stack((mut_matrix,cnv_amp,cnv_del)),0,1)


In [10]:
## Reload the three saved tables
cnv_table,mutation_table,response_table=(
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/cp3759/Projects/cancer-net-fresh/data/brain/cnv.csv",
        "/home/cp3759/Projects/cancer-net-fresh/data/brain/mut.csv",
        "/home/cp3759/Projects/cancer-net-fresh/data/brain/response.csv",
    )
)
## Gene names are the column labels of the cnv table, minus the first column
cnv_genes=cnv_table.columns[1:].tolist()

In [18]:
## tables_to_data_vector returns [sample, 3, gene]; swap the last two axes
## before saving, then pickle the tensor
data_vector=tables_to_data_vector(cnv_table,mutation_table,cnv_genes).swapaxes(1,2)
with open('brain_vector.pkl', 'wb') as pkl_file:
    pickle.dump(data_vector, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
print("saved")

100%|██████████| 9243/9243 [01:54<00:00, 80.46it/s]


0 training set genes missing from cnv set


100%|██████████| 9243/9243 [01:06<00:00, 138.39it/s]


0 training set genes missing from mut set
saved


In [20]:
## Show the shape of the assembled tensor.
## NOTE(review): the build cell swaps axes 1 and 2 of the [sample, 3, gene]
## tensor returned by tables_to_data_vector, so a recorded output with 3 on
## axis 1 looks stale relative to that cell - confirm by re-running.
data_vector.shape

torch.Size([868, 3, 9243])

In [2]:
import os
from cancernet import PnetDataSet, ReactomeNetwork
## Initialise the dataset from the prostate data directory
prostate_root = os.path.join("../data", "prostate")
dataset = PnetDataSet(root=prostate_root)



In [16]:
## Display the first entry of dataset.y
dataset.y[0]

tensor([1.])

In [12]:
## Number of gene names in cnv_genes
len(cnv_genes)

9243

In [13]:
import json

## Save the ordered gene names as a JSON list
with open("brain_genes.json", "w") as gene_file:
    json.dump(cnv_genes, gene_file)