In [35]:
import os
import sys
# Make modules located one directory above the working directory importable.
sys.path.append("..")

import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size (width, height) applied to every plot drawn after this call.
sns.set(rc={'figure.figsize':(18.7,6.27)})

# notebook settings
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Cells below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import set_matplotlib_formats
# Ask the inline backend for high-resolution ("retina") figures.
# NOTE(review): newer IPython releases may warn that this import location is deprecated — confirm before upgrading.
set_matplotlib_formats('retina')

In [36]:
def getTCGA(disease):
    """
    Build the paths of the TCGA count matrices for the requested cohorts.

    Parameters
    ----------
    disease : str or iterable of str
        TCGA cohort code(s), e.g. ``'PRAD'`` or ``['BRCA', 'LUAD']``.
        A bare string is treated as one cohort instead of being iterated
        character by character.

    Returns
    -------
    list of str
        One path per cohort, in input order. The paths are only formatted;
        nothing here checks that the files exist.
    """
    path = "/srv/nas/mk2/projects/pan-cancer/TCGA_CCLE_GCP/TCGA/TCGA_{}_counts.tsv.gz"
    # Iterating a plain string would yield one bogus path per character.
    if isinstance(disease, str):
        disease = [disease]
    return [path.format(d) for d in disease]


def readGCP(files, biotype='protein_coding', mean=True):
    """
    Read transcript-level count matrices and return one samples-by-genes
    table per input file.

    Parameters
    ----------
    files : iterable of str
        Paths to tab-separated count matrices. The first column holds the
        transcript identifier; the remaining columns are samples. Each
        identifier must split on "|" into nine fields, of which the last is
        discarded and the first eight are named ENST, ENSG, OTTHUMG, OTTHUMT,
        GENE-NUM, GENE, BP and BIOTYPE.
    biotype : str, default 'protein_coding'
        Only transcripts whose BIOTYPE field equals this value are kept.
    mean : bool, default True
        If True, transcripts sharing a GENE name are averaged so that every
        gene appears once. If False, one column per transcript is kept, so
        gene names may repeat.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Keyed by the second "_"-separated token of each file's base name
        (e.g. "PRAD" for "TCGA_PRAD_counts.tsv.gz"). Rows are samples and
        columns are gene names.
    """
    data_dict = {}
    for f in files:
        key = os.path.basename(f).split("_")[1]
        data = pd.read_csv(f, sep='\t', index_col=0)
        # transcript metadata: the trailing field after the final "|" is dropped
        meta = pd.DataFrame([row[:-1] for row in data.index.str.split("|")],
                            columns=['ENST', 'ENSG', 'OTTHUMG', 'OTTHUMT', 'GENE-NUM', 'GENE', 'BP', 'BIOTYPE'])
        data.index = pd.MultiIndex.from_frame(meta)
        # subset transcripts, then keep only the gene name as the row label
        data = data.xs(key=biotype, level='BIOTYPE')
        data = data.droplevel(['ENST', 'ENSG', 'OTTHUMG', 'OTTHUMT', 'GENE-NUM', 'BP'])
        if mean:
            # Average splice variants per gene on the row axis *before*
            # transposing; this avoids the deprecated groupby(axis=1) form
            # while producing the same samples-by-genes result.
            data = data.groupby(level='GENE').mean()
        data_dict[key] = data.T
    return data_dict


def uq_norm(df, q=0.75):
    """
    Scale each row of ``df`` by that row's own ``q``-th quantile.

    With the default ``q=0.75`` this is upper-quartile normalization: every
    value in a row is divided by the row's 75th percentile. The quantile is
    taken over all values in the row, zeros included.
    """
    # One scaling factor per row, computed across the columns.
    row_scale = df.quantile(q, axis="columns")
    # Align the factors on the row index so each row uses its own divisor.
    return df.div(row_scale, axis="index")


def process_TCGA(disease=['BRCA', 'LUAD', 'KIRC', 'THCA', 'PRAD', 'SKCM']):
    """
    Load, combine and normalize TCGA expression for the given cohorts.

    The per-cohort tables from ``readGCP`` (splice variants averaged per
    gene) are stacked row-wise, passed through ``uq_norm`` and then
    ``log1p``-transformed.

    Returns
    -------
    (tcga, tcga_meta)
        ``tcga`` is the normalized, log-transformed table. ``tcga_meta`` is
        the ID-map file read as-is; it is not filtered to ``disease``.
    """
    base = "/srv/nas/mk2/projects/pan-cancer/TCGA_CCLE_GCP"
    # Locate the count matrices for the requested cohorts.
    count_paths = getTCGA(disease)
    # Sample metadata, then the count tables themselves.
    id_map_path = os.path.join(base, "TCGA/TCGA_GDC_ID_MAP.tsv")
    tcga_meta = pd.read_csv(id_map_path, sep="\t")
    per_cohort = readGCP(count_paths, mean=True)
    # Stack all cohorts into one table.
    stacked = pd.concat(per_cohort.values())
    # Upper-quartile scaling followed by log(1 + x).
    scaled = uq_norm(stacked)
    tcga = scaled.transform(np.log1p)
    return tcga, tcga_meta

In [37]:
# Load the tab-separated UniProt annotation table and preview its first rows.
# NOTE(review): absolute, user-specific path — this cell only runs where that location is available.
uniprot = pd.read_csv("/srv/home/wconnell/keiser/data/uniprot_mapping_ids/TCGA_rnaseq_uniprot_features.tab.gz", sep="\t")
uniprot.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Features,Cross-reference (GeneID),Cross-reference (KEGG),Cross-reference (ExpressionAtlas),Tissue specificity,Gene names (primary ),Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (GO),Gene ontology (molecular function),Gene ontology IDs,Ensembl transcript
0,O43657,TSN6_HUMAN,reviewed,Tetraspanin-6 (Tspan-6) (A15 homolog) (Putativ...,TSPAN6 TM4SF6 UNQ767/PRO1560,Homo sapiens (Human),245,Chain (1); Glycosylation (1); Natural variant ...,7105;,hsa:7105;,O43657;,,TSPAN6,negative regulation of NIK/NF-kappaB signaling...,extracellular exosome [GO:0070062]; integral c...,extracellular exosome [GO:0070062]; integral c...,,GO:0005887; GO:0039532; GO:0043123; GO:0070062...,ENST00000373020;
1,Q9H2S6,TNMD_HUMAN,reviewed,Tenomodulin (TeM) (hTeM) (Chondromodulin-1-lik...,TNMD CHM1L UNQ771/PRO1565,Homo sapiens (Human),317,Alternative sequence (3); Chain (1); Disulfide...,64102;,hsa:64102;,,TISSUE SPECIFICITY: Highly expressed in hypova...,TNMD,cellular response to BMP stimulus [GO:0071773]...,cytoplasm [GO:0005737]; integral component of ...,cytoplasm [GO:0005737]; integral component of ...,,GO:0001886; GO:0001937; GO:0005635; GO:0005737...,ENST00000373031 [Q9H2S6-1];
2,O60762,DPM1_HUMAN,reviewed,Dolichol-phosphate mannosyltransferase subunit...,DPM1,Homo sapiens (Human),260,Chain (1); Initiator methionine (1); Modified ...,8813;,hsa:8813;,O60762;,,DPM1,dolichol metabolic process [GO:0019348]; GPI a...,dolichol-phosphate-mannose synthase complex [G...,dolichol-phosphate-mannose synthase complex [G...,dolichyl-phosphate beta-D-mannosyltransferase ...,GO:0004169; GO:0004582; GO:0005634; GO:0005783...,ENST00000371588;
3,Q8IZE3,PACE1_HUMAN,reviewed,Protein-associating with the carboxyl-terminal...,SCYL3 PACE1,Homo sapiens (Human),742,Alternative sequence (1); Chain (1); Domain (1...,57147;,hsa:57147;,Q8IZE3;,TISSUE SPECIFICITY: Ubiquitously expressed. {E...,SCYL3,cell migration [GO:0016477]; cellular protein ...,cytoplasm [GO:0005737]; Golgi apparatus [GO:00...,cytoplasm [GO:0005737]; Golgi apparatus [GO:00...,ATP binding [GO:0005524]; identical protein bi...,GO:0000139; GO:0005524; GO:0005737; GO:0005794...,ENST00000367770 [Q8IZE3-1];ENST00000367771 [Q8...
4,Q9NSG2,CA112_HUMAN,reviewed,Uncharacterized protein C1orf112,C1orf112,Homo sapiens (Human),853,Alternative sequence (2); Chain (1); Modified ...,55732;,hsa:55732;,Q9NSG2;,,C1orf112,,,,,,ENST00000286031 [Q9NSG2-1];ENST00000359326 [Q9...


In [5]:
# Number of distinct biological-process annotation strings (whole cell values, not individual GO terms).
uniprot["Gene ontology (biological process)"].nunique()

13311

In [7]:
# The ten most frequent biological-process annotation strings and how many rows carry each.
uniprot["Gene ontology (biological process)"].value_counts().nlargest(10)

regulation of transcription, DNA-templated [GO:0006355]                                                                                                                                                                                                                                                                                                                                                                                                                       269
G protein-coupled receptor signaling pathway [GO:0007186]                                                                                                                                                                                                                                                                                                                                                                                                                     119
keratinization [GO:0031424]                                         

In [9]:
# Molecular-function annotations: count of distinct annotation strings, then the ten most frequent ones.
uniprot["Gene ontology (molecular function)"].nunique()
uniprot["Gene ontology (molecular function)"].value_counts().nlargest(10)

9322

DNA binding [GO:0003677]; metal ion binding [GO:0046872]                                                                    305
G protein-coupled receptor activity [GO:0004930]; olfactory receptor activity [GO:0004984]                                  276
RNA binding [GO:0003723]                                                                                                    228
calcium ion binding [GO:0005509]                                                                                            152
metal ion binding [GO:0046872]                                                                                              116
identical protein binding [GO:0042802]                                                                                       82
G protein-coupled receptor activity [GO:0004930]; odorant binding [GO:0005549]; olfactory receptor activity [GO:0004984]     80
G protein-coupled receptor activity [GO:0004930]                                                        

In [12]:
# Cellular-component annotations: count of distinct annotation strings, then the twenty most frequent ones.
uniprot["Gene ontology (cellular component)"].nunique()
uniprot["Gene ontology (cellular component)"].value_counts().nlargest(20)

10475

nucleus [GO:0005634]                                                                721
integral component of membrane [GO:0016021]; plasma membrane [GO:0005886]           560
integral component of membrane [GO:0016021]                                         489
cytosol [GO:0005829]                                                                268
extracellular region [GO:0005576]                                                   253
cytoplasm [GO:0005737]                                                              237
cytoplasm [GO:0005737]; nucleus [GO:0005634]                                        167
nuclear chromatin [GO:0000790]; nucleus [GO:0005634]                                154
extracellular region [GO:0005576]; extracellular space [GO:0005615]                 106
extracellular space [GO:0005615]                                                    104
integral component of plasma membrane [GO:0005887]; plasma membrane [GO:0005886]     95
nucleoplasm [GO:0005654]        

In [30]:
# Exact string match: keeps only rows whose entire biological-process annotation is this single term.
# Rows that list the term together with other terms are not selected.
keratin = uniprot[uniprot["Gene ontology (biological process)"] == "keratinization [GO:0031424]"]

In [46]:
# Exact string match: keeps only rows whose entire biological-process annotation is this single term.
# Rows that list the term together with other terms are not selected.
gpcr = uniprot[uniprot["Gene ontology (biological process)"] == "G protein-coupled receptor signaling pathway [GO:0007186]"]

### TCGA data

In [17]:
# Build the normalized, log-transformed expression table for the PRAD cohort only, plus the ID-map table.
tcga, tcga_meta = process_TCGA(disease=['PRAD'])

In [47]:
# Number of TCGA gene columns whose name appears among the GPCR primary gene names,
# divided by the number of GPCR rows.
# NOTE(review): the column label is written with two consecutive spaces; the cell ran, so this appears to be the real label — confirm.
tcga.columns.isin(gpcr['Gene names  (primary )']).sum() / gpcr.shape[0]

0.9831932773109243

In [48]:
# Shape of the expression table restricted to columns named after a GPCR primary gene name.
tcga.loc[:,tcga.columns.isin(gpcr['Gene names  (primary )'])].shape

(558, 117)

In [49]:
# (rows, columns) of the GPCR annotation subset, for comparison with the matched column count.
gpcr.shape

(119, 19)