In [9]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import utils

## Normal tissues

Discrete (4-level) quantification of protein expression.

In [3]:
# Load the tab-separated normal-tissue table.
# NOTE(review): the other loading cells read their paths from `utils.Paths.*`;
# confirm that `utils` still exposes NORMAL_TISSUE_PATH at module level.
df_nt = pd.read_csv(utils.NORMAL_TISSUE_PATH, sep="\t")

# Distinct values of the three identifying columns, kept as sets.
genes, cell_types, tissues = (
    set(df_nt[column]) for column in ("Gene name", "Cell type", "Tissue")
)

# One print call with a newline separator emits the same four summary lines.
print(
    f"Shape: {df_nt.shape}",
    f"Num genes: {len(genes)}",
    f"Num cell types: {len(cell_types)}",
    f"Num tissues: {len(tissues)}",
    sep="\n",
)

# To inspect the share of each expression level, run:
#   df_nt['Level'].value_counts() / df_nt.shape[0]

# Preview the first ten rows as the cell's rich output.
df_nt.head(10)

Shape: (1193218, 6)
Num genes: 15318
Num cell types: 141
Num tissues: 63


Unnamed: 0,Gene,Gene name,Tissue,Cell type,Level,Reliability
0,ENSG00000000003,TSPAN6,adipose tissue,adipocytes,Not detected,Approved
1,ENSG00000000003,TSPAN6,adrenal gland,glandular cells,Not detected,Approved
2,ENSG00000000003,TSPAN6,appendix,glandular cells,Medium,Approved
3,ENSG00000000003,TSPAN6,appendix,lymphoid tissue,Not detected,Approved
4,ENSG00000000003,TSPAN6,bone marrow,hematopoietic cells,Not detected,Approved
5,ENSG00000000003,TSPAN6,breast,adipocytes,Not detected,Approved
6,ENSG00000000003,TSPAN6,breast,glandular cells,High,Approved
7,ENSG00000000003,TSPAN6,breast,myoepithelial cells,Not detected,Approved
8,ENSG00000000003,TSPAN6,bronchus,respiratory epithelial cells,High,Approved
9,ENSG00000000003,TSPAN6,caudate,glial cells,Not detected,Approved


## Protein atlas

RNA: nTPM values (see the `... [nTPM]` columns, e.g. `Single Cell Type RNA - T-cells [nTPM]`).

Protein: No data

In [4]:
# Load the tab-separated Protein Atlas table.
df_pa = pd.read_csv(utils.Paths.PROTEINATLAS_PATH, sep="\t")

# Report the dimensions, then the column index, each on its own line(s).
print(f"Shape: {df_pa.shape}", df_pa.columns, sep="\n")

# Preview the first ten rows as the cell's rich output.
df_pa.head(10)

Shape: (20090, 313)
Index(['Gene', 'Gene synonym', 'Ensembl', 'Gene description', 'Uniprot',
       'Chromosome', 'Position', 'Protein class', 'Biological process',
       'Molecular function',
       ...
       'Single Cell Type RNA - Smooth muscle cells [nTPM]',
       'Single Cell Type RNA - Spermatocytes [nTPM]',
       'Single Cell Type RNA - Spermatogonia [nTPM]',
       'Single Cell Type RNA - Squamous epithelial cells [nTPM]',
       'Single Cell Type RNA - Suprabasal keratinocytes [nTPM]',
       'Single Cell Type RNA - Syncytiotrophoblasts [nTPM]',
       'Single Cell Type RNA - T-cells [nTPM]',
       'Single Cell Type RNA - Theca cells [nTPM]',
       'Single Cell Type RNA - Undifferentiated cells [nTPM]',
       'Single Cell Type RNA - Urothelial cells [nTPM]'],
      dtype='object', length=313)


Unnamed: 0,Gene,Gene synonym,Ensembl,Gene description,Uniprot,Chromosome,Position,Protein class,Biological process,Molecular function,...,Single Cell Type RNA - Smooth muscle cells [nTPM],Single Cell Type RNA - Spermatocytes [nTPM],Single Cell Type RNA - Spermatogonia [nTPM],Single Cell Type RNA - Squamous epithelial cells [nTPM],Single Cell Type RNA - Suprabasal keratinocytes [nTPM],Single Cell Type RNA - Syncytiotrophoblasts [nTPM],Single Cell Type RNA - T-cells [nTPM],Single Cell Type RNA - Theca cells [nTPM],Single Cell Type RNA - Undifferentiated cells [nTPM],Single Cell Type RNA - Urothelial cells [nTPM]
0,TSPAN6,"T245, TM4SF6, TSPAN-6",ENSG00000000003,Tetraspanin 6,O43657,X,100627108-100639991,"Predicted intracellular proteins, Predicted me...",,,...,43.1,20.9,32.4,23.3,5.4,13.5,2.5,49.4,94.3,89.7
1,TNMD,"BRICD4, ChM1L, myodulin, TEM, tendin",ENSG00000000005,Tenomodulin,Q9H2S6,X,100584936-100599885,Predicted membrane proteins,,,...,3.2,0.0,0.1,0.0,1.0,0.0,0.3,0.7,0.6,0.0
2,DPM1,"CDGIE, MPDS",ENSG00000000419,Dolichyl-phosphate mannosyltransferase subunit...,O60762,20,50934867-50958555,"Disease related genes, Enzymes, Human disease ...",,"Glycosyltransferase, Transferase",...,71.5,109.9,103.6,32.0,105.6,245.4,47.8,36.5,55.3,125.9
3,SCYL3,"PACE-1, PACE1",ENSG00000000457,SCY1 like pseudokinase 3,Q8IZE3,1,169849631-169894267,"Enzymes, Predicted intracellular proteins",,,...,5.4,10.9,6.8,8.3,1.8,11.6,11.0,6.3,10.3,3.3
4,C1orf112,FLJ10706,ENSG00000000460,Chromosome 1 open reading frame 112,Q9NSG2,1,169662007-169854080,Predicted intracellular proteins,,,...,2.6,37.1,20.3,2.8,1.0,3.8,3.3,2.2,2.6,2.4
5,FGR,"c-fgr, p55c-fgr, SRC2",ENSG00000000938,"FGR proto-oncogene, Src family tyrosine kinase",P09769,1,27612064-27635185,"Disease related genes, Enzymes, Metabolic prot...","Immunity, Innate immunity","Kinase, Transferase, Tyrosine-protein kinase",...,0.6,0.2,0.5,0.5,1.1,1.0,28.0,0.0,0.0,2.5
6,CFH,"ARMD4, ARMS1, FHL1, HF, HF1, HF2, HUS",ENSG00000000971,Complement factor H,P08603,1,196652043-196747504,"Cancer-related genes, Disease related genes, H...","Complement alternate pathway, Host-virus inter...",,...,59.6,1.4,1.0,10.9,38.2,0.1,13.2,26.3,0.0,61.5
7,FUCA2,"dJ20N2.5, MGC1314",ENSG00000001036,Alpha-L-fucosidase 2,Q9BTY2,6,143494812-143511720,"Enzymes, Metabolic proteins, Plasma proteins, ...",,"Glycosidase, Hydrolase",...,35.0,4.8,4.6,13.2,8.0,46.4,11.5,78.8,50.4,18.4
8,GCLC,"GCS, GLCL, GLCLC",ENSG00000001084,Glutamate-cysteine ligase catalytic subunit,P48506,6,53497341-53616970,"Disease related genes, Enzymes, Human disease ...",Glutathione biosynthesis,Ligase,...,13.5,3.1,9.2,72.6,24.9,7.8,14.9,26.6,14.8,144.8
9,NFYA,"CBF-B, HAP2, NF-YA",ENSG00000001167,Nuclear transcription factor Y subunit alpha,P23511,6,41072945-41099976,"Predicted intracellular proteins, Transcriptio...","Biological rhythms, Transcription, Transcripti...","Activator, DNA-binding",...,8.3,20.0,8.2,6.3,15.9,10.8,10.1,5.6,7.6,18.7


## rna_single_cell_read_count.tsv

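RNA: raw read counts, one row per cell (`Tissue`, `Cell`, `Cluster`) and one column per Ensembl gene ID.

Protein: No data (all displayed columns are cell metadata or per-gene read counts).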


In [7]:
# Only the first 100 rows are read, so the row count printed below reflects
# this limit rather than the size of the file.
df_rc = pd.read_csv(
    utils.Paths.RNA_READCOUNTS_PATH,
    sep="\t",
    nrows=100,
)

print(f"Shape: {df_rc.shape}")

# Preview the first ten rows as the cell's rich output.
df_rc.head(10)

Shape: (100, 20093)


Unnamed: 0,Tissue,Cell,Cluster,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,...,ENSG00000288656,ENSG00000288658,ENSG00000288661,ENSG00000288671,ENSG00000288675,ENSG00000288677,ENSG00000288678,ENSG00000288681,ENSG00000288684,ENSG00000288695
0,Adipose,1,0,0,0,0,0,0,0,8,...,0,0,0,0,0,0,0,0,0,0
1,Adipose,2,0,0,0,0,0,0,0,47,...,0,0,0,0,0,0,0,0,0,0
2,Adipose,3,0,0,0,0,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
3,Adipose,4,0,0,1,0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0
4,Adipose,5,4,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5,Adipose,6,4,0,0,2,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
6,Adipose,7,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Adipose,8,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,Adipose,9,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,Adipose,10,4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## rna_single_cell_type_tissue.tsv

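RNA: `Read count` and `pTPM` per gene, tissue, and cluster (each cluster is annotated with a `Cell type`).

Protein: No data (the table has no protein-level columns).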


In [8]:
# Load the whole tab-separated table (no row limit is applied here).
df_ts = pd.read_csv(
    utils.Paths.RNA_TISSUES_PATH,
    sep="\t",
)

print(f"Shape: {df_ts.shape}")

# Preview the first ten rows as the cell's rich output.
df_ts.head(10)

Shape: (8919960, 7)


Unnamed: 0,Gene,Gene name,Tissue,Cluster,Cell type,Read count,pTPM
0,ENSG00000000003,TSPAN6,adipose tissue,c-0,fibroblasts,78262533,19.1
1,ENSG00000000003,TSPAN6,adipose tissue,c-1,t-cells,34511283,0.3
2,ENSG00000000003,TSPAN6,adipose tissue,c-2,t-cells,24912790,0.2
3,ENSG00000000003,TSPAN6,adipose tissue,c-3,t-cells,25114739,0.3
4,ENSG00000000003,TSPAN6,adipose tissue,c-4,fibroblasts,35989587,21.8
5,ENSG00000000003,TSPAN6,adipose tissue,c-5,macrophages,63530316,0.1
6,ENSG00000000003,TSPAN6,adipose tissue,c-6,dendritic cells,17587269,0.2
7,ENSG00000000003,TSPAN6,adipose tissue,c-7,fibroblasts,36571091,27.4
8,ENSG00000000003,TSPAN6,adipose tissue,c-8,fibroblasts,31645718,18.8
9,ENSG00000000003,TSPAN6,adipose tissue,c-9,t-cells,14818488,0.1
