# Human Protein Atlas Data Processing

Here we load the Human Protein Atlas data files and process them.
* [Here](http://www.sciencemag.org/content/347/6220/1260419.full) is the paper by Uhlen et al. on the dataset 
* The data was obtained from [proteinatlas.org](http://www.proteinatlas.org/)

In [None]:
%matplotlib inline

In [26]:
import os

import pandas as pd

In [27]:
# Directory holding the Protein Atlas downloads. Set the PROTEIN_ATLAS_DIR
# environment variable to override it; the default is the previously
# hard-coded location.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names with `path + '<file name>'`.
path = os.path.join(
    os.environ.get('PROTEIN_ATLAS_DIR', '/cellar/users/agross/Data/Protein_Atlas/'),
    '',
)

### Cancer File

In [140]:
cancer = pd.read_csv(path + 'cancer.csv')

In [141]:
cancer.head()

Unnamed: 0,Gene,Tumor,Level,Count patients,Total patients,Expression type
0,ENSG00000000003,breast cancer,High,1,12,Staining
1,ENSG00000000003,breast cancer,Medium,7,12,Staining
2,ENSG00000000003,breast cancer,Low,2,12,Staining
3,ENSG00000000003,breast cancer,Not detected,2,12,Staining
4,ENSG00000000003,carcinoid,High,0,4,Staining


In [142]:
cancer['Expression type'].value_counts()

Staining    1329016
dtype: int64

In [143]:
cancer.Level.value_counts()

High            332254
Medium          332254
Low             332254
Not detected    332254
dtype: int64

In [144]:
cancer.Tumor.value_counts()

cervical cancer         66452
liver cancer            66452
pancreatic cancer       66452
glioma                  66452
prostate cancer         66452
breast cancer           66452
endometrial cancer      66452
head and neck cancer    66452
testis cancer           66452
lung cancer             66452
stomach cancer          66452
skin cancer             66452
lymphoma                66452
urothelial cancer       66452
melanoma                66452
thyroid cancer          66452
renal cancer            66452
ovarian cancer          66448
colorectal cancer       66448
carcinoid               66436
dtype: int64

Double-check that, for each gene/tumor pair, the patient counts across the staining levels add up to the 'Total patients' column.

In [152]:
# Sanity check: for every (Gene, Tumor) pair, the patient counts summed over
# the staining levels must equal that pair's 'Total patients' value.
gb = cancer.groupby(['Gene', 'Tumor'])
# Series.all() reduces in one vectorised step; the builtin all() would
# iterate over the Series element by element in Python.
assert (gb['Count patients'].sum() == gb['Total patients'].first()).all(), \
    "'Count patients' does not sum to 'Total patients' for some Gene/Tumor pairs"

In [153]:
# Pivot the long-format table: one row per (Gene, Tumor), one column per
# staining Level, cells holding the patient count.
stacked = (
    cancer
    .set_index(['Gene', 'Tumor', 'Level'])['Count patients']
    .unstack('Level')
)

In [154]:
stacked.head()

Unnamed: 0_level_0,Level,High,Low,Medium,Not detected
Gene,Tumor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,breast cancer,1,2,7,2
ENSG00000000003,carcinoid,0,1,1,2
ENSG00000000003,cervical cancer,11,0,1,0
ENSG00000000003,colorectal cancer,0,2,6,2
ENSG00000000003,endometrial cancer,10,0,2,0


In [155]:
# Rebind `cancer` to the pivoted (Gene, Tumor) x Level table; the raw
# long-format frame is no longer reachable under this name.
# NOTE(review): after this cell runs, re-running the earlier cells that call
# cancer.set_index(['Gene','Tumor','Level']) should fail, because 'Level' is
# no longer a column. Restart the kernel and run top to bottom instead.
cancer = stacked

### Normal Tissue File

In [70]:
normals = pd.read_csv(path + 'normal_tissue.csv')

In [71]:
normals.head()

Unnamed: 0,Gene,Tissue,Cell type,Level,Expression type,Reliability
0,ENSG00000000003,adrenal gland,glandular cells,Not detected,APE,Supportive
1,ENSG00000000003,appendix,glandular cells,Medium,APE,Supportive
2,ENSG00000000003,appendix,lymphoid tissue,Not detected,APE,Supportive
3,ENSG00000000003,bone marrow,hematopoietic cells,Not detected,APE,Supportive
4,ENSG00000000003,breast,adipocytes,Not detected,APE,Supportive


In [46]:
normals['Expression type'].value_counts()

APE    1319440
dtype: int64

In [75]:
# 'Expression type' holds a single value (APE) for every row, so it carries no
# information. Drop it with errors='ignore' so the cell can be re-run safely;
# a plain `del` raises KeyError the second time.
normals = normals.drop('Expression type', axis=1, errors='ignore')

In [49]:
normals['Reliability'].value_counts()

Uncertain     905350
Supportive    414090
dtype: int64

In [52]:
normals['Level'].value_counts()

Not detected    526361
Medium          380303
Low             251843
High            160933
dtype: int64

In [56]:
normals.Tissue.unique()

array(['adrenal gland', 'appendix', 'bone marrow', 'breast', 'bronchus',
       'cerebellum', 'cerebral cortex', 'cervix, uterine', 'colon',
       'duodenum', 'endometrium 1', 'endometrium 2', 'epididymis',
       'esophagus', 'fallopian tube', 'gallbladder', 'heart muscle',
       'hippocampus', 'kidney', 'lateral ventricle', 'liver', 'lung',
       'lymph node', 'nasopharynx', 'oral mucosa', 'ovary', 'pancreas',
       'parathyroid gland', 'placenta', 'prostate', 'rectum',
       'salivary gland', 'seminal vesicle', 'skeletal muscle', 'skin 1',
       'skin 2', 'small intestine', 'smooth muscle', 'soft tissue 1',
       'soft tissue 2', 'spleen', 'stomach 1', 'stomach 2', 'testis',
       'thyroid gland', 'tonsil', 'urinary bladder', 'vagina'], dtype=object)

In [59]:
normals['Cell type'].unique()

array(['glandular cells', 'lymphoid tissue', 'hematopoietic cells',
       'adipocytes', 'myoepithelial cells', 'respiratory epithelial cells',
       'cells in granular layer', 'cells in molecular layer',
       'Purkinje cells', 'endothelial cells', 'glial cells',
       'neuronal cells', 'neuropil', 'squamous epithelial cells',
       'peripheral nerve/ganglion', 'cells in endometrial stroma',
       'myocytes', 'cells in glomeruli', 'cells in tubules',
       'bile duct cells', 'hepatocytes', 'macrophages', 'pneumocytes',
       'germinal center cells', 'non-germinal center cells',
       'ovarian stroma cells', 'exocrine glandular cells',
       'islets of Langerhans', 'decidual cells', 'trophoblastic cells',
       'fibroblasts', 'keratinocytes', 'Langerhans', 'melanocytes',
       'epidermal cells', 'smooth muscle cells', 'peripheral nerve',
       'cells in red pulp', 'cells in white pulp',
       'cells in seminiferous ducts', 'Leydig cells', 'urothelial cells',
       'follic

In [63]:
normals[['Tissue','Cell type']].drop_duplicates().shape

(83, 2)

### Subcellular location

In [78]:
loc = pd.read_csv(path + 'subcellular_location.csv')

In [97]:
loc.head()

Unnamed: 0,Gene,Main location,Other location,Expression type,Reliability
0,ENSG00000000003,Cytoplasm,,APE,Uncertain
1,ENSG00000000457,Cytoskeleton (Microtubules),Nucleus but not nucleoli;Golgi apparatus,APE,Uncertain
2,ENSG00000000460,Nucleus but not nucleoli;Mitochondria,,APE,Uncertain
3,ENSG00000001036,Nucleus but not nucleoli;Mitochondria,,APE,Uncertain
4,ENSG00000001084,Nucleus;Nucleoli,Cytoplasm,APE,Supportive


In [79]:
loc['Expression type'].value_counts()

APE         8656
Staining     201
dtype: int64

In [81]:
loc.Reliability.value_counts()

Uncertain         4499
Supportive        4355
Non-supportive       3
dtype: int64

In [86]:
loc['Main location'].value_counts().head(10)

Nucleus but not nucleoli              2050
Cytoplasm                             1282
Nucleus                                948
Mitochondria                           488
Vesicles                               482
Nucleus but not nucleoli;Cytoplasm     328
Nucleoli                               322
Plasma membrane;Cytoplasm              268
Nucleus;Nucleoli                       240
Golgi apparatus                        227
dtype: int64

### RNA

In [88]:
rna = pd.read_csv(path + 'rna.csv')

In [98]:
rna.head()

Unnamed: 0,Gene,Sample,Value,Abundance
0,ENSG00000000003,A-431,21.3,Medium
1,ENSG00000000003,A549,32.5,Medium
2,ENSG00000000003,AN3-CA,38.2,Medium
3,ENSG00000000003,BEWO,31.4,Medium
4,ENSG00000000003,CACO-2,63.9,High


In [94]:
rna.Unit.value_counts()

FPKM    1546127
dtype: int64

In [95]:
# Every row uses the same unit (FPKM), so the column is redundant.
# Drop it with errors='ignore' so the cell can be re-run safely; a plain `del`
# raises KeyError the second time.
rna = rna.drop('Unit', axis=1, errors='ignore')

In [108]:
# Row count and mean expression value for each abundance call, ordered by mean.
# A list of aggregations replaces the dict-renaming form of agg(), and
# sort_values() replaces DataFrame.sort(); both older forms were removed from
# pandas. The resulting columns are still 'count' and 'mean', in that order.
rna.groupby('Abundance')['Value'].agg(['count', 'mean']).sort_values('mean')

Unnamed: 0_level_0,count,mean
Abundance,Unnamed: 1_level_1,Unnamed: 2_level_1
Not detected,610736,0.138966
Low,476583,6.894431
Medium,312896,26.212112
High,145912,179.379383


In [112]:
# Reshape the long RNA table into a Gene x Sample matrix of expression values,
# then show its dimensions.
rna_df = (
    rna
    .set_index(['Gene', 'Sample'])['Value']
    .unstack('Sample')
)
rna_df.shape

(20344, 76)

### Metadata

In [115]:
# Load the tab-separated per-gene annotation table.
meta = pd.read_csv(path + 'proteinatlas.tab', sep='\t')

In [119]:
meta.head(3).T

Unnamed: 0,0,1,2
Gene,TSPAN6,TNMD,DPM1
Gene synonym,"T245, TM4SF6, TSPAN-6","BRICD4, ChM1L, myodulin, TEM, tendin","CDGIE, MPDS"
Ensembl,ENSG00000000003,ENSG00000000005,ENSG00000000419
Gene description,Tetraspanin 6,Tenomodulin,Dolichyl-phosphate mannosyltransferase polypep...
Chromosome,X,X,20
Position,99883667-99894988,99839799-99854882,49551404-49575092
Protein class,"Predicted membrane proteins, Protein evidence ...",Predicted membrane proteins,"Disease related genes, Enzymes, Plasma protein..."
Evidence summary,Evidence at protein level,Evidence at transcript level,Evidence at protein level
HPA evidence,Evidence at protein level,Evidence at transcript level,Evidence at transcript level
UniProt evidence,Evidence at protein level,Evidence at transcript level,Evidence at protein level


In [122]:
meta.Gene.value_counts().value_counts()

1    20181
2       86
3        1
dtype: int64

In [126]:
# Lookup Series: Ensembl gene ID (index) -> gene symbol (value).
# Gene symbols are not unique in `meta` (a few occur two or three times),
# so several Ensembl IDs can map to the same symbol.
mapping = meta.set_index(keys='Ensembl').loc[:, 'Gene']

In [134]:
meta['Subcellular location'].value_counts().head()

Nucleus but not nucleoli               1234
Cytoplasm                               846
Nucleus but not nucleoli, Cytoplasm     675
Nucleus                                 489
Mitochondria                            363
dtype: int64

### Mapping tumor to normals

In [189]:
tn_map = {'breast cancer': [('breast','glandular cells')],
          'carcinoid': [('pancreas','islets of Langerhans')],
          'cervical cancer': [('cervix, uterine','squamous epithelial cells'),
                              ('cervix, uterine','glandular cells')],
          'colorectal cancer': [('colon','glandular cells'),
                                ('rectum','glandular cells')],
          'endometrial cancer': [('endometrium 1', 'glandular cells'),
                                 ('endometrium 2', 'glandular cells')],
          'glioma':[('cerebral cortex', 'glial cells')],
          'head and neck cancer': [('oral mucosa', 'squamous epithelial cells'),
                                   ('tonsil', 'squamous epithelial cells'),
                                   ('salivary gland', 'glandular cells')],
          'liver cancer': [('liver', 'bile duct cells'),
                           ('liver', 'hepatocytes')],
          'lung cancer': [('bronchus', 'respiratory epithelial cells'),
                          ('lung', 'pneumocytes')],
          'lymphoma': [('lymph node', 'germinal center cells'),
                       ('lymph node', 'non-germinal center cells')],
          'melanoma': [('skin 1', 'melanocytes')],
          'pancreatic cancer': [('pancreas', 'exocrine glandular cells')],
          'prostate cancer': [('prostate', 'glandular cells')],
          'renal cancer': [('kidney', 'cells in tubules')],
          'skin cancer': [('skin 1', 'keratinocytes')],
          'stomach cancer': [('stomach 1', 'glandular cells'),
                             ('stomach 2', 'glandular cells')],
          'testis cancer': [('testis', 'cells in seminiferous ducts')],
          'urothelial cancer': [('urinary bladder', 'urothelial cells')]}                                  
 

In [210]:
cancer.index.get_level_values('Tumor').unique()

array(['breast cancer', 'carcinoid', 'cervical cancer',
       'colorectal cancer', 'endometrial cancer', 'glioma',
       'head and neck cancer', 'liver cancer', 'lung cancer', 'lymphoma',
       'melanoma', 'ovarian cancer', 'pancreatic cancer',
       'prostate cancer', 'renal cancer', 'skin cancer', 'stomach cancer',
       'testis cancer', 'thyroid cancer', 'urothelial cancer'], dtype=object)

In [138]:
# Distinct (Tissue, Cell type) pairs in the normal-tissue table, as a NumPy array.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# `.values` is available in both old and current pandas.
normals[['Tissue', 'Cell type']].drop_duplicates().values

array([['adrenal gland', 'glandular cells'],
       ['appendix', 'glandular cells'],
       ['appendix', 'lymphoid tissue'],
       ['bone marrow', 'hematopoietic cells'],
       ['breast', 'adipocytes'],
       ['breast', 'glandular cells'],
       ['breast', 'myoepithelial cells'],
       ['bronchus', 'respiratory epithelial cells'],
       ['cerebellum', 'cells in granular layer'],
       ['cerebellum', 'cells in molecular layer'],
       ['cerebellum', 'Purkinje cells'],
       ['cerebral cortex', 'endothelial cells'],
       ['cerebral cortex', 'glial cells'],
       ['cerebral cortex', 'neuronal cells'],
       ['cerebral cortex', 'neuropil'],
       ['cervix, uterine', 'glandular cells'],
       ['cervix, uterine', 'squamous epithelial cells'],
       ['colon', 'endothelial cells'],
       ['colon', 'glandular cells'],
       ['colon', 'peripheral nerve/ganglion'],
       ['duodenum', 'glandular cells'],
       ['endometrium 1', 'cells in endometrial stroma'],
       ['endometri