<table width="100%" style="border:none">
  <tr>
    <td valign="top">
      <h1>GTEx Preprocessing</h1>
      <ul>
<li>Author: Jeremy Yang
<li>Required: Python3, Pandas
<li>Clean, tidy, reshape RNAseq expression data.
<li>Save aggregated-samples median TPM file for downstream co-expression analysis.
<li>Save expression profiles (exfiles) TPM file for downstream co-expression analysis.
      </ul>
    </td>
    <td align="right">
        <p>UNM Translational Informatics Division</p>
<img style="float:right" width="100" src="https://brand.unm.edu/logos/unm-logo-mark.jpg" alt="UNM Logo" />
    </td>
  </tr>
  </table>
 

In [171]:
import sys,os,re,time,io
import urllib.request
import google.colab
import numpy,scipy
import pandas as pd
print(f"Python: {sys.version.split()[0]}; Pandas: {pd.__version__}; Scipy: {scipy.__version__} ; Numpy: {numpy.__version__}")

Python: 3.6.9; Pandas: 1.1.5; Scipy: 1.4.1 ; Numpy: 1.19.5


In [172]:
!pip install BioClients



In [173]:
import BioClients.ensembl.biomart

### Download subjects datafile:
GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt

In [174]:
url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
subjects = pd.read_csv(url, "\t")
print(f"dataset nrows: {subjects.shape[0]} ; ncols: {subjects.shape[1]}:")

dataset nrows: 980 ; ncols: 4:


In [175]:
subjects.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [176]:
subjects.AGE.value_counts().sort_index()

20-29     84
30-39     78
40-49    153
50-59    315
60-69    317
70-79     33
Name: AGE, dtype: int64

### Download samples datafile:
new: GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt

In [177]:
url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
samples = pd.read_csv(url, sep='\t')
samples = samples[['SAMPID', 'SMATSSCR', 'SMTS', 'SMTSD', 'SMUBRID']]
print("dataset nrows: %d ; ncols: %d:"%(samples.shape[0],samples.shape[1]))

dataset nrows: 22951 ; ncols: 5:


 * SMTS = Tissue Type (parent of SMTSD)
 * SMTSD = Tissue Type, more specific
 * SMATSSCR = Autolysis Score, 0=None, 1=Mild, 2=Moderate, 3=Severe
 * Note that other sample attributes may be suitable for quality criteria.
 * SMUBRID = Uberon ID, anatomical location
 * SUBJID is first two hyphen-delimted fields of SAMPID.

In [178]:
samples['SUBJID'] = samples.SAMPID.str.extract('^([^-]+-[^-]+)-', expand=True)
smtsd_orig = samples.SMTSD.unique()
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,,Blood,Whole Blood,13756,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,,Blood,Whole Blood,13756,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,,Blood,Whole Blood,13756,GTEX-1117F
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F


In [179]:
print("Tissue types: %s"%(str(set(smtsd_orig))))

Tissue types: {'Cells - EBV-transformed lymphocytes', 'Heart - Atrial Appendage', 'Pituitary', 'Adipose - Visceral (Omentum)', 'Stomach', 'Minor Salivary Gland', 'Small Intestine - Terminal Ileum', 'Nerve - Tibial', 'Fallopian Tube', 'Artery - Aorta', 'Brain - Frontal Cortex (BA9)', 'Cervix - Endocervix', 'Kidney - Cortex', 'Whole Blood', 'Cells - Leukemia cell line (CML)', 'Esophagus - Muscularis', 'Kidney - Medulla', 'Ovary', 'Adrenal Gland', 'Artery - Tibial', 'Brain - Substantia nigra', 'Brain - Putamen (basal ganglia)', 'Breast - Mammary Tissue', 'Adipose - Subcutaneous', 'Artery - Coronary', 'Esophagus - Mucosa', 'Muscle - Skeletal', 'Skin - Sun Exposed (Lower leg)', 'Brain - Hypothalamus', 'Brain - Amygdala', 'Uterus', 'Colon - Sigmoid', 'Heart - Left Ventricle', 'Skin - Not Sun Exposed (Suprapubic)', 'Brain - Cortex', 'Thyroid', 'Brain - Cerebellar Hemisphere', 'Brain - Anterior cingulate cortex (BA24)', 'Bladder', 'Esophagus - Gastroesophageal Junction', 'Liver', 'Brain - Hipp

In [180]:
print("Counts: SAMPID: %d; SMTS: %d; SMTSD: %d; SUBJID: %d"%(
      samples.SAMPID.nunique(), samples.SMTS.nunique(), samples.SMTSD.nunique(), samples.SUBJID.nunique()))

Counts: SAMPID: 22951; SMTS: 31; SMTSD: 55; SUBJID: 980


### Remove samples with high degree of autolysis (self-digestion).
The destruction of organism cells or tissues by the organisms’ own enzymes or processes.
0=None, 1=Mild, 2=Moderate, 3=Severe

In [181]:
samples.SMATSSCR.value_counts(dropna=False).sort_index()

0.0     3554
1.0    10410
2.0     1582
3.0      193
NaN     7212
Name: SMATSSCR, dtype: int64

In [182]:
samples = samples[(samples.SMATSSCR != 3) & (samples.SMATSSCR != 2)]
print("Counts: SAMPID: %d; SMTS: %d; SMTSD: %d; SUBJID: %d"%(
      samples.SAMPID.nunique(), samples.SMTS.nunique(), samples.SMTSD.nunique(), samples.SUBJID.nunique()))

Counts: SAMPID: 21176; SMTS: 31; SMTSD: 55; SUBJID: 980


### Clean & tidy cols. 

In [183]:
samples.loc[(samples.SMTS.str.strip() == '') & samples.SMTSD.str.startswith("Skin -"), 'SMTS'] = 'Skin'

In [184]:
(samples.SMTS+" : "+samples.SMTSD).value_counts().sort_index()

Adipose Tissue : Adipose - Subcutaneous                752
Adipose Tissue : Adipose - Visceral (Omentum)          560
Adrenal Gland : Adrenal Gland                          209
Bladder : Bladder                                        8
Blood : Cells - EBV-transformed lymphocytes            192
Blood : Whole Blood                                   3288
Blood Vessel : Artery - Aorta                          444
Blood Vessel : Artery - Coronary                       251
Blood Vessel : Artery - Tibial                         757
Bone Marrow : Cells - Leukemia cell line (CML)         217
Brain : Brain - Amygdala                               177
Brain : Brain - Anterior cingulate cortex (BA24)       213
Brain : Brain - Caudate (basal ganglia)                291
Brain : Brain - Cerebellar Hemisphere                  263
Brain : Brain - Cerebellum                             226
Brain : Brain - Cortex                                 268
Brain : Brain - Frontal Cortex (BA9)                   4

### MERGE samples with subjects:

In [185]:
samples = pd.merge(samples, subjects, how='inner', on='SUBJID')
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F-0003-SM-58Q7G,,Blood,Whole Blood,13756,GTEX-1117F,2,60-69,4.0
1,GTEX-1117F-0003-SM-5DWSB,,Blood,Whole Blood,13756,GTEX-1117F,2,60-69,4.0
2,GTEX-1117F-0003-SM-6WBT7,,Blood,Whole Blood,13756,GTEX-1117F,2,60-69,4.0
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F,2,60-69,4.0
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F,2,60-69,4.0


In [186]:
print(f"Counts: SAMPID: {samples.SAMPID.nunique()}; SMTS: {samples.SMTS.nunique()}; SMTSD: {samples.SMTSD.nunique()}; SUBJID: {samples.SUBJID.nunique()}")

Counts: SAMPID: 21176; SMTS: 31; SMTSD: 55; SUBJID: 980


### Keep only samples from healthier subjects (and remove NAs): 
(DTHHRDY = Hardy Scale)
Death classification based on the 4-point Hardy Scale:
1) Violent and fast death Deaths due to accident, blunt force trauma or suicide, terminal phase estimated at < 10 min. 
2) Fast death of natural causes Sudden unexpected deaths of people who had been reasonably healthy, after a terminal phase estimated at < 1 hr (with sudden death from a myocardial infarction as a model cause of death for this category) 
3) Intermediate death Death after a terminal phase of 1 to 24 hrs (not classifiable as 2 or 4); patients who were ill but death was unexpected 
4) Slow death Death after a long illness, with a terminal phase longer than 1 day (commonly cancer or chronic pulmonary disease); deaths that are not unexpected 
0) Ventilator Case All cases on a ventilator immediately before death.


In [187]:
samples.DTHHRDY.value_counts(sort=True, dropna=False).sort_index()

0.0    11131
1.0      832
2.0     5331
3.0     1030
4.0     2435
NaN      417
Name: DTHHRDY, dtype: int64

In [188]:
samples = samples[samples.DTHHRDY<=2]
samples.DTHHRDY.value_counts(sort=True, dropna=False).sort_index()

0.0    11131
1.0      832
2.0     5331
Name: DTHHRDY, dtype: int64

In [189]:
smtsd_final = samples.SMTSD.unique()
smtsd_lost = set(smtsd_orig) - set(smtsd_final)
print("Tissue types lost: "+str(smtsd_lost))
print(f"Counts: SAMPID: {samples.SAMPID.nunique()}; SMTS: {samples.SMTS.nunique()}; SMTSD: {samples.SMTSD.nunique()}; SUBJID: {samples.SUBJID.nunique()}")

Tissue types lost: {'Cells - Leukemia cell line (CML)'}
Counts: SAMPID: 17294; SMTS: 30; SMTSD: 54; SUBJID: 785


### Clean & tidy:

In [190]:
samples.SEX = samples.SEX.apply(lambda x: 'F' if x==2 else 'M' if x==1 else None)
print(samples.SEX.value_counts(sort=True, dropna=False).sort_index())
if (samples.SEX.isna().sum()>0):
  samples.dropna(subset=['SEX'], inplace=True)
print(f"Counts: SAMPID: {samples.SAMPID.nunique()}; SMTS: {samples.SMTS.nunique()}; SMTSD: {samples.SMTSD.nunique()}; SUBJID: {samples.SUBJID.nunique()}")

F     5761
M    11533
Name: SEX, dtype: int64
Counts: SAMPID: 17294; SMTS: 30; SMTSD: 54; SUBJID: 785


### Download GENE TPMs (full or demo subset)
Full file is ~56k rows, 2.6GB uncompressed.  Demo ~5k rows.

In [191]:
url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
t0 = time.time()
rnaseq = pd.read_table(url, compression="gzip", sep="\t", skiprows=2, nrows=1000)
print(f"dataset nrows: {rnaseq.shape[0]} ; ncols: {rnaseq.shape[1]}:")
print(f"Elapsed: {time.time()-t0:.2f}s")

dataset nrows: 1000 ; ncols: 17384:
Elapsed: 21.82s


In [192]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 17384 entries, Name to GTEX-ZZPU-2726-SM-5NQ8O
dtypes: float64(17382), object(2)
memory usage: 132.6+ MB


In [193]:
rnaseq = rnaseq.drop(columns=['Description'])
rnaseq = rnaseq.rename(columns={'Name':'ENSGV'})
rnaseq.columns

Index(['ENSGV', 'GTEX-1117F-0226-SM-5GZZ7', 'GTEX-1117F-0426-SM-5EGHI',
       'GTEX-1117F-0526-SM-5EGHJ', 'GTEX-1117F-0626-SM-5N9CS',
       'GTEX-1117F-0726-SM-5GIEN', 'GTEX-1117F-1326-SM-5EGHH',
       'GTEX-1117F-2426-SM-5EGGH', 'GTEX-1117F-2526-SM-5GZY6',
       'GTEX-1117F-2826-SM-5GZXL',
       ...
       'GTEX-ZZPU-1126-SM-5N9CW', 'GTEX-ZZPU-1226-SM-5N9CK',
       'GTEX-ZZPU-1326-SM-5GZWS', 'GTEX-ZZPU-1426-SM-5GZZ6',
       'GTEX-ZZPU-1826-SM-5E43L', 'GTEX-ZZPU-2126-SM-5EGIU',
       'GTEX-ZZPU-2226-SM-5EGIV', 'GTEX-ZZPU-2426-SM-5E44I',
       'GTEX-ZZPU-2626-SM-5E45Y', 'GTEX-ZZPU-2726-SM-5NQ8O'],
      dtype='object', length=17383)

### MELT: One row per ENSG+SAMPID+TPM triplet:
Easier to handle but ~3x storage.

In [194]:
rnaseq = rnaseq.melt(id_vars = "ENSGV", var_name = "SAMPID", value_name = "TPM")
rnaseq["ENSG"] = rnaseq["ENSGV"].str.replace("\.[\d]$", "", regex=True)
rnaseq = rnaseq.drop(columns=['ENSGV'])
rnaseq = rnaseq[["ENSG", "SAMPID", "TPM"]]

In [195]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM
0,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0
1,ENSG00000227232,GTEX-1117F-0226-SM-5GZZ7,8.764
2,ENSG00000278267,GTEX-1117F-0226-SM-5GZZ7,0.0
3,ENSG00000243485,GTEX-1117F-0226-SM-5GZZ7,0.07187
4,ENSG00000237613,GTEX-1117F-0226-SM-5GZZ7,0.0


In [196]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17382000 entries, 0 to 17381999
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   ENSG    object 
 1   SAMPID  object 
 2   TPM     float64
dtypes: float64(1), object(2)
memory usage: 397.8+ MB


### Read and merge gene symbols.
File from https://www.ensembl.org/biomart, dataset human genes, fields Gene stable ID, Gene stable ID version,  NCBI gene ID, HGNC symbol.

In [197]:
genes = BioClients.ensembl.biomart.ENSG2NCBIHGNC()

In [198]:
genes.head()

Unnamed: 0,Gene stable ID,Gene stable ID version,NCBI gene (formerly Entrezgene) ID,HGNC ID,HGNC symbol
0,ENSG00000210049,ENSG00000210049.1,,HGNC:7481,MT-TF
1,ENSG00000211459,ENSG00000211459.2,,HGNC:7470,MT-RNR1
2,ENSG00000210077,ENSG00000210077.1,,HGNC:7500,MT-TV
3,ENSG00000210082,ENSG00000210082.2,,HGNC:7471,MT-RNR2
4,ENSG00000209082,ENSG00000209082.1,,HGNC:7490,MT-TL1


In [199]:
genes = genes.iloc[:,[0,2,4]]
genes.dropna(inplace=True)

In [200]:
genes.columns = ['ENSG', 'NCBI', 'HGNC']
genes['NCBI'] = genes['NCBI'].astype('Int64')
genes.head()

Unnamed: 0,ENSG,NCBI,HGNC
5,ENSG00000198888,4535,MT-ND1
9,ENSG00000198763,4536,MT-ND2
15,ENSG00000198804,4512,MT-CO1
16,ENSG00000210151,113219467,MT-TS1
18,ENSG00000198712,4513,MT-CO2


In [201]:
rnaseq = pd.merge(rnaseq, genes, on='ENSG', how='inner')

In [202]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC
0,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0,84771,DDX11L1
1,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0,727856,DDX11L1
2,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0,100287102,DDX11L1
3,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0,100287596,DDX11L1
4,ENSG00000223972,GTEX-1117F-0226-SM-5GZZ7,0.0,102725121,DDX11L1


### Remove genes in pseudoautosomal regions (PAR) of chromosome Y ("ENSGR").

In [203]:
n_ensgr = rnaseq.ENSG.str.startswith('ENSGR').sum()
print(f"ENSGR gene TPMs: {n_ensgr} ({100*n_ensgr/rnaseq.shape[0]:.2f}%)")

ENSGR gene TPMs: 0 (0.00%)


In [204]:
rnaseq = rnaseq[~rnaseq.ENSG.str.startswith('ENSGR')]

### Merge with samples:

In [205]:
rnaseq = pd.merge(rnaseq, samples, how="inner", on="SAMPID")
rnaseq = rnaseq.reset_index(drop=True)

In [206]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,84771,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
1,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,727856,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
2,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287102,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
3,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287596,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
4,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,102725121,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0


In [207]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220236 entries, 0 to 3220235
Data columns (total 13 columns):
 #   Column    Dtype  
---  ------    -----  
 0   ENSG      object 
 1   SAMPID    object 
 2   TPM       float64
 3   NCBI      Int64  
 4   HGNC      object 
 5   SMATSSCR  float64
 6   SMTS      object 
 7   SMTSD     object 
 8   SMUBRID   object 
 9   SUBJID    object 
 10  SEX       object 
 11  AGE       object 
 12  DTHHRDY   float64
dtypes: Int64(1), float64(3), object(9)
memory usage: 322.5+ MB


In [208]:
for i,smtsd in enumerate(rnaseq.SMTSD.sort_values().unique()):
  print(f"{i+1}. {smtsd}")

1. Adipose - Subcutaneous
2. Adipose - Visceral (Omentum)
3. Adrenal Gland
4. Artery - Aorta
5. Artery - Coronary
6. Artery - Tibial
7. Bladder
8. Brain - Amygdala
9. Brain - Anterior cingulate cortex (BA24)
10. Brain - Caudate (basal ganglia)
11. Brain - Cerebellar Hemisphere
12. Brain - Cerebellum
13. Brain - Cortex
14. Brain - Frontal Cortex (BA9)
15. Brain - Hippocampus
16. Brain - Hypothalamus
17. Brain - Nucleus accumbens (basal ganglia)
18. Brain - Putamen (basal ganglia)
19. Brain - Spinal cord (cervical c-1)
20. Brain - Substantia nigra
21. Breast - Mammary Tissue
22. Cells - Cultured fibroblasts
23. Cells - EBV-transformed lymphocytes
24. Cervix - Ectocervix
25. Cervix - Endocervix
26. Colon - Sigmoid
27. Colon - Transverse
28. Esophagus - Gastroesophageal Junction
29. Esophagus - Mucosa
30. Esophagus - Muscularis
31. Fallopian Tube
32. Heart - Atrial Appendage
33. Heart - Left Ventricle
34. Kidney - Cortex
35. Kidney - Medulla
36. Liver
37. Lung
38. Minor Salivary Gland
39. 

### Remove data for gene-tissue pairs with all zero expression.

In [209]:
maxtpm_0 = (rnaseq[['ENSG', 'SMTSD', 'TPM']].groupby(by=['ENSG','SMTSD'], as_index=True).max() == 0).rename(columns={'TPM':'maxtpm_0'})
maxtpm_0.maxtpm_0.value_counts()

False    11916
True       936
Name: maxtpm_0, dtype: int64

In [210]:
maxtpm_0.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12852 entries, ('ENSG00000007968', 'Adipose - Subcutaneous') to ('ENSG00000284372', 'Whole Blood')
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   maxtpm_0  12852 non-null  bool 
dtypes: bool(1)
memory usage: 52.6+ KB


In [211]:
rnaseq = pd.merge(rnaseq, maxtpm_0, left_on=['ENSG', 'SMTSD'], right_index=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,maxtpm_0
0,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,84771,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,False
1,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,727856,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,False
2,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287102,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,False
3,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287596,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,False
4,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,102725121,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,False


In [212]:
rnaseq = rnaseq[~rnaseq['maxtpm_0']]
rnaseq.drop(columns=['maxtpm_0'], inplace=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,84771,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
1,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,727856,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
2,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287102,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
3,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287596,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0
4,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,102725121,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0


In [213]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3088233 entries, 0 to 2626829
Data columns (total 13 columns):
 #   Column    Dtype  
---  ------    -----  
 0   ENSG      object 
 1   SAMPID    object 
 2   TPM       float64
 3   NCBI      Int64  
 4   HGNC      object 
 5   SMATSSCR  float64
 6   SMTS      object 
 7   SMTSD     object 
 8   SMUBRID   object 
 9   SUBJID    object 
 10  SEX       object 
 11  AGE       object 
 12  DTHHRDY   float64
dtypes: Int64(1), float64(3), object(9)
memory usage: 332.8+ MB


### Remove data for gene-tissue pairs not present in both sexes. (This removes most sex specific tissues.)

In [214]:
sex_count = (rnaseq[['ENSG', 'SMTSD', 'SEX']].groupby(by=['ENSG','SMTSD'], as_index=True).nunique()).rename(columns={'SEX':'sex_count'})
print(sex_count.sex_count.value_counts())

2    10072
1     1844
Name: sex_count, dtype: int64


In [215]:
sex_count_is_2 = (rnaseq[['ENSG', 'SMTSD', 'SEX']].groupby(by=['ENSG','SMTSD'], as_index=True).nunique()==2).rename(columns={'SEX':'ok'})
print(sex_count_is_2.ok.value_counts())

True     10072
False     1844
Name: ok, dtype: int64


In [216]:
sex_count.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11916 entries, ('ENSG00000007968', 'Adipose - Subcutaneous') to ('ENSG00000284372', 'Whole Blood')
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   sex_count  11916 non-null  int64
dtypes: int64(1)
memory usage: 130.4+ KB


In [217]:
rnaseq = pd.merge(rnaseq, sex_count, left_on=['ENSG', 'SMTSD'], right_index=True, how="inner")
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,sex_count
0,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,84771,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,2
1,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,727856,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,2
2,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287102,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,2
3,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,100287596,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,2
4,ENSG00000223972,GTEX-111CU-0126-SM-5GZWZ,0.03757,102725121,DDX11L1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,M,50-59,0.0,2


In [218]:
rnaseq = rnaseq[rnaseq['sex_count'] == 2]
rnaseq.drop(columns=['sex_count'], inplace=True)

In [219]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2907916 entries, 0 to 2631932
Data columns (total 13 columns):
 #   Column    Dtype  
---  ------    -----  
 0   ENSG      object 
 1   SAMPID    object 
 2   TPM       float64
 3   NCBI      Int64  
 4   HGNC      object 
 5   SMATSSCR  float64
 6   SMTS      object 
 7   SMTSD     object 
 8   SMUBRID   object 
 9   SUBJID    object 
 10  SEX       object 
 11  AGE       object 
 12  DTHHRDY   float64
dtypes: Int64(1), float64(3), object(9)
memory usage: 313.4+ MB


In [220]:
rnaseq.SMTSD.value_counts()

Muscle - Skeletal                            152280
Whole Blood                                  149073
Skin - Sun Exposed (Lower leg)               135274
Artery - Tibial                              132406
Adipose - Subcutaneous                       129710
Nerve - Tibial                               120870
Skin - Not Sun Exposed (Suprapubic)          116620
Esophagus - Mucosa                           109680
Adipose - Visceral (Omentum)                 108576
Thyroid                                      105964
Esophagus - Muscularis                       105228
Cells - Cultured fibroblasts                  96928
Lung                                          94166
Artery - Aorta                                87320
Breast - Mammary Tissue                       87282
Heart - Left Ventricle                        83814
Heart - Atrial Appendage                      80262
Esophagus - Gastroesophageal Junction         78960
Colon - Sigmoid                               68544
Pancreas    

### Remove mammary tissue (partially sex-specific).

In [221]:
smtsd_breast = "Breast - Mammary Tissue"
rnaseq = rnaseq[rnaseq.SMTSD!=smtsd_breast]


### Aggregate samples, compute median TPM by gene+tissue+sex+age:

In [222]:
rnaseq = rnaseq[['ENSG', 'SMTSD', 'SEX', 'AGE', 'TPM']].groupby(by=['ENSG', 'SMTSD', 'SEX', 'AGE'], as_index=False).median()
print(rnaseq.shape)
rnaseq.head()

(114403, 5)


Unnamed: 0,ENSG,SMTSD,SEX,AGE,TPM
0,ENSG00000007968,Adipose - Subcutaneous,F,20-29,0.28845
1,ENSG00000007968,Adipose - Subcutaneous,F,30-39,0.21745
2,ENSG00000007968,Adipose - Subcutaneous,F,40-49,0.2346
3,ENSG00000007968,Adipose - Subcutaneous,F,50-59,0.27
4,ENSG00000007968,Adipose - Subcutaneous,F,60-69,0.32485


### Aggregate samples, compute median TPM by gene+tissue+sex:

In [223]:
rnaseq = rnaseq[['ENSG', 'SMTSD', 'SEX', 'TPM']].groupby(by=['ENSG','SMTSD','SEX'], as_index=False).median()
print(rnaseq.shape)
rnaseq.head()

(19686, 4)


Unnamed: 0,ENSG,SMTSD,SEX,TPM
0,ENSG00000007968,Adipose - Subcutaneous,F,0.279225
1,ENSG00000007968,Adipose - Subcutaneous,M,0.29675
2,ENSG00000007968,Adipose - Visceral (Omentum),F,0.296525
3,ENSG00000007968,Adipose - Visceral (Omentum),M,0.2876
4,ENSG00000007968,Adrenal Gland,F,0.328875


### Save median TPMs file for analysis, 1-row per gene+tissue+sex:

In [224]:
rnaseq.round(3).to_csv('gtex_rnaseq_prep_median.tsv', sep='\t', index=False)
google.colab.files.download('gtex_rnaseq_prep_median.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Pivot TPMs to generate gene profiles:

In [225]:
tissues = pd.Series(pd.unique(rnaseq.SMTSD.sort_values()))


In [226]:
rnaseq_f = rnaseq[rnaseq.SEX=='F'].drop(columns=['SEX'])
rnaseq_f = rnaseq_f[['ENSG','SMTSD','TPM']]
exfiles_f = rnaseq_f.pivot(index='ENSG', columns='SMTSD')
exfiles_f.columns = exfiles_f.columns.get_level_values(1)
exfiles_f = exfiles_f.reset_index(drop=False)
exfiles_f['SEX'] = 'F'
exfiles_f.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),Brain - Cerebellar Hemisphere,Brain - Cerebellum,Brain - Cortex,Brain - Frontal Cortex (BA9),Brain - Hippocampus,Brain - Hypothalamus,Brain - Nucleus accumbens (basal ganglia),Brain - Putamen (basal ganglia),Brain - Spinal cord (cervical c-1),Brain - Substantia nigra,Cells - Cultured fibroblasts,Cells - EBV-transformed lymphocytes,Colon - Sigmoid,Colon - Transverse,Esophagus - Gastroesophageal Junction,Esophagus - Mucosa,Esophagus - Muscularis,Heart - Atrial Appendage,Heart - Left Ventricle,Kidney - Cortex,Liver,Lung,Minor Salivary Gland,Muscle - Skeletal,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,Whole Blood,SEX
0,ENSG00000007968,0.279225,0.296525,0.328875,0.191,0.24955,0.07207,0.1998,0.018925,0.016525,0.014922,0.015165,0.01827,0.03785,0.015403,0.023815,0.02271,0.013605,0.020557,0.04174,0.03884,1.39925,9.7295,0.110363,6.278,0.089563,12.345,0.098895,0.094825,0.071585,0.2964,0.266425,1.3075,0.709,0.032785,0.13255,0.03782,0.051195,5.72875,6.568,3.634,5.46725,0.343425,0.49855,8.1765,F
1,ENSG00000049249,0.034357,0.099865,0.042445,0.1139,0.05789,0.017368,0.03089,0.002398,0.00781,0.004632,0.015967,0.009491,0.004749,0.00986,0.007431,0.062353,0.006629,0.007591,0.02369,0.010933,0.02376,1.5125,0.022398,0.047445,0.024753,0.06026,0.015008,0.027145,0.007085,0.1151,0.03949,0.434625,0.03332,0.005284,0.035605,0.02017,0.06725,0.035698,0.020112,0.43195,1.3595,0.056173,0.06191,1.648,F
2,ENSG00000053372,25.58,27.9475,20.78,16.575,18.845,17.0325,22.02,19.095,25.23,18.51,19.44,17.595,23.61,30.6175,20.485,19.5925,19.2775,12.6175,12.905,15.04,75.195,58.7575,18.0,16.29,18.315,29.185,18.915,8.54475,8.34325,14.76,13.405,19.4525,18.4,16.5275,26.115,14.4275,15.76,26.045,26.0975,23.97,23.475,19.0325,20.31,4.803,F
3,ENSG00000057757,54.005,43.795,47.0925,54.65,50.065,61.4875,51.28,77.2025,138.1,65.1725,169.2,135.2,91.03,166.675,76.205,127.9,84.93,40.48,88.28,62.6,49.6725,63.0275,54.44,36.375,53.1525,62.905,52.205,21.3125,23.31,33.78,23.675,52.39,44.95,41.39,86.4225,18.7,62.87,73.745,80.2175,40.7,52.8675,40.0025,52.76,23.3925,F
4,ENSG00000084628,0.1342,0.10434,8.31,0.042597,0.066312,0.024237,0.09209,4.9235,6.321,3.5845,67.435,68.79,8.07,10.32725,5.72825,7.95,4.874,5.629,4.17,5.921,0.11775,0.03734,0.46065,0.17445,0.16305,0.657975,0.205975,0.045587,0.01994,0.1735,0.00326,0.033063,0.5736,5.3205,0.115325,0.116975,4.686,2.546,2.9435,0.098525,0.0,0.094558,0.0644,0.005713,F


In [227]:
rnaseq_m = rnaseq[rnaseq.SEX=='M'].drop(columns=['SEX'])
rnaseq_m = rnaseq_m[['ENSG','SMTSD','TPM']]
exfiles_m = rnaseq_m.pivot(index='ENSG', columns='SMTSD')
exfiles_m.columns = exfiles_m.columns.get_level_values(1)
exfiles_m = exfiles_m.reset_index(drop=False)
exfiles_m['SEX'] = 'M'
exfiles_m.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),Brain - Cerebellar Hemisphere,Brain - Cerebellum,Brain - Cortex,Brain - Frontal Cortex (BA9),Brain - Hippocampus,Brain - Hypothalamus,Brain - Nucleus accumbens (basal ganglia),Brain - Putamen (basal ganglia),Brain - Spinal cord (cervical c-1),Brain - Substantia nigra,Cells - Cultured fibroblasts,Cells - EBV-transformed lymphocytes,Colon - Sigmoid,Colon - Transverse,Esophagus - Gastroesophageal Junction,Esophagus - Mucosa,Esophagus - Muscularis,Heart - Atrial Appendage,Heart - Left Ventricle,Kidney - Cortex,Liver,Lung,Minor Salivary Gland,Muscle - Skeletal,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,Whole Blood,SEX
0,ENSG00000007968,0.29675,0.2876,0.28515,0.16045,0.2138,0.069415,0.2148,0.021465,0.01828,0.01851,0.010715,0.017208,0.019875,0.01637,0.02367,0.021318,0.0156,0.021565,0.026288,0.024,1.2275,8.04225,0.103572,6.392,0.08707,12.87,0.109125,0.072252,0.054495,0.123,0.1109,1.48175,0.7233,0.027832,0.134325,0.052155,0.053393,5.19825,6.002,4.0115,5.978,0.5067,0.4866,8.1985,M
1,ENSG00000049249,0.03911,0.13045,0.047403,0.11585,0.1102,0.015475,0.03703,0.004185,0.004089,0.005937,0.008407,0.009467,0.006767,0.005262,0.009468,0.02482,0.005822,0.008189,0.01723,0.018668,0.017755,2.00725,0.026752,0.056375,0.022135,0.06235,0.015662,0.034223,0.01007,0.05565,0.039255,0.418125,0.0444,0.003598,0.045592,0.027338,0.064155,0.034225,0.02796,0.3525,1.408,0.05891,0.05117,1.628,M
2,ENSG00000053372,24.675,23.15,21.1825,16.265,19.69,17.01,23.19,22.515,25.72,18.64,21.4825,20.075,27.525,34.735,20.49,24.64,20.635,15.0075,13.2825,15.4725,65.355,55.785,17.755,16.4,17.43,28.74,17.73,8.88525,6.59675,9.955,12.97,19.4425,17.38,14.195,26.0025,14.3775,15.3125,25.23,25.285,21.19,21.15,19.56,21.175,4.3045,M
3,ENSG00000057757,53.5325,44.0325,48.125,54.485,51.565,60.03,55.98,85.92,104.3,66.9475,195.925,139.5,100.785,163.15,81.5075,149.1,87.44,53.76,95.7675,84.1875,46.475,61.56,53.14,35.87,52.615,67.405,52.1275,22.6375,18.8625,24.13,23.645,51.32,41.99,37.7825,91.1775,18.435,62.12,75.9825,77.445,37.825,52.7125,41.9075,54.43,22.68,M
4,ENSG00000084628,0.12815,0.1051,7.27125,0.03603,0.06599,0.03124,0.088225,4.47675,4.459,6.2585,72.5125,71.3775,9.37625,10.022,7.3925,8.7555,5.30675,6.72625,4.9855,7.23775,0.1114,0.04183,0.475875,0.1462,0.1911,0.60185,0.172825,0.050543,0.035585,0.2287,0.0,0.036113,0.4348,8.03775,0.189925,0.15855,3.39675,2.33025,2.6645,0.1075,0.00608,0.070255,0.067223,0.0,M


In [228]:
exfiles = pd.concat([exfiles_f, exfiles_m])
cols = ['ENSG', 'SEX']+tissues.tolist()
exfiles = exfiles[cols]
exfiles.head()

SMTSD,ENSG,SEX,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),Brain - Cerebellar Hemisphere,Brain - Cerebellum,Brain - Cortex,Brain - Frontal Cortex (BA9),Brain - Hippocampus,Brain - Hypothalamus,Brain - Nucleus accumbens (basal ganglia),Brain - Putamen (basal ganglia),Brain - Spinal cord (cervical c-1),Brain - Substantia nigra,Cells - Cultured fibroblasts,Cells - EBV-transformed lymphocytes,Colon - Sigmoid,Colon - Transverse,Esophagus - Gastroesophageal Junction,Esophagus - Mucosa,Esophagus - Muscularis,Heart - Atrial Appendage,Heart - Left Ventricle,Kidney - Cortex,Liver,Lung,Minor Salivary Gland,Muscle - Skeletal,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,Whole Blood
0,ENSG00000007968,F,0.279225,0.296525,0.328875,0.191,0.24955,0.07207,0.1998,0.018925,0.016525,0.014922,0.015165,0.01827,0.03785,0.015403,0.023815,0.02271,0.013605,0.020557,0.04174,0.03884,1.39925,9.7295,0.110363,6.278,0.089563,12.345,0.098895,0.094825,0.071585,0.2964,0.266425,1.3075,0.709,0.032785,0.13255,0.03782,0.051195,5.72875,6.568,3.634,5.46725,0.343425,0.49855,8.1765
1,ENSG00000049249,F,0.034357,0.099865,0.042445,0.1139,0.05789,0.017368,0.03089,0.002398,0.00781,0.004632,0.015967,0.009491,0.004749,0.00986,0.007431,0.062353,0.006629,0.007591,0.02369,0.010933,0.02376,1.5125,0.022398,0.047445,0.024753,0.06026,0.015008,0.027145,0.007085,0.1151,0.03949,0.434625,0.03332,0.005284,0.035605,0.02017,0.06725,0.035698,0.020112,0.43195,1.3595,0.056173,0.06191,1.648
2,ENSG00000053372,F,25.58,27.9475,20.78,16.575,18.845,17.0325,22.02,19.095,25.23,18.51,19.44,17.595,23.61,30.6175,20.485,19.5925,19.2775,12.6175,12.905,15.04,75.195,58.7575,18.0,16.29,18.315,29.185,18.915,8.54475,8.34325,14.76,13.405,19.4525,18.4,16.5275,26.115,14.4275,15.76,26.045,26.0975,23.97,23.475,19.0325,20.31,4.803
3,ENSG00000057757,F,54.005,43.795,47.0925,54.65,50.065,61.4875,51.28,77.2025,138.1,65.1725,169.2,135.2,91.03,166.675,76.205,127.9,84.93,40.48,88.28,62.6,49.6725,63.0275,54.44,36.375,53.1525,62.905,52.205,21.3125,23.31,33.78,23.675,52.39,44.95,41.39,86.4225,18.7,62.87,73.745,80.2175,40.7,52.8675,40.0025,52.76,23.3925
4,ENSG00000084628,F,0.1342,0.10434,8.31,0.042597,0.066312,0.024237,0.09209,4.9235,6.321,3.5845,67.435,68.79,8.07,10.32725,5.72825,7.95,4.874,5.629,4.17,5.921,0.11775,0.03734,0.46065,0.17445,0.16305,0.657975,0.205975,0.045587,0.01994,0.1735,0.00326,0.033063,0.5736,5.3205,0.115325,0.116975,4.686,2.546,2.9435,0.098525,0.0,0.094558,0.0644,0.005713


In [229]:
exfiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 236
Data columns (total 46 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ENSG                                       474 non-null    object 
 1   SEX                                        474 non-null    object 
 2   Adipose - Subcutaneous                     466 non-null    float64
 3   Adipose - Visceral (Omentum)               458 non-null    float64
 4   Adrenal Gland                              444 non-null    float64
 5   Artery - Aorta                             462 non-null    float64
 6   Artery - Coronary                          460 non-null    float64
 7   Artery - Tibial                            468 non-null    float64
 8   Bladder                                    392 non-null    float64
 9   Brain - Amygdala                           430 non-null    float64
 10  Brain - Anterior cingulate

### Save expression profiles:

In [230]:
exfiles.round(3).to_csv('exfiles_eps.tsv', sep='\t', index=False)
google.colab.files.download('exfiles_eps.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>