<table width="100%" style="border:none">
  <tr>
    <td valign="top">
      <h1>GTEx RNAseq Preprocessing</h1>
      <ul>
<li>Author: Jeremy Yang
<li>Based on R code by Oleg Ursu.
<li>Required: Python3, Pandas 0.22+
<li>Clean, tidy, reshape RNAseq expression data.
<li>Save aggregated-samples median TPM file for downstream co-expression analysis.
<li>Save expression profiles (exfiles) TPM file for downstream co-expression analysis.
      </ul>
      [About Colaboratory](https://research.google.com/colaboratory/faq.html).
    </td>
    <td align="right">
        <p>NIH Data Commons: Team Helium</p>
<img style="float:right" width="100" src="https://avatars2.githubusercontent.com/u/33356654?s=200&v=4" alt="HeliumDataCommons Logo" />
    </td>
  </tr>
  </table>
 

In [1]:
import sys,os,re,time,io
import urllib.request
import google.colab
import numpy,scipy
import pandas
print('Python: %s; Pandas: %s; Scipy: %s ; Numpy: %s'%(sys.version.split()[0],pandas.__version__,scipy.__version__,numpy.__version__))

Python: 3.6.3; Pandas: 0.22.0; Scipy: 0.19.1 ; Numpy: 1.14.5


### Upload subjects datafile:
(GTEx_v7_Annotations_SubjectPhenotypesDS.txt)

In [2]:
print('Upload GTEx Subjects datafile: ')
uploaded = google.colab.files.upload()
fn = list(uploaded.keys())[0]
print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
subjects = pandas.read_csv(io.StringIO(uploaded[fn].decode('utf8')), sep='\t')
print("dataset nrows: %d ; ncols: %d:"%(subjects.shape[0],subjects.shape[1]), file=sys.stderr)

Upload GTEx Subjects datafile: 


Saving GTEx_v7_Annotations_SubjectPhenotypesDS.txt to GTEx_v7_Annotations_SubjectPhenotypesDS.txt
Uploaded "GTEx_v7_Annotations_SubjectPhenotypesDS.txt" with 15485 bytes


dataset nrows: 752 ; ncols: 4:


In [3]:
subjects.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [4]:
subjects.AGE.value_counts().sort_index()

20-29     58
30-39     53
40-49    124
50-59    253
60-69    241
70-79     23
Name: AGE, dtype: int64

### Remove less healthy subjects: 
(DTHHRDY = 4-point Hardy Scale Death Classification.)

In [5]:
print("Subjects with Hardy score > 2 or NA: %d (removing)"%(subjects.query('DTHHRDY > 2').shape[0]), file=sys.stderr)
subjects = subjects.query('DTHHRDY <= 2')
print("dataset ncols: %d ; nrows: %d:"%(subjects.shape[0],subjects.shape[1]), file=sys.stderr)
subjects.DTHHRDY.value_counts(sort=True, dropna=False).sort_index()

Subjects with Hardy score > 2 or NA: 124 (removing)
dataset ncols: 609 ; nrows: 4:


0.0    390
1.0     29
2.0    190
Name: DTHHRDY, dtype: int64

### Upload samples datafile:
(GTEx_v7_Annotations_SampleAttributesDS.txt)

In [17]:
print('Upload GTEx Samples datafile: ')
uploaded = google.colab.files.upload()
fn = list(uploaded.keys())[0]
print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
samples = pandas.read_csv(io.StringIO(uploaded[fn].decode('utf8')), sep='\t')
samples = samples[['SAMPID', 'SMATSSCR', 'SMTS', 'SMTSD', 'SMUBRID']]
print("dataset nrows: %d ; ncols: %d:"%(samples.shape[0],samples.shape[1]), file=sys.stderr)

Upload GTEx Samples datafile: 


Saving GTEx_v7_Annotations_SampleAttributesDS.txt to GTEx_v7_Annotations_SampleAttributesDS (3).txt
Uploaded "GTEx_v7_Annotations_SampleAttributesDS.txt" with 8281415 bytes


dataset nrows: 15598 ; ncols: 5:


 * SMTS = Tissue Type (parent of SMTSD)
 * SMTSD = Tissue Type, more specific
 * SMATSSCR = Autolysis Score, 0=None, 1=Mild, 2=Moderate, 3=Severe
 * Note that other sample attributes may be suitable for quality criteria.
 * SMUBRID = Uberon ID, anatomical location
 * SUBJID is first two hyphen-delimted fields of SAMPID.

In [0]:
samples['SUBJID'] = samples.SAMPID.str.extract('^([^-]+-[^-]+)-', expand=True)

In [19]:
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,,Blood,Whole Blood,13756,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,,Blood,Whole Blood,13756,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,,Blood,Whole Blood,13756,GTEX-1117F
3,GTEX-1117F-0226-SM-5GZZ7,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1117F
4,GTEX-1117F-0426-SM-5EGHI,0.0,Muscle,Muscle - Skeletal,11907,GTEX-1117F


In [22]:
samples.SMATSSCR.value_counts(dropna=False).sort_index()

 0.0    2533
 1.0    7127
 2.0    1046
 3.0     118
NaN     4774
Name: SMATSSCR, dtype: int64

### Remove samples with high degree of autolysis (self-digestion).


In [26]:
print("Removing %d/%d (%.1f%%)"%((~(samples.SMATSSCR<2)).sum(), samples.SMATSSCR.size, 100*(~(samples.SMATSSCR<2)).sum()/samples.SMATSSCR.size))

Removing 5938/15598 (38.1%)


In [27]:
samples = samples[samples.SMATSSCR < 2]
print("Remaining samples %d"%(samples.SMATSSCR.size))

Remaining samples 9660


### Clean & tidy cols. 

In [0]:
samples.loc[(samples.SMTS.str.strip() == '') & samples.SMTSD.str.startswith("Skin -"), 'SMTS'] = 'Skin'

In [29]:
(samples.SMTS+" : "+samples.SMTSD).value_counts().sort_index()

Adipose Tissue : Adipose - Subcutaneous               517
Adipose Tissue : Adipose - Visceral (Omentum)         360
Adrenal Gland : Adrenal Gland                         164
Bladder : Bladder                                       8
Blood Vessel : Artery - Aorta                         310
Blood Vessel : Artery - Coronary                      181
Blood Vessel : Artery - Tibial                        534
Brain : Brain - Cerebellum                            153
Brain : Brain - Cortex                                152
Breast : Breast - Mammary Tissue                      292
Cervix Uteri : Cervix - Ectocervix                      5
Cervix Uteri : Cervix - Endocervix                      3
Colon : Colon - Sigmoid                               216
Colon : Colon - Transverse                            187
Esophagus : Esophagus - Gastroesophageal Junction     254
Esophagus : Esophagus - Mucosa                        408
Esophagus : Esophagus - Muscularis                    389
Fallopian Tube

### MERGE samples with subjects:

In [30]:
samples = pandas.merge(samples, subjects, how='inner', on='SUBJID')
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-111CU-0126-SM-5GZWZ,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,1,50-59,0.0
1,GTEX-111CU-0226-SM-5GZXC,0.0,Thyroid,Thyroid,2046,GTEX-111CU,1,50-59,0.0
2,GTEX-111CU-0326-SM-5GZXO,0.0,Lung,Lung,8952,GTEX-111CU,1,50-59,0.0
3,GTEX-111CU-0426-SM-5GZY1,0.0,Spleen,Spleen,2106,GTEX-111CU,1,50-59,0.0
4,GTEX-111CU-0526-SM-5EGHK,1.0,Pancreas,Pancreas,1150,GTEX-111CU,1,50-59,0.0


### Clean & tidy:

In [31]:
samples.dropna(how='any', inplace=True)
print(samples.shape)
samples.SEX = samples.SEX.apply(lambda x: 'female' if x==2 else 'male' if x==1 else None)
samples.SEX.value_counts().sort_index()

(8422, 9)


female    3061
male      5361
Name: SEX, dtype: int64

### READ GENE TPMs (full or demo subset)
Full file is ~56k rows, 2.6GB uncompressed.  Demo ~1k rows.

*   GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.gz
*   GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_demo.gct.gz


In [32]:
print('Upload GTEx RNAseq TPM datafile: ')
t0 = time.time()
uploaded = google.colab.files.upload()
fn = list(uploaded.keys())[0]
print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
rnaseq = pandas.read_table(io.BytesIO(uploaded[fn]), compression='gzip', sep='\t', skiprows=2)
print("dataset nrows: %d ; ncols: %d:"%(rnaseq.shape[0],rnaseq.shape[1]), file=sys.stderr)
print("Elapsed: %ds"%(time.time()-t0))

Upload GTEx RNAseq TPM datafile: 


Saving GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_demo.gct.gz to GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_demo.gct.gz
Uploaded "GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_demo.gct.gz" with 18560824 bytes
Elapsed: 174s


dataset nrows: 1154 ; ncols: 11690:


In [33]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154 entries, 0 to 1153
Columns: 11690 entries, Name to GTEX-ZXG5-0005-SM-57WCN
dtypes: float64(11688), object(2)
memory usage: 102.9+ MB


In [34]:
rnaseq = rnaseq.drop(columns=['Description'])
rnaseq = rnaseq.rename(columns={'Name':'ENSG'})
rnaseq.columns

Index(['ENSG', 'GTEX-1117F-0226-SM-5GZZ7', 'GTEX-111CU-1826-SM-5GZYN',
       'GTEX-111FC-0226-SM-5N9B8', 'GTEX-111VG-2326-SM-5N9BK',
       'GTEX-111YS-2426-SM-5GZZQ', 'GTEX-1122O-2026-SM-5NQ91',
       'GTEX-1128S-2126-SM-5H12U', 'GTEX-113IC-0226-SM-5HL5C',
       'GTEX-117YX-2226-SM-5EGJJ',
       ...
       'GTEX-ZVE2-0006-SM-51MRW', 'GTEX-ZVP2-0005-SM-51MRK',
       'GTEX-ZVT2-0005-SM-57WBW', 'GTEX-ZVT3-0006-SM-51MT9',
       'GTEX-ZVT4-0006-SM-57WB8', 'GTEX-ZVTK-0006-SM-57WBK',
       'GTEX-ZVZP-0006-SM-51MSW', 'GTEX-ZVZQ-0006-SM-51MR8',
       'GTEX-ZXES-0005-SM-57WCB', 'GTEX-ZXG5-0005-SM-57WCN'],
      dtype='object', length=11689)

### MELT: One row per ENSG+SAMPID+TPM triplet:
Easier to handle but ~3x storage.

In [0]:
rnaseq = rnaseq.melt(id_vars = "ENSG", var_name = "SAMPID", value_name = "TPM")

In [36]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM
0,ENSG00000229344.1,GTEX-1117F-0226-SM-5GZZ7,13.26
1,ENSG00000160072.15,GTEX-1117F-0226-SM-5GZZ7,35.91
2,ENSG00000178642.5,GTEX-1117F-0226-SM-5GZZ7,0.5973
3,ENSG00000158292.6,GTEX-1117F-0226-SM-5GZZ7,18.94
4,ENSG00000162444.11,GTEX-1117F-0226-SM-5GZZ7,238.4


In [37]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13487952 entries, 0 to 13487951
Data columns (total 3 columns):
ENSG      object
SAMPID    object
TPM       float64
dtypes: float64(1), object(2)
memory usage: 308.7+ MB


### Read and merge gene symbols.
File from https://www.ensembl.org/biomart, dataset human genes, fields Gene stable ID, Gene stable ID version,  NCBI gene ID, HGNC symbol.

In [38]:
print('Upload Biomart ENSG2NCBI genes datafile: ')
uploaded = google.colab.files.upload()
fn = list(uploaded.keys())[0]
print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
genes = pandas.read_csv(io.StringIO(uploaded[fn].decode('utf8')), sep='\t', usecols=[1,2,3], na_values=[''], dtype={2:str})
genes.columns = ['ENSG','NCBI','HGNC']
genes.dropna(inplace=True)

Upload Biomart ENSG2NCBI genes datafile: 


Saving biomart_ENSG2NCBI.tsv to biomart_ENSG2NCBI.tsv
Uploaded "biomart_ENSG2NCBI.tsv" with 2786143 bytes


In [39]:
genes.head()

Unnamed: 0,ENSG,NCBI,HGNC
1,ENSG00000211459.2,4549,MT-RNR1
3,ENSG00000210082.2,4550,MT-RNR2
5,ENSG00000198888.2,4535,MT-ND1
9,ENSG00000198763.3,4536,MT-ND2
15,ENSG00000198804.2,4512,MT-CO1


In [0]:
rnaseq = pandas.merge(rnaseq, genes, on='ENSG', how='inner')

In [41]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC
0,ENSG00000158292.6,GTEX-1117F-0226-SM-5GZZ7,18.94,387509,GPR153
1,ENSG00000158292.6,GTEX-111CU-1826-SM-5GZYN,44.34,387509,GPR153
2,ENSG00000158292.6,GTEX-111FC-0226-SM-5N9B8,9.884,387509,GPR153
3,ENSG00000158292.6,GTEX-111VG-2326-SM-5N9BK,13.78,387509,GPR153
4,ENSG00000158292.6,GTEX-111YS-2426-SM-5GZZQ,22.03,387509,GPR153


### Remove genes in pseudoautosomal regions (PAR) of chromosome Y ("ENSGR").

In [42]:
n_ensgr = rnaseq.ENSG.str.startswith('ENSGR').sum()
print('ENSGR gene TPMs: %d (%.2f%%)'%(n_ensgr,100*n_ensgr/rnaseq.shape[0]))

ENSGR gene TPMs: 0 (0.00%)


In [0]:
rnaseq = rnaseq[~rnaseq.ENSG.str.startswith('ENSGR')]

### Merge with samples:

In [0]:
rnaseq = pandas.merge(rnaseq, samples, how="inner", on="SAMPID")
rnaseq = rnaseq.reset_index(drop=True)

In [45]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000158292.6,GTEX-111CU-1826-SM-5GZYN,44.34,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0
1,ENSG00000162444.11,GTEX-111CU-1826-SM-5GZYN,72.3,116362,RBP7,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0
2,ENSG00000142619.4,GTEX-111CU-1826-SM-5GZYN,0.0,51702,PADI3,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0
3,ENSG00000162415.6,GTEX-111CU-1826-SM-5GZYN,1.634,57643,ZSWIM5,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0
4,ENSG00000137976.7,GTEX-111CU-1826-SM-5GZYN,0.0,58511,DNASE2B,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0


In [46]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921692 entries, 0 to 921691
Data columns (total 13 columns):
ENSG        921692 non-null object
SAMPID      921692 non-null object
TPM         921692 non-null float64
NCBI        921692 non-null object
HGNC        921692 non-null object
SMATSSCR    921692 non-null float64
SMTS        921692 non-null object
SMTSD       921692 non-null object
SMUBRID     921692 non-null object
SUBJID      921692 non-null object
SEX         921692 non-null object
AGE         921692 non-null object
DTHHRDY     921692 non-null float64
dtypes: float64(3), object(10)
memory usage: 91.4+ MB


### Remove data for gene-tissue pairs with all zero expression.

In [48]:
maxtpm_0 = (rnaseq[['ENSG', 'SMTSD', 'TPM']].groupby(by=['ENSG','SMTSD'], as_index=True).max() == 0).rename(columns={'TPM':'maxtpm_0'})
print(maxtpm_0.maxtpm_0.value_counts(dropna=False))

False    4266
True      570
Name: maxtpm_0, dtype: int64


In [49]:
maxtpm_0.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4836 entries, (ENSG00000039139.9, Adipose - Subcutaneous) to (ENSG00000270084.1, Vagina)
Data columns (total 1 columns):
maxtpm_0    4836 non-null bool
dtypes: bool(1)
memory usage: 15.6+ KB


In [50]:
rnaseq = pandas.merge(rnaseq, maxtpm_0, left_on=['ENSG', 'SMTSD'], right_index=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,maxtpm_0
0,ENSG00000158292.6,GTEX-111CU-1826-SM-5GZYN,44.34,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0,False
124,ENSG00000158292.6,GTEX-111YS-2426-SM-5GZZQ,22.03,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111YS,male,60-69,0.0,False
248,ENSG00000158292.6,GTEX-1122O-2026-SM-5NQ91,32.41,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1122O,female,60-69,0.0,False
372,ENSG00000158292.6,GTEX-1128S-2126-SM-5H12U,26.06,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1128S,female,60-69,2.0,False
496,ENSG00000158292.6,GTEX-117YX-2226-SM-5EGJJ,63.49,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-117YX,male,50-59,0.0,False


In [51]:
rnaseq = rnaseq[~rnaseq['maxtpm_0']]
rnaseq.drop(columns=['maxtpm_0'], inplace=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000158292.6,GTEX-111CU-1826-SM-5GZYN,44.34,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0
124,ENSG00000158292.6,GTEX-111YS-2426-SM-5GZZQ,22.03,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111YS,male,60-69,0.0
248,ENSG00000158292.6,GTEX-1122O-2026-SM-5NQ91,32.41,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1122O,female,60-69,0.0
372,ENSG00000158292.6,GTEX-1128S-2126-SM-5H12U,26.06,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1128S,female,60-69,2.0
496,ENSG00000158292.6,GTEX-117YX-2226-SM-5EGJJ,63.49,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-117YX,male,50-59,0.0


In [52]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 840847 entries, 0 to 921691
Data columns (total 13 columns):
ENSG        840847 non-null object
SAMPID      840847 non-null object
TPM         840847 non-null float64
NCBI        840847 non-null object
HGNC        840847 non-null object
SMATSSCR    840847 non-null float64
SMTS        840847 non-null object
SMTSD       840847 non-null object
SMUBRID     840847 non-null object
SUBJID      840847 non-null object
SEX         840847 non-null object
AGE         840847 non-null object
DTHHRDY     840847 non-null float64
dtypes: float64(3), object(10)
memory usage: 89.8+ MB


### Remove data for gene-tissue pairs not present in both sexes. (This removes most sex specific tissues.)

In [54]:
sex_count = (rnaseq[['ENSG', 'SMTSD', 'SEX']].groupby(by=['ENSG','SMTSD'], as_index=True).nunique()).rename(columns={'SEX':'sex_count'})
print(sex_count.sex_count.value_counts())

2    3442
1     824
Name: sex_count, dtype: int64


In [55]:
sex_count.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4266 entries, (ENSG00000039139.9, Adipose - Subcutaneous) to (ENSG00000270084.1, Vagina)
Data columns (total 3 columns):
ENSG         4266 non-null int64
SMTSD        4266 non-null int64
sex_count    4266 non-null int64
dtypes: int64(3)
memory usage: 109.7+ KB


In [56]:
sex_count = sex_count[['sex_count']] #Why needed?
sex_count.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4266 entries, (ENSG00000039139.9, Adipose - Subcutaneous) to (ENSG00000270084.1, Vagina)
Data columns (total 1 columns):
sex_count    4266 non-null int64
dtypes: int64(1)
memory usage: 43.1+ KB


In [57]:
rnaseq = pandas.merge(rnaseq, sex_count, left_on=['ENSG', 'SMTSD'], right_index=True, how="inner")
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,sex_count
0,ENSG00000158292.6,GTEX-111CU-1826-SM-5GZYN,44.34,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111CU,male,50-59,0.0,2
124,ENSG00000158292.6,GTEX-111YS-2426-SM-5GZZQ,22.03,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-111YS,male,60-69,0.0,2
248,ENSG00000158292.6,GTEX-1122O-2026-SM-5NQ91,32.41,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1122O,female,60-69,0.0,2
372,ENSG00000158292.6,GTEX-1128S-2126-SM-5H12U,26.06,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-1128S,female,60-69,2.0,2
496,ENSG00000158292.6,GTEX-117YX-2226-SM-5EGJJ,63.49,387509,GPR153,0.0,Adipose Tissue,Adipose - Subcutaneous,2190,GTEX-117YX,male,50-59,0.0,2


In [0]:
rnaseq = rnaseq[rnaseq['sex_count'] == 2]
rnaseq.drop(columns=['sex_count'], inplace=True)

In [59]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 775641 entries, 0 to 899247
Data columns (total 13 columns):
ENSG        775641 non-null object
SAMPID      775641 non-null object
TPM         775641 non-null float64
NCBI        775641 non-null object
HGNC        775641 non-null object
SMATSSCR    775641 non-null float64
SMTS        775641 non-null object
SMTSD       775641 non-null object
SMUBRID     775641 non-null object
SUBJID      775641 non-null object
SEX         775641 non-null object
AGE         775641 non-null object
DTHHRDY     775641 non-null float64
dtypes: float64(3), object(10)
memory usage: 82.8+ MB


In [60]:
rnaseq.SMTSD.value_counts()

Muscle - Skeletal                        51870
Skin - Sun Exposed (Lower leg)           44312
Adipose - Subcutaneous                   43056
Artery - Tibial                          41697
Esophagus - Mucosa                       39788
Nerve - Tibial                           39445
Esophagus - Muscularis                   36838
Skin - Not Sun Exposed (Suprapubic)      36772
Thyroid                                  36504
Adipose - Visceral (Omentum)             35340
Lung                                     32592
Heart - Left Ventricle                   29640
Artery - Aorta                           29412
Breast - Mammary Tissue                  27246
Heart - Atrial Appendage                 27246
Esophagus - Gastroesophageal Junction    25199
Pancreas                                 23310
Stomach                                  22310
Colon - Transverse                       20520
Colon - Sigmoid                          19775
Artery - Coronary                        16786
Adrenal Gland

### Remove mammary tissue, although males have some too.

In [0]:
rnaseq = rnaseq[~rnaseq.SMTSD.str.match("^Breast")]

### Aggregate samples, compute median TPM by gene+tissue+sex:

In [62]:
rnaseq = rnaseq[['ENSG', 'SMTSD', 'SEX', 'TPM']].groupby(by=['ENSG','SMTSD','SEX'], as_index=False).median()
print(rnaseq.shape)
rnaseq.head()

(6656, 4)


Unnamed: 0,ENSG,SMTSD,SEX,TPM
0,ENSG00000039139.9,Adipose - Subcutaneous,female,0.07791
1,ENSG00000039139.9,Adipose - Subcutaneous,male,0.07429
2,ENSG00000039139.9,Adipose - Visceral (Omentum),female,0.052615
3,ENSG00000039139.9,Adipose - Visceral (Omentum),male,0.06432
4,ENSG00000039139.9,Adrenal Gland,female,0.07917


### Save median TPMs file for analysis, 1-row per gene+tissue+sex:

In [0]:
rnaseq.round(3).to_csv('gtex_rnaseq_prep_median.tsv', sep='\t', index=False)
google.colab.files.download('gtex_rnaseq_prep_median.tsv')

### Pivot TPMs to generate gene profiles:

In [0]:
tissues = pandas.Series(pandas.unique(rnaseq.SMTSD.sort_values()))


In [65]:
rnaseq_f = rnaseq[rnaseq.SEX=='female'].drop(columns=['SEX'])
rnaseq_f = rnaseq_f[['ENSG','SMTSD','TPM']]
exfiles_f = rnaseq_f.pivot(index='ENSG', columns='SMTSD')
exfiles_f.columns = exfiles_f.columns.get_level_values(1)
exfiles_f = exfiles_f.reset_index(drop=False)
exfiles_f['SEX'] = 'female'
exfiles_f.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,Brain - Cortex,...,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,SEX
0,ENSG00000039139.9,0.07791,0.052615,0.07917,0.08334,0.06381,0.04626,0.6133,0.38585,0.286,...,0.1448,0.56865,3.337,0.07157,0.090805,0.1469,0.09615,0.6284,1.075,female
1,ENSG00000082516.8,15.57,11.12,7.94,10.42,10.57,12.92,13.78,9.5295,5.938,...,15.83,3.412,10.56,11.28,13.91,6.4245,9.688,5.186,10.83,female
2,ENSG00000100302.6,2.329,2.0015,2.873,18.81,6.09,3.982,5.7125,8.445,49.51,...,1.7685,1.031,4.525,2.015,2.481,6.015,0.41745,1.7725,2.306,female
3,ENSG00000100580.7,7.47,7.3135,11.33,7.127,7.6,9.406,10.0505,12.98,9.571,...,10.265,3.321,19.74,6.604,6.659,7.21,14.51,5.252,15.16,female
4,ENSG00000101200.5,0.6961,0.1427,0.2953,0.0,0.05195,0.0,0.19445,0.32035,0.2374,...,0.025665,0.0,0.2779,0.0,0.0,0.02317,0.05532,0.0,0.0754,female


In [66]:
rnaseq_m = rnaseq[rnaseq.SEX=='male'].drop(columns=['SEX'])
rnaseq_m = rnaseq_m[['ENSG','SMTSD','TPM']]
exfiles_m = rnaseq_m.pivot(index='ENSG', columns='SMTSD')
exfiles_m.columns = exfiles_m.columns.get_level_values(1)
exfiles_m = exfiles_m.reset_index(drop=False)
exfiles_m['SEX'] = 'male'
exfiles_m.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,Brain - Cortex,...,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,SEX
0,ENSG00000039139.9,0.07429,0.06432,0.079345,0.07173,0.06564,0.057975,0.9218,0.4645,0.26985,...,0.1655,0.59615,2.8155,0.072565,0.11255,0.1305,0.07748,0.6789,1.096,male
1,ENSG00000082516.8,16.94,10.31,7.6125,10.32,10.43,12.48,13.385,9.9115,5.933,...,16.22,3.7645,9.512,10.125,13.095,6.367,9.679,5.703,12.01,male
2,ENSG00000100302.6,3.261,2.1845,2.553,19.86,7.353,3.634,6.519,7.9985,48.08,...,1.913,1.163,3.498,2.4375,2.7515,6.021,0.425,1.68,2.704,male
3,ENSG00000100580.7,7.337,7.542,10.55,6.99,7.849,9.1845,11.93,13.86,9.336,...,10.17,3.461,19.465,6.239,6.708,6.669,13.54,5.198,15.1,male
4,ENSG00000101200.5,0.602,0.10039,0.33475,0.0,0.0,0.0,0.0,0.4107,0.22565,...,0.0,0.0,0.14225,0.0,0.0,0.0,0.0,0.0,0.0,male


In [67]:
exfiles = pandas.concat([exfiles_f, exfiles_m])
cols = ['ENSG','SEX']+tissues.tolist()
exfiles = exfiles[cols]
exfiles.head()

SMTSD,ENSG,SEX,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,...,Muscle - Skeletal,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid
0,ENSG00000039139.9,female,0.07791,0.052615,0.07917,0.08334,0.06381,0.04626,0.6133,0.38585,...,0.01151,0.1448,0.56865,3.337,0.07157,0.090805,0.1469,0.09615,0.6284,1.075
1,ENSG00000082516.8,female,15.57,11.12,7.94,10.42,10.57,12.92,13.78,9.5295,...,10.22,15.83,3.412,10.56,11.28,13.91,6.4245,9.688,5.186,10.83
2,ENSG00000100302.6,female,2.329,2.0015,2.873,18.81,6.09,3.982,5.7125,8.445,...,6.118,1.7685,1.031,4.525,2.015,2.481,6.015,0.41745,1.7725,2.306
3,ENSG00000100580.7,female,7.47,7.3135,11.33,7.127,7.6,9.406,10.0505,12.98,...,2.065,10.265,3.321,19.74,6.604,6.659,7.21,14.51,5.252,15.16
4,ENSG00000101200.5,female,0.6961,0.1427,0.2953,0.0,0.05195,0.0,0.19445,0.32035,...,0.02967,0.025665,0.0,0.2779,0.0,0.0,0.02317,0.05532,0.0,0.0754


In [68]:
exfiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 0 to 122
Data columns (total 32 columns):
ENSG                                     246 non-null object
SEX                                      246 non-null object
Adipose - Subcutaneous                   234 non-null float64
Adipose - Visceral (Omentum)             228 non-null float64
Adrenal Gland                            220 non-null float64
Artery - Aorta                           228 non-null float64
Artery - Coronary                        218 non-null float64
Artery - Tibial                          226 non-null float64
Bladder                                  188 non-null float64
Brain - Cerebellum                       208 non-null float64
Brain - Cortex                           210 non-null float64
Colon - Sigmoid                          226 non-null float64
Colon - Transverse                       228 non-null float64
Esophagus - Gastroesophageal Junction    226 non-null float64
Esophagus - Mucosa           

### Save expression profiles:

In [0]:
exfiles.round(3).to_csv('gtex_rnaseq_prep_profiles.tsv', sep='\t', index=False)
google.colab.files.download('gtex_rnaseq_prep_profiles.tsv')