<table width="100%" style="border:none">
  <tr>
    <td valign="top">
      <h1>GTEx RNAseq Preprocessing</h1>
      <ul>
<li>Author: Jeremy Yang
<li>Based on R code by Oleg Ursu.
<li>Required: Python3, Pandas 0.22+
<li>Clean, tidy, reshape RNAseq expression data.
<li>Save aggregated-samples median TPM file for downstream co-expression analysis.
<li>Save expression profiles (exfiles) TPM file for downstream co-expression analysis.
      </ul>
      [About Colaboratory](https://research.google.com/colaboratory/faq.html).
    </td>
    <td align="right">
        <p>NIH Data Commons: Team Helium</p>
<img style="float:right" width="100" src="https://avatars2.githubusercontent.com/u/33356654?s=200&v=4" alt="HeliumDataCommons Logo" />
    </td>
  </tr>
  </table>
 

In [1]:
import sys,os,re,time,io
import urllib.request
try:
    import google.colab
except:
    pass
import numpy,scipy
import pandas as pd
print('Python: %s; Pandas: %s; Scipy: %s ; Numpy: %s'%(sys.version.split()[0],pd.__version__,scipy.__version__,numpy.__version__))

Python: 3.7.4; Pandas: 0.25.1; Scipy: 1.3.1 ; Numpy: 1.17.2


### Get subjects datafile:
(GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt)

In [2]:
url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
subjects = pd.read_csv(url, sep="\t")
print("dataset nrows: %d ; ncols: %d:"%(subjects.shape[0],subjects.shape[1]), file=sys.stderr)

dataset nrows: 980 ; ncols: 4:


In [3]:
subjects.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [4]:
subjects.AGE.value_counts().sort_index()

20-29     84
30-39     78
40-49    153
50-59    315
60-69    317
70-79     33
Name: AGE, dtype: int64

### Remove less healthy subjects: 
(DTHHRDY = 4-point Hardy Scale Death Classification.)

In [5]:
print("Subjects with Hardy score > 2 or NA: %d (removing)"%(subjects.query('DTHHRDY > 2').shape[0]), file=sys.stderr)
subjects = subjects.query('DTHHRDY <= 2')
print("dataset ncols: %d ; nrows: %d:"%(subjects.shape[0],subjects.shape[1]), file=sys.stderr)
subjects.DTHHRDY.value_counts(sort=True, dropna=False).sort_index()

Subjects with Hardy score > 2 or NA: 176 (removing)
dataset ncols: 785 ; nrows: 4:


0.0    511
1.0     35
2.0    239
Name: DTHHRDY, dtype: int64

### Get samples datafile:
(GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt)

In [6]:
url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
samples = pd.read_csv(url, sep="\t")
samples = samples[['SAMPID', 'SMATSSCR', 'SMTS', 'SMTSD', 'SMUBRID']]
print("dataset nrows: %d ; ncols: %d:"%(samples.shape[0],samples.shape[1]), file=sys.stderr)

dataset nrows: 22951 ; ncols: 5:


 * SMTS = Tissue Type (parent of SMTSD)
 * SMTSD = Tissue Type, more specific
 * SMATSSCR = Autolysis Score, 0=None, 1=Mild, 2=Moderate, 3=Severe
 * Note that other sample attributes may be suitable for quality criteria.
 * SMUBRID = Uberon ID, anatomical location
 * SUBJID is first two hyphen-delimted fields of SAMPID.

In [7]:
samples['SUBJID'] = samples.SAMPID.str.extract('^([^-]+-[^-]+)-', expand=True)

In [8]:
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,,Blood,Whole Blood,13756,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,,Blood,Whole Blood,13756,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,,Blood,Whole Blood,13756,GTEX-1117F
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,Brain,Brain - Frontal Cortex (BA9),9834,GTEX-1117F


In [9]:
samples.SMATSSCR.value_counts(dropna=False).sort_index()

0.0     3554
1.0    10410
2.0     1582
3.0      193
NaN     7212
Name: SMATSSCR, dtype: int64

### Remove samples with high degree of autolysis (self-digestion).


In [10]:
print("Removing %d/%d (%.1f%%)"%((~(samples.SMATSSCR<2)).sum(), samples.SMATSSCR.size, 100*(~(samples.SMATSSCR<2)).sum()/samples.SMATSSCR.size))

Removing 8987/22951 (39.2%)


In [11]:
samples = samples[samples.SMATSSCR < 2]
print("Remaining samples %d"%(samples.SMATSSCR.size))

Remaining samples 13964


### Clean & tidy cols. 

In [12]:
samples.loc[(samples.SMTS.str.strip() == '') & samples.SMTSD.str.startswith("Skin -"), 'SMTS'] = 'Skin'

In [13]:
(samples.SMTS+" : "+samples.SMTSD).value_counts().sort_index()

Adipose Tissue : Adipose - Subcutaneous                752
Adipose Tissue : Adipose - Visceral (Omentum)          560
Adrenal Gland : Adrenal Gland                          209
Bladder : Bladder                                        8
Blood Vessel : Artery - Aorta                          444
Blood Vessel : Artery - Coronary                       251
Blood Vessel : Artery - Tibial                         757
Brain : Brain - Cerebellum                             226
Brain : Brain - Cortex                                 268
Breast : Breast - Mammary Tissue                       460
Cervix Uteri : Cervix - Ectocervix                       7
Cervix Uteri : Cervix - Endocervix                       7
Colon : Colon - Sigmoid                                342
Colon : Colon - Transverse                             260
Esophagus : Esophagus - Gastroesophageal Junction      384
Esophagus : Esophagus - Mucosa                         559
Esophagus : Esophagus - Muscularis                     5

### MERGE samples with subjects:

In [14]:
samples = pd.merge(samples, subjects, how='inner', on='SUBJID')
samples.head()

Unnamed: 0,SAMPID,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-111CU-0126-SM-5GZWZ,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,1,50-59,0.0
1,GTEX-111CU-0226-SM-5GZXC,0.0,Thyroid,Thyroid,2046,GTEX-111CU,1,50-59,0.0
2,GTEX-111CU-0326-SM-5GZXO,0.0,Lung,Lung,8952,GTEX-111CU,1,50-59,0.0
3,GTEX-111CU-0426-SM-5GZY1,0.0,Spleen,Spleen,2106,GTEX-111CU,1,50-59,0.0
4,GTEX-111CU-0526-SM-5EGHK,1.0,Pancreas,Pancreas,1150,GTEX-111CU,1,50-59,0.0


### Clean & tidy:

In [15]:
samples.dropna(how='any', inplace=True)
print(samples.shape)
samples.SEX = samples.SEX.apply(lambda x: 'female' if x==2 else 'male' if x==1 else None)
samples.SEX.value_counts().sort_index()

(11984, 9)


female    4114
male      7870
Name: SEX, dtype: int64

### READ GENE TPMs (full or demo subset)
Full file is ~56k rows, ~4GB uncompressed.  Demo ~1k rows.

*   GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
*   GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm_DEMO-1000.gct.gz


In [16]:
t0 = time.time()
try:
    rnaseq = pd.read_table("/home/data/GTEx/exfiles_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm_DEMO-1000.gct.gz", compression='gzip', sep='\t', skiprows=2)
    print("dataset nrows: %d ; ncols: %d:"%(rnaseq.shape[0],rnaseq.shape[1]), file=sys.stderr)
except:
    try:
        print('Upload GTEx RNAseq TPM datafile: ')
        uploaded = google.colab.files.upload()
        fn = list(uploaded.keys())[0]
        print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
        rnaseq = pd.read_table(io.BytesIO(uploaded[fn]), compression='gzip', sep='\t', skiprows=2)
        print("dataset nrows: %d ; ncols: %d:"%(rnaseq.shape[0],rnaseq.shape[1]), file=sys.stderr)
    except:
        print("Failed.")
print("Elapsed: %ds"%(time.time()-t0))

Elapsed: 25s


dataset nrows: 997 ; ncols: 17384:


In [17]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 997 entries, 0 to 996
Columns: 17384 entries, Name to GTEX-ZZPU-2726-SM-5NQ8O
dtypes: float64(17382), object(2)
memory usage: 132.2+ MB


In [18]:
rnaseq = rnaseq.drop(columns=['Description'])
rnaseq = rnaseq.rename(columns={'Name':'ENSG'})
rnaseq.head()

Unnamed: 0,ENSG,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03629,0.0,0.0,0.0,0.0,0.0,0.0,0.01965,0.02522
1,ENSG00000227232.5,8.764,3.861,7.349,11.07,3.306,5.389,11.99,16.95,10.04,...,1.606,2.268,5.386,2.31,2.456,4.023,1.922,2.857,0.8696,2.167
2,ENSG00000278267.1,0.0,0.0,1.004,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,0.07187,0.0,0.0,0.06761,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.06073,0.0,0.08464,0.1435,0.0,0.05216,0.0,0.0
4,ENSG00000237613.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03904,0.0,...,0.02429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### MELT: One row per ENSG+SAMPID+TPM triplet:
Easier to handle but ~3x storage.

In [19]:
rnaseq = rnaseq.melt(id_vars = "ENSG", var_name = "SAMPID", value_name = "TPM")

In [20]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM
0,ENSG00000223972.5,GTEX-1117F-0226-SM-5GZZ7,0.0
1,ENSG00000227232.5,GTEX-1117F-0226-SM-5GZZ7,8.764
2,ENSG00000278267.1,GTEX-1117F-0226-SM-5GZZ7,0.0
3,ENSG00000243485.5,GTEX-1117F-0226-SM-5GZZ7,0.07187
4,ENSG00000237613.2,GTEX-1117F-0226-SM-5GZZ7,0.0


In [21]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17329854 entries, 0 to 17329853
Data columns (total 3 columns):
ENSG      object
SAMPID    object
TPM       float64
dtypes: float64(1), object(2)
memory usage: 396.6+ MB


### Read and merge gene symbols.
File from https://www.ensembl.org/biomart, dataset human genes, fields Gene stable ID, Gene stable ID version,  NCBI gene ID, HGNC symbol.

In [22]:
try:
    genes = pd.read_csv("/home/data/GTEx/exfiles_data/biomart_ENSG2xrefs_human.tsv", sep='\t', usecols=[1,2,4], na_values=[''], dtype={2:str})
except Exception as e:
    print(e)
    try:
        print('Upload Biomart ENSG2NCBI genes datafile: ')
        uploaded = google.colab.files.upload()
        fn = list(uploaded.keys())[0]
        print('Uploaded "{name}" with {length} bytes'.format(name=fn, length=len(uploaded[fn])))
        genes = pandas.read_csv(io.StringIO(uploaded[fn].decode('utf8')), sep='\t', usecols=[1,2,4], na_values=[''], dtype={2:str})
    except:
        print("Failed.")
genes.columns = ['ENSG', 'NCBI', 'HGNC']
genes.dropna(inplace=True)

In [23]:
genes.head()

Unnamed: 0,ENSG,NCBI,HGNC
0,ENSG00000198888.2,4535,MT-ND1
1,ENSG00000198763.3,4536,MT-ND2
2,ENSG00000198804.2,4512,MT-CO1
3,ENSG00000198712.1,4513,MT-CO2
4,ENSG00000228253.1,4509,MT-ATP8


In [24]:
rnaseq = pd.merge(rnaseq, genes, on='ENSG', how='inner')

In [25]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC
0,ENSG00000187642.9,GTEX-1117F-0226-SM-5GZZ7,1.282,84808,PERM1
1,ENSG00000187642.9,GTEX-1117F-0426-SM-5EGHI,19.69,84808,PERM1
2,ENSG00000187642.9,GTEX-1117F-0526-SM-5EGHJ,2.266,84808,PERM1
3,ENSG00000187642.9,GTEX-1117F-0626-SM-5N9CS,0.7919,84808,PERM1
4,ENSG00000187642.9,GTEX-1117F-0726-SM-5GIEN,47.8,84808,PERM1


### Remove genes in pseudoautosomal regions (PAR) of chromosome Y ("ENSGR").

In [26]:
n_ensgr = rnaseq.ENSG.str.startswith('ENSGR').sum()
print('ENSGR gene TPMs: %d (%.2f%%)'%(n_ensgr,100*n_ensgr/rnaseq.shape[0]))

ENSGR gene TPMs: 0 (0.00%)


In [27]:
rnaseq = rnaseq[~rnaseq.ENSG.str.startswith('ENSGR')]

### Merge with samples:

In [28]:
rnaseq = pd.merge(rnaseq, samples, how="inner", on="SAMPID")
rnaseq = rnaseq.reset_index(drop=True)

In [29]:
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000187642.9,GTEX-111CU-0126-SM-5GZWZ,0.669,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0
1,ENSG00000131591.17,GTEX-111CU-0126-SM-5GZWZ,4.028,54991,C1orf159,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0
2,ENSG00000184163.3,GTEX-111CU-0126-SM-5GZWZ,0.7839,388581,C1QTNF12,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0
3,ENSG00000175756.13,GTEX-111CU-0126-SM-5GZWZ,74.15,54998,AURKAIP1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0
4,ENSG00000235098.8,GTEX-111CU-0126-SM-5GZWZ,0.8247,441869,ANKRD65,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0


In [30]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848240 entries, 0 to 848239
Data columns (total 13 columns):
ENSG        848240 non-null object
SAMPID      848240 non-null object
TPM         848240 non-null float64
NCBI        848240 non-null object
HGNC        848240 non-null object
SMATSSCR    848240 non-null float64
SMTS        848240 non-null object
SMTSD       848240 non-null object
SMUBRID     848240 non-null object
SUBJID      848240 non-null object
SEX         848240 non-null object
AGE         848240 non-null object
DTHHRDY     848240 non-null float64
dtypes: float64(3), object(10)
memory usage: 84.1+ MB


### Remove data for gene-tissue pairs with all zero expression.

In [31]:
maxtpm_0 = (rnaseq[['ENSG', 'SMTSD', 'TPM']].groupby(by=['ENSG','SMTSD'], as_index=True).max() == 0).rename(columns={'TPM':'maxtpm_0'})
print(maxtpm_0.maxtpm_0.value_counts(dropna=False))

False    3083
True       77
Name: maxtpm_0, dtype: int64


In [32]:
maxtpm_0.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3160 entries, (ENSG00000008130.15, Adipose - Subcutaneous) to (ENSG00000280267.4, Vagina)
Data columns (total 1 columns):
maxtpm_0    3160 non-null bool
dtypes: bool(1)
memory usage: 10.3+ KB


In [33]:
rnaseq = pd.merge(rnaseq, maxtpm_0, left_on=['ENSG', 'SMTSD'], right_index=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,maxtpm_0
0,ENSG00000187642.9,GTEX-111CU-0126-SM-5GZWZ,0.669,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0,False
2320,ENSG00000187642.9,GTEX-111YS-0126-SM-5987T,0.3483,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111YS,male,60-69,0.0,False
4320,ENSG00000187642.9,GTEX-1122O-0326-SM-5H124,0.1945,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-1122O,female,60-69,0.0,False
8160,ENSG00000187642.9,GTEX-117YX-0126-SM-5EGH5,0.7508,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-117YX,male,50-59,0.0,False
11120,ENSG00000187642.9,GTEX-11DXX-0126-SM-5EGH7,0.2936,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-11DXX,female,60-69,0.0,False


In [34]:
rnaseq = rnaseq[~rnaseq['maxtpm_0']]
rnaseq.drop(columns=['maxtpm_0'], inplace=True)
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY
0,ENSG00000187642.9,GTEX-111CU-0126-SM-5GZWZ,0.669,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0
2320,ENSG00000187642.9,GTEX-111YS-0126-SM-5987T,0.3483,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111YS,male,60-69,0.0
4320,ENSG00000187642.9,GTEX-1122O-0326-SM-5H124,0.1945,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-1122O,female,60-69,0.0
8160,ENSG00000187642.9,GTEX-117YX-0126-SM-5EGH5,0.7508,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-117YX,male,50-59,0.0
11120,ENSG00000187642.9,GTEX-11DXX-0126-SM-5EGH7,0.2936,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-11DXX,female,60-69,0.0


In [35]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 841134 entries, 0 to 687119
Data columns (total 13 columns):
ENSG        841134 non-null object
SAMPID      841134 non-null object
TPM         841134 non-null float64
NCBI        841134 non-null object
HGNC        841134 non-null object
SMATSSCR    841134 non-null float64
SMTS        841134 non-null object
SMTSD       841134 non-null object
SMUBRID     841134 non-null object
SUBJID      841134 non-null object
SEX         841134 non-null object
AGE         841134 non-null object
DTHHRDY     841134 non-null float64
dtypes: float64(3), object(10)
memory usage: 89.8+ MB


### Remove data for gene-tissue pairs not present in both sexes. (This removes most sex specific tissues.)

In [36]:
sex_count = (rnaseq[['ENSG', 'SMTSD', 'SEX']].groupby(by=['ENSG','SMTSD'], as_index=True).nunique()).rename(columns={'SEX':'sex_count'})
print(sex_count.sex_count.value_counts())

2    2414
1     669
Name: sex_count, dtype: int64


In [37]:
sex_count.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3083 entries, (ENSG00000008130.15, Adipose - Subcutaneous) to (ENSG00000280267.4, Vagina)
Data columns (total 3 columns):
ENSG         3083 non-null int64
SMTSD        3083 non-null int64
sex_count    3083 non-null int64
dtypes: int64(3)
memory usage: 79.4+ KB


In [38]:
sex_count = sex_count[['sex_count']] #Why needed?
sex_count.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3083 entries, (ENSG00000008130.15, Adipose - Subcutaneous) to (ENSG00000280267.4, Vagina)
Data columns (total 1 columns):
sex_count    3083 non-null int64
dtypes: int64(1)
memory usage: 31.2+ KB


In [39]:
rnaseq = pd.merge(rnaseq, sex_count, left_on=['ENSG', 'SMTSD'], right_index=True, how="inner")
rnaseq.head()

Unnamed: 0,ENSG,SAMPID,TPM,NCBI,HGNC,SMATSSCR,SMTS,SMTSD,SMUBRID,SUBJID,SEX,AGE,DTHHRDY,sex_count
0,ENSG00000187642.9,GTEX-111CU-0126-SM-5GZWZ,0.669,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111CU,male,50-59,0.0,2
2320,ENSG00000187642.9,GTEX-111YS-0126-SM-5987T,0.3483,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-111YS,male,60-69,0.0,2
4320,ENSG00000187642.9,GTEX-1122O-0326-SM-5H124,0.1945,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-1122O,female,60-69,0.0,2
8160,ENSG00000187642.9,GTEX-117YX-0126-SM-5EGH5,0.7508,84808,PERM1,1.0,Adrenal Gland,Adrenal Gland,2369,GTEX-117YX,male,50-59,0.0,2
11120,ENSG00000187642.9,GTEX-11DXX-0126-SM-5EGH7,0.2936,84808,PERM1,0.0,Adrenal Gland,Adrenal Gland,2369,GTEX-11DXX,female,60-69,0.0,2


In [40]:
rnaseq = rnaseq[rnaseq['sex_count'] == 2]
rnaseq.drop(columns=['sex_count'], inplace=True)

In [41]:
rnaseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 778809 entries, 0 to 688639
Data columns (total 13 columns):
ENSG        778809 non-null object
SAMPID      778809 non-null object
TPM         778809 non-null float64
NCBI        778809 non-null object
HGNC        778809 non-null object
SMATSSCR    778809 non-null float64
SMTS        778809 non-null object
SMTSD       778809 non-null object
SMUBRID     778809 non-null object
SUBJID      778809 non-null object
SEX         778809 non-null object
AGE         778809 non-null object
DTHHRDY     778809 non-null float64
dtypes: float64(3), object(10)
memory usage: 83.2+ MB


In [42]:
rnaseq.SMTSD.value_counts()

Muscle - Skeletal                        51520
Artery - Tibial                          44320
Skin - Sun Exposed (Lower leg)           43914
Adipose - Subcutaneous                   43600
Nerve - Tibial                           40800
Skin - Not Sun Exposed (Suprapubic)      39200
Adipose - Visceral (Omentum)             37120
Esophagus - Mucosa                       36080
Esophagus - Muscularis                   35120
Thyroid                                  35022
Lung                                     30731
Breast - Mammary Tissue                  29680
Artery - Aorta                           29600
Heart - Left Ventricle                   28960
Heart - Atrial Appendage                 27440
Esophagus - Gastroesophageal Junction    26544
Colon - Sigmoid                          23040
Pancreas                                 21280
Stomach                                  20560
Colon - Transverse                       19118
Artery - Coronary                        16720
Spleen       

### Remove mammary tissue, although males have some too.

In [43]:
rnaseq = rnaseq[~rnaseq.SMTSD.str.match("^Breast")]

### Aggregate samples, compute median TPM by gene+tissue+sex:

In [44]:
rnaseq = rnaseq[['ENSG', 'SMTSD', 'SEX', 'TPM']].groupby(by=['ENSG','SMTSD','SEX'], as_index=False).median()
print(rnaseq.shape)
rnaseq.head()

(4670, 4)


Unnamed: 0,ENSG,SMTSD,SEX,TPM
0,ENSG00000008130.15,Adipose - Subcutaneous,female,63.265
1,ENSG00000008130.15,Adipose - Subcutaneous,male,53.59
2,ENSG00000008130.15,Adipose - Visceral (Omentum),female,58.31
3,ENSG00000008130.15,Adipose - Visceral (Omentum),male,51.745
4,ENSG00000008130.15,Adrenal Gland,female,67.88


### Save median TPMs file for analysis, 1-row per gene+tissue+sex:

In [45]:
rnaseq.round(3).to_csv('gtex_rnaseq_prep_median.tsv', sep='\t', index=False)
try:
    google.colab.files.download('gtex_rnaseq_prep_median.tsv')
except:
    pass

### Pivot TPMs to generate gene profiles:

In [46]:
tissues = pd.Series(pd.unique(rnaseq.SMTSD.sort_values()))

In [47]:
rnaseq_f = rnaseq[rnaseq.SEX=='female'].drop(columns=['SEX'])
rnaseq_f = rnaseq_f[['ENSG','SMTSD','TPM']]
exfiles_f = rnaseq_f.pivot(index='ENSG', columns='SMTSD')
exfiles_f.columns = exfiles_f.columns.get_level_values(1)
exfiles_f = exfiles_f.reset_index(drop=False)
exfiles_f['SEX'] = 'female'
exfiles_f.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,Brain - Cortex,...,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,SEX
0,ENSG00000008130.15,63.265,58.31,67.88,52.91,55.63,64.275,41.735,23.725,15.05,...,50.225,15.85,30.73,30.165,31.61,45.95,91.19,36.19,42.065,female
1,ENSG00000011007.12,28.49,28.155,16.3,22.59,23.8,21.98,28.805,20.82,8.917,...,24.46,13.88,20.6,27.905,28.11,20.55,23.11,31.0,28.05,female
2,ENSG00000041988.15,15.13,11.91,12.2,16.9,16.21,18.935,16.975,18.31,9.9845,...,19.89,5.858,15.11,11.265,12.22,8.854,13.01,9.044,18.205,female
3,ENSG00000049246.14,14.63,6.434,9.508,12.72,12.08,21.875,20.825,64.96,12.745,...,29.135,4.622,30.83,14.36,19.15,5.8585,5.333,9.725,13.635,female
4,ENSG00000053371.12,57.755,47.125,95.15,54.31,49.07,43.6,59.745,37.32,36.78,...,60.745,28.96,62.87,52.97,55.31,65.385,43.95,50.38,53.47,female


In [48]:
rnaseq_m = rnaseq[rnaseq.SEX=='male'].drop(columns=['SEX'])
rnaseq_m = rnaseq_m[['ENSG','SMTSD','TPM']]
exfiles_m = rnaseq_m.pivot(index='ENSG', columns='SMTSD')
exfiles_m.columns = exfiles_m.columns.get_level_values(1)
exfiles_m = exfiles_m.reset_index(drop=False)
exfiles_m['SEX'] = 'male'
exfiles_m.head()

SMTSD,ENSG,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,Brain - Cortex,...,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid,SEX
0,ENSG00000008130.15,53.59,51.745,70.475,52.49,52.275,54.415,47.39,24.5,15.39,...,45.585,16.99,27.78,27.965,28.88,43.16,99.15,36.04,43.15,male
1,ENSG00000011007.12,27.04,27.14,16.345,22.02,23.735,20.855,28.83,21.35,8.581,...,24.085,14.3,19.56,26.475,28.73,18.4,23.62,31.59,29.15,male
2,ENSG00000041988.15,15.95,12.715,13.675,17.93,16.445,18.64,13.825,19.67,9.6495,...,21.025,5.909,15.22,11.725,12.51,9.394,14.62,9.469,18.76,male
3,ENSG00000049246.14,17.05,9.078,9.585,13.4,13.08,23.565,19.87,70.39,12.575,...,30.95,4.491,34.41,16.65,21.845,6.213,5.0715,9.4715,17.03,male
4,ENSG00000053371.12,56.14,48.81,93.77,54.62,46.905,45.555,54.465,37.43,35.355,...,61.415,30.95,58.4,57.485,56.64,63.71,46.54,53.14,59.35,male


In [49]:
exfiles = pd.concat([exfiles_f, exfiles_m])
cols = ['ENSG','SEX']+tissues.tolist()
exfiles = exfiles[cols]
exfiles.head()

SMTSD,ENSG,SEX,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Cerebellum,...,Muscle - Skeletal,Nerve - Tibial,Pancreas,Pituitary,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Thyroid
0,ENSG00000008130.15,female,63.265,58.31,67.88,52.91,55.63,64.275,41.735,23.725,...,9.904,50.225,15.85,30.73,30.165,31.61,45.95,91.19,36.19,42.065
1,ENSG00000011007.12,female,28.49,28.155,16.3,22.59,23.8,21.98,28.805,20.82,...,18.36,24.46,13.88,20.6,27.905,28.11,20.55,23.11,31.0,28.05
2,ENSG00000041988.15,female,15.13,11.91,12.2,16.9,16.21,18.935,16.975,18.31,...,8.409,19.89,5.858,15.11,11.265,12.22,8.854,13.01,9.044,18.205
3,ENSG00000049246.14,female,14.63,6.434,9.508,12.72,12.08,21.875,20.825,64.96,...,6.047,29.135,4.622,30.83,14.36,19.15,5.8585,5.333,9.725,13.635
4,ENSG00000053371.12,female,57.755,47.125,95.15,54.31,49.07,43.6,59.745,37.32,...,34.4,60.745,28.96,62.87,52.97,55.31,65.385,43.95,50.38,53.47


In [50]:
exfiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 0 to 78
Data columns (total 32 columns):
ENSG                                     158 non-null object
SEX                                      158 non-null object
Adipose - Subcutaneous                   158 non-null float64
Adipose - Visceral (Omentum)             158 non-null float64
Adrenal Gland                            156 non-null float64
Artery - Aorta                           158 non-null float64
Artery - Coronary                        158 non-null float64
Artery - Tibial                          158 non-null float64
Bladder                                  152 non-null float64
Brain - Cerebellum                       150 non-null float64
Brain - Cortex                           148 non-null float64
Colon - Sigmoid                          158 non-null float64
Colon - Transverse                       156 non-null float64
Esophagus - Gastroesophageal Junction    156 non-null float64
Esophagus - Mucosa            

### Save expression profiles:

In [51]:
exfiles.round(3).to_csv('gtex_rnaseq_prep_profiles.tsv', sep='\t', index=False)
try:
    google.colab.files.download('gtex_rnaseq_prep_profiles.tsv')
except:
    pass