- This notebook is used for performing preprocessing of GTEx expression data
- webiste available at https://www.gtexportal.org/home/datasets
- We're analyzing RNA Sequencing Gene read counts

In [2]:
import pandas as pd
import numpy as np

In [6]:
# unzip within read_csv would be a more optimal approach
raw = pd.read_csv("~/dataSets/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct",sep = "\t",skiprows = 2)

In [3]:
raw = pd.read_csv("~/dataSets/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct",sep = "\t",skiprows = 2,nrows = 5)

In [4]:
raw

Unnamed: 0,Name,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,DDX11L1,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,1,1
1,ENSG00000227232.5,WASH7P,187,109,143,251,113,139,199,473,...,72,96,136,79,89,86,49,84,34,66
2,ENSG00000278267.1,MIR6859-1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000243485.5,MIR1302-2HG,1,0,0,1,0,0,0,0,...,0,0,1,0,2,2,0,1,0,0
4,ENSG00000237613.2,FAM138A,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


# 7 gb of raw GTEX genes

# Trim Genes by using protein coding gene list from biomart

In [11]:
# load in list for protein coding genes
proteinGenes = list(pd.read_csv("/home/ydong/dataSets/proteinCodedGenes.csv",header = None)[0])

In [14]:
# get the protein coding GTEx genes, and trim ENSGXXXX.Y to ENSGXXXX
rawNames = list(raw["Name"])
names = []
keepL = []
for i in range(len(rawNames)):
    curr = rawNames[i].split(".")[0]
    if curr in proteinGenes:
        keepL.append(i)
        names.append(curr)
raw = raw.iloc[keepL,:]

In [40]:
raw = raw.set_index("Name")
originNames = list(raw.index)

In [46]:
# get rid of dulplicate genes
dup = list(set([x for x in names if names.count(x) > 1]))

# check for each duplication
for j in range(len(dup)):

    allZero = True
    # get their column number in normedMat
    indices = [i for i, x in enumerate(names) if x == dup[j]]
    
    curr = raw.iloc[indices,:].drop(["Description"],axis = 1)
       
    # pick the gene with highest median expression across individuals:
    meds = []
    for c in range(len(curr.index)):
        currMed = np.median(curr.iloc[c,:])
        meds.append(currMed)
        if currMed != 0:
            allZero = False
        
    # if all of the medians are zero, discard all of the currently chosen genes 
    if allZero == True:
        for d in list(curr.columns):
            originNames.remove(d)
    # keep the one with highest "expression" and discard the rest
    else:
        bestI = meds.index(max(meds))
        choices = list(curr.index)
        choices.pop(bestI)
        for d in range(len(choices)):
            originNames.remove(choices[d])

In [48]:
len(originNames)

19199

# removed 18 duplications

In [51]:
raw = raw.loc[originNames,:]
raw = raw.drop("Description",axis = 1)

# Removed 5 tissues, 50 left

In [82]:
# tissue annotation
samples = pd.read_csv('/home/ydong/dataSets/GTEX/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep="\t")
#find the sample IDs for muscle samples
#all of them are Muscle-skeletal samples
tissues = samples[["SAMPID","SMTSD"]]
tissues = tissues.set_index("SAMPID")
tissueTypes = list(tissues["SMTSD"].unique())

In [86]:
# remove tissues with < 50 samples
count = 0
dropList = []
for i in tissueTypes:
    if sum(tissues["SMTSD"] == i) <=50:
        dropList.append(i)
    count +=1
for i in dropList:
    tissueTypes.remove(i)

In [94]:
rawTis = list(raw.columns)
dropList = []
for i in rawTis:
    if tissues["SMTSD"][i] not in tissueTypes:
        dropList.append(i)
        
raw = raw.drop(dropList,axis = 1)

# Library Normalization (like DESEQ2)

In [98]:
# get median expression
medianExpression = []
for n in raw.index:
    cols = raw.loc[n,:]
    medianExpression.append(int(np.median(list(cols))))
medianExpression = np.array(medianExpression)

In [128]:
# compute fold change regardless of nan values
FC = []
for sample in list(raw.columns):
    currFC = []
    currCol = np.array(raw[sample])
    currFC = currCol/medianExpression
    FC.append(np.nanmedian(currFC))

  """
  """


In [141]:
for i in range(len(FC)):
# for i in [0,1]:
    currSample = list(raw.columns)[i]
    currCol = raw[currSample]
    currFC = FC[i]
    raw.iloc[:,i] = np.around(currCol / currFC, 3)

In [142]:
raw.to_csv("normedGTEx.csv")

# Keep genes with median expression > 0 in at least 1 tissue

In [145]:
len(tissueTypes)

50

In [166]:
# list of gtex corresponding tissues
indexDict = {key: [] for key in tissueTypes} 
gtexTis = []
dropList = []
for i in range(len(raw.columns)):
    currSamp  = raw.columns[i]
    currTis = tissues["SMTSD"][currSamp]
    if currTis not in indexDict:
        dropList.append(currSamp)
    else:
        gtexTis.append(currTis)
        indexDict[currTis].append(i)

In [185]:
#Loop through all tissues to see if there's one tissue with median > 0
dropGeneList = []
for currGene in raw.index:
    currRow = raw.loc[currGene,:]
    keep = False
    
    for currTis in tissueTypes:
        tisIdx = indexDict[currTis]
        currMedian = np.median(currRow.iloc[tisIdx])
        if currMedian >0:
            keep = True
            break
    
    if keep == False:
        dropGeneList.append(currGene)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [187]:
len(dropGeneList)

350

In [189]:
raw = raw.drop(dropGeneList,axis = 0)

In [191]:
raw.to_csv("GTExTrimmedNormedCounts.csv")

# generate median scale

In [192]:
# get median expression
medianExpression = []
for n in raw.index:
    cols = raw.loc[n,:]
    medianExpression.append(int(np.median(list(cols))))
medianExpression = np.array(medianExpression)

In [195]:
medianScale = pd.DataFrame(medianExpression,index = raw.index)
medianScale = medianScale[medianScale["0"]!= 0]
medianScale.to_csv("medianScale.csv")

In [6]:
medianScale

Unnamed: 0_level_0,0
Name,Unnamed: 1_level_1
ENSG00000186092.4,1
ENSG00000187634.11,181
ENSG00000188976.10,3276
ENSG00000187961.13,759
ENSG00000187583.10,47
...,...
ENSG00000212907.2,86431
ENSG00000198886.2,926360
ENSG00000198786.2,223490
ENSG00000198695.2,70595


##  Log transform the matrix

In [3]:
raw = pd.read_csv("GTExTrimmedNormedCounts.csv",index_col = "Name")

In [6]:
raw = raw + 1
raw

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000186092.4,1.0,5.409,2.332,1.000,2.549,2.450,3.239,1.000,1.000,2.185,...,5.019,4.020,2.0,4.442,3.664,2.091,4.135,1.000,7.729,1.000
ENSG00000187634.11,21.0,78.154,19.645,667.201,1571.843,167.705,1216.930,134.066,50.583,47.226,...,781.744,397.324,83.0,181.154,87.138,50.091,48.026,489.701,180.988,13.647
ENSG00000188976.10,3840.0,11545.376,4875.233,2849.455,2784.830,2497.233,3156.147,4508.605,3615.565,4657.954,...,3116.943,2701.284,3026.0,3208.193,3331.957,2577.727,5642.504,3445.031,9809.511,2309.589
ENSG00000187961.13,509.0,153.103,380.551,937.055,755.438,721.457,806.022,1371.928,1157.264,1777.730,...,631.021,576.990,1134.0,855.869,635.933,114.455,745.572,499.756,403.030,516.614
ENSG00000187583.10,86.0,51.701,80.905,64.715,101.695,92.326,44.666,6259.471,163.631,3808.110,...,34.159,51.578,29.0,73.291,27.641,23.909,5766.338,35.189,44.735,24.349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000212907.2,24132.0,193023.768,33625.216,23698.461,278593.172,106976.597,18420.206,21785.277,84269.452,33848.959,...,243205.298,137282.941,116738.0,179743.812,155728.358,78865.000,93407.945,44003.152,302434.861,101507.918
ENSG00000198886.2,273189.0,2186512.951,369734.195,263081.144,3790888.580,1235995.620,217413.914,277694.504,1004167.823,515795.675,...,1744130.055,1475150.392,1202014.0,1913739.507,1465082.274,762931.545,953131.497,506749.248,2722876.337,913490.294
ENSG00000198786.2,85259.0,882866.942,158080.626,35859.679,562079.123,131535.930,33406.594,61912.503,109954.184,109019.234,...,1262615.931,276934.520,374008.0,666223.962,616904.761,138239.909,156583.701,367522.895,1335442.161,337774.002
ENSG00000198695.2,32049.0,362834.337,68927.446,6868.213,147593.582,27453.760,7649.264,13150.880,19271.739,44710.838,...,466105.225,97165.186,140564.0,267236.503,238035.237,39553.000,38638.799,189908.412,511059.589,114822.437


In [11]:
new = np.log(raw)
new.to_csv("logTrimmedNormedCounts.csv")