In [None]:
# We are working with the following data :

# TPM normalized 
# bulk-gex_v8_rna-seq_GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct

# Sample metadata, downloaded from the GTEx v8 GitHub repository:
# https://github.com/broadinstitute/gtex-v8/tree/master/data
# GTEx_Analysis_v8_RNAseq_samples.txt


In [None]:

# We have applied the following pre-processing steps described below :
# I preferred awk/Unix shell commands because they speed up the processing of large files

# Pre-processing of the TPM matrix: strip the leading lines and the ID column,
# report the matrix dimensions, then keep a single row per gene name.

file="bulk-gex_v8_rna-seq_GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct"

# to remove the first 2 lines
awk 'NR>2' "$file" > df.txt

# to remove the 1st column that contains the ENSEMBL ID
cut -f2- df.txt > df2.txt

# to compute the number of columns (samples)
# The first remaining field is the gene-name column, so the number of sample
# columns is the number of tab-separated fields minus one.
# awk with an explicit tab separator is used here because "\t" is not a
# portable tab escape in a grep pattern (GNU grep reads it as a literal "t").
head -n 1 df2.txt | awk -F'\t' '{print NF - 1}'
# 17382

# to compute the number of rows (genes)
# NOTE(review): this counts every line of df2.txt, so a header row, if still
# present at this stage, is included in the figure below.
wc -l < df2.txt
# 56201

# to keep only one entry per gene, if the gene name is duplicated
# (the first occurrence of each value in column 1 is kept, later ones are dropped)
awk '!a[$1]++' df2.txt > df3.txt

wc -l < df3.txt
# 54592 genes

In [None]:
# To select only the protein coding genes
# To get the names of the protein-coding genes in the human genome,
# we downloaded the file Homo_sapiens.GRCh38.104.chr.gtf.gz 
# and extracted the names of the protein coding genes

# Download the Ensembl release-104 annotation. wget saves into the current
# directory by default; a trailing "./" would be interpreted as a second URL.
wget http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.chr.gtf.gz

# Keep only the records annotated as protein coding. The download is
# gzip-compressed, so it is decompressed on the fly and piped into grep
# (grep without an input file would otherwise wait on standard input).
gunzip -c Homo_sapiens.GRCh38.104.chr.gtf.gz \
| grep "gene_biotype \"protein_coding\"" \
> Homo_sapiens.GRCh38.104.protein_coding.gtf

# From the gene-level records, print the value that follows the gene_name key
# (whitespace-separated field 14), strip the quotes and the trailing semicolon,
# and de-duplicate. The $13 check skips records where field 13 is not
# "gene_name", so that some other attribute value is not taken for a gene name.
awk '$3=="gene" && $13=="gene_name"{print $14}' Homo_sapiens.GRCh38.104.protein_coding.gtf \
| sed 's/[";]//g' | sort -u \
> Homo_sapiens.GRCh38.104.protein_coding.txt

# in this particular release from Ensembl we counted 19352 names of the protein coding genes
# NOTE(review): that figure was recorded before the gene_name check was added;
# re-count with `wc -l < Homo_sapiens.GRCh38.104.protein_coding.txt` to confirm.

In [None]:
# To extract from data frame df3 only the expression data of the protein-coding genes, 
# by considering the names listed in  Homo_sapiens.GRCh38.104.protein_coding.txt

file1="Homo_sapiens.GRCh38.104.protein_coding.txt"
file2="df3.txt"

# Save the first line of the expression table (the sample names) before
# filtering: its first field is not a gene name from file1, so the filter
# below drops it, and the final concatenation needs it as df3.header.
head -n 1 "$file2" > df3.header

# First pass (NR==FNR) loads the protein-coding gene names from file1;
# second pass prints only the rows of file2 whose first field is one of them.
awk 'NR==FNR{c[$1]++;next};c[$1] > 0' "$file1" "$file2" > df3.protein_coding.txt

# To add the header containing the names of the samples to the file df3.protein_coding.txt
cat df3.header df3.protein_coding.txt > df3.protein_coding_genes.txt

In [None]:

# To restrict the analysis to the following tissues :
# liver, heart, kidney, lung, muscle, pancreas, spleen, stomach, pituitary gland, and thyroid.
# To extract the IDs of the samples from the metadata file : GTEx_Analysis_v8_RNAseq_samples.txt

# Common prefix of the metadata file and of every per-tissue file derived from it.
samples="GTEx_Analysis_v8_RNAseq_samples"

# Tissues of interest, listed alphabetically; the per-tissue files are
# concatenated in this same order further down.
tissues=(Heart Kidney Liver Lung Muscle Pancreas Pituitary Spleen Stomach Thyroid)

# One file per tissue, holding the metadata lines that mention the tissue name.
for tissue in "${tissues[@]}"; do
    grep "$tissue" "$samples.txt" > "$samples.$tissue.txt"
done

# We are working with the following number of samples
for tissue in "${tissues[@]}"; do
    wc -l "$samples.$tissue.txt"
done

# Recorded counts (kept as comments so the shell does not try to execute them):
# 861 : GTEx_Analysis_v8_RNAseq_samples.Heart.txt
# 89  : GTEx_Analysis_v8_RNAseq_samples.Kidney.txt
# 226 : GTEx_Analysis_v8_RNAseq_samples.Liver.txt
# 578 : GTEx_Analysis_v8_RNAseq_samples.Lung.txt
# 803 : GTEx_Analysis_v8_RNAseq_samples.Muscle.txt
# 328 : GTEx_Analysis_v8_RNAseq_samples.Pancreas.txt
# 283 : GTEx_Analysis_v8_RNAseq_samples.Pituitary.txt
# 241 : GTEx_Analysis_v8_RNAseq_samples.Spleen.txt
# 359 : GTEx_Analysis_v8_RNAseq_samples.Stomach.txt
# 653 : GTEx_Analysis_v8_RNAseq_samples.Thyroid.txt

# Concatenate the per-tissue files into a single list of selected samples.
for tissue in "${tissues[@]}"; do
    cat "$samples.$tissue.txt"
done > "$samples.ASSIGN.txt"

# There are 4421 samples

In [None]:
# We perform the next steps of the analysis in R.