# Process GREIN Human Data

Retrieve the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms.

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
random.seed(1234)

## Read Phenotype Information: skipping until phenotype data found

In [3]:
# path = os.path.join('download', 'TARGET_phenotype.gz')
# pheno_df = pd.read_table(path)

# print(pheno_df.shape)
# pheno_df.head(3)

## Read Probe Mapping Info 
(where chromosomes start and end, saved in a file in downloads)

In [None]:
# path = os.path.join('download', 'gencode.v23.annotation.gene.probemap')
# probe_map_df = pd.read_table(path)

# # Inner merge gene df to get ensembl to entrez mapping
# probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')

# # Mapping to rename gene expression index
# ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))

# print(probe_map_df.shape)
# probe_map_df.head(3)

## Read Gene Expression Data

In [5]:
file = os.path.join('download', 'grein_count_matrix_human.pkl')
expr_df = pd.read_pickle(file)

print(expr_df.shape)
expr_df.head(4)

(27990, 389)


Unnamed: 0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,...,GSM2677357_y,GSM2677358_y,GSM2677359_y,GSM2677360_y,GSM2677361_y,GSM2677362_y,GSM2677363_y,GSM2677364_y,GSM2677365_y,GSM2677366_y
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,...,1746.5758,1976.5137,2638.7298,1729.9494,2379.3129,2268.7277,2562.7032,2698.5277,2250.7768,2217.6718
1,ENSG00000000005,TNMD,43.928,18.0091,0.0,115.8917,63.5936,39.7803,11.7588,1.0153,...,0.0,0.0,0.0,0.0,0.0,0.9994,0.0,0.0,0.0,0.0
2,ENSG00000000419,DPM1,1097.7855,367.2333,316.5226,3895.0588,1536.1375,1084.5276,127.7205,245.1014,...,1364.0155,1370.6393,1062.8008,1309.8411,1817.321,1230.3613,1113.5876,1330.7528,1347.0459,1366.543
3,ENSG00000000457,SCYL3,601.4702,270.9239,163.6869,1177.2241,873.9074,845.1423,229.8548,134.4694,...,823.638,845.0037,811.9292,775.972,1017.1045,869.4791,740.8164,1028.2608,986.2346,926.6533


## Process gene expression matrix

This involves updating Entrez gene ids, sorting and subsetting

In [6]:
# expr_df = (expr_df
#     .dropna(axis='rows')
#     .reindex(probe_map_df.id)
#     .rename(index=ensembl_to_entrez)
#     .rename(index=old_to_new_entrez)
#     .groupby(level=0).mean()
#     .transpose()
#     .sort_index(axis='rows')
#     .sort_index(axis='columns')
# )

expr_df.index.rename('sample_id', inplace=True)

print(expr_df.shape)
expr_df.head(2)

(27990, 389)


Unnamed: 0_level_0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,...,GSM2677357_y,GSM2677358_y,GSM2677359_y,GSM2677360_y,GSM2677361_y,GSM2677362_y,GSM2677363_y,GSM2677364_y,GSM2677365_y,GSM2677366_y
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,...,1746.5758,1976.5137,2638.7298,1729.9494,2379.3129,2268.7277,2562.7032,2698.5277,2250.7768,2217.6718
1,ENSG00000000005,TNMD,43.928,18.0091,0.0,115.8917,63.5936,39.7803,11.7588,1.0153,...,0.0,0.0,0.0,0.0,0.0,0.9994,0.0,0.0,0.0,0.0


## Stratify Balanced Training and Testing Sets in TARGET Gene Expression

Output training and testing gene expression datasets

In [None]:
#strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code

In [None]:
# cancertype_count_df = (
#     pd.DataFrame(strat.value_counts()) #not using value_counts, what number should we use here? 
#     .reset_index()
#     .rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')
# )

# file = os.path.join('data', 'target_sample_counts.tsv') #change which file - do we have a file that works for this?
# cancertype_count_df.to_csv(file, sep='\t', index=False)

# cancertype_count_df

In [7]:
train_df, test_df = train_test_split(expr_df,
                                     test_size=0.1,
                                     random_state=123) #if no stratify defined, should just randomize on its own

In [8]:
print(train_df.shape)
test_df.shape

(25191, 389)


(2799, 389)

In [12]:
#save train dataframe to file 
train_file = os.path.join('data', 'train_grein_human_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [13]:
#save test dataframe to file 
test_file = os.path.join('data', 'test_grein_human_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')

## Sort genes based on median absolute deviation and output to file

In [15]:
# Determine most variably expressed genes and subset
train_df_mad = abs(train_df.drop(['gene', 'gene_symbol'], axis = 1) - train_df.drop(['gene', 'gene_symbol'], axis = 1).median()).median().reset_index()
train_df_mad.columns = ['gene_id', 'median_abs_deviation']

file = os.path.join('data', 'grein_mad_human_genes.tsv')
train_df_mad.to_csv(file, sep='\t', index=False)