# Process GREIN Human Data

Retrieve the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms.

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
random.seed(1234)

## Read Phenotype Information: skipping until phenotype data found

In [3]:
# path = os.path.join('download', 'TARGET_phenotype.gz')
# pheno_df = pd.read_table(path)

# print(pheno_df.shape)
# pheno_df.head(3)

## Read Probe Mapping Info 
(where chromosomes start and end, saved in a file in downloads)

In [4]:
# path = os.path.join('download', 'gencode.v23.annotation.gene.probemap')
# probe_map_df = pd.read_table(path)

# # Inner merge gene df to get ensembl to entrez mapping
# probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')

# # Mapping to rename gene expression index
# ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))

# print(probe_map_df.shape)
# probe_map_df.head(3)

## Read Gene Expression Data

In [5]:
# reading in rat gene expression data

file = os.path.join('download', 'grein_count_matrix_human.pkl')
expr_df = pd.read_pickle(file)

print(expr_df.shape)
expr_df.head(4)

(27990, 57)


Unnamed: 0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,...,GSM2668086,GSM2668087,GSM2668088,GSM2668089,GSM2671001,GSM2671002,GSM2671003,GSM2671004,GSM2671005,GSM2671006
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,...,254.7605,143.2263,174.3319,202.3253,1053.8788,2405.3001,940.577,973.7475,760.01,1472.2712
1,ENSG00000000005,TNMD,43.928,18.0091,0.0,115.8917,63.5936,39.7803,11.7588,1.0153,...,0.0,0.0,0.0,0.0,3.9978,11.0025,4.9979,8.0054,3.9978,11.0011
2,ENSG00000000419,DPM1,1097.7855,367.2333,316.5226,3895.0588,1536.1375,1084.5276,127.7205,245.1014,...,613.1214,374.0965,302.6622,454.057,1227.1084,2980.0756,1045.8061,1098.7163,759.7729,1398.8077
3,ENSG00000000457,SCYL3,601.4702,270.9239,163.6869,1177.2241,873.9074,845.1423,229.8548,134.4694,...,196.5995,185.3106,260.9485,183.7584,586.7993,1504.0115,619.3764,661.9514,469.5312,1113.1412


In [6]:
# Clean up x and y from expr_df

# create a list of cleaned columns
cleaned_column = ['gene', 'gene_symbol']
for column in expr_df.drop(['gene', 'gene_symbol'], axis = 1).columns:
    cleaned_column.append(column.split("_")[0])

# set colums to cleaned column
expr_df.columns = cleaned_column

In [7]:
# Check that it worked

pd.set_option('display.max_columns', None) 
expr_df.head()

Unnamed: 0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,GSM2667755,GSM2667756,GSM2667757,GSM2667758,GSM2667759,GSM2667760,GSM2667761,GSM2667762,GSM2667763,GSM2667764,GSM2667765,GSM2667766,GSM2667767,GSM2667768,GSM2667769,GSM2667770,GSM2667771,GSM2667772,GSM2667773,GSM2667774,GSM2667775,GSM2667776,GSM2667777,GSM2667778,GSM2667779,GSM2667780,GSM2667781,GSM2667782,GSM2667783,GSM2667784,GSM2667785,GSM2667786,GSM2668081,GSM2668082,GSM2668083,GSM2668084,GSM2668085,GSM2668086,GSM2668087,GSM2668088,GSM2668089,GSM2671001,GSM2671002,GSM2671003,GSM2671004,GSM2671005,GSM2671006
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,4880.0185,8629.5404,2951.193,370.2859,321.4122,3721.4916,3499.9075,2462.5672,300.7646,370.7972,3470.2847,3503.2877,3362.5934,226.824,131.7482,4486.6134,7225.4738,3185.4922,531.7498,796.0883,3175.4423,2064.0788,1267.623,1845.9171,409.791,2540.874,2906.8171,2829.6631,1920.6238,1165.686,1874.826,2539.2382,265.0806,300.3062,331.6881,183.7755,278.5022,254.7605,143.2263,174.3319,202.3253,1053.8788,2405.3001,940.577,973.7475,760.01,1472.2712
1,ENSG00000000005,TNMD,43.928,18.0091,0.0,115.8917,63.5936,39.7803,11.7588,1.0153,67.1228,61.4995,54.1517,11.8111,5.713,79.6294,79.6162,69.2199,13.824,5.965,138.7558,84.4619,181.6344,46.6972,12.8469,168.0059,403.5948,58.7409,18.8835,12.094,67.2416,10.9099,133.5243,147.7444,15.2011,202.6725,343.3962,676.6289,376.7828,141.2399,380.3029,662.0918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9978,11.0025,4.9979,8.0054,3.9978,11.0011
2,ENSG00000000419,DPM1,1097.7855,367.2333,316.5226,3895.0588,1536.1375,1084.5276,127.7205,245.1014,2608.6175,3797.5758,855.0836,124.4127,220.1181,2932.3449,1795.7509,1499.5849,264.1154,536.7919,3716.6839,1998.1502,1210.7017,112.5526,150.2928,3655.9428,2458.6382,834.2019,135.4603,258.5075,2264.3384,1241.8778,217.2588,623.3691,165.4709,1166.0064,1281.7182,952.9915,635.3723,879.0727,1695.9609,1736.9615,537.7681,461.4597,436.1302,416.2508,568.8483,613.1214,374.0965,302.6622,454.057,1227.1084,2980.0756,1045.8061,1098.7163,759.7729,1398.8077
3,ENSG00000000457,SCYL3,601.4702,270.9239,163.6869,1177.2241,873.9074,845.1423,229.8548,134.4694,676.4072,1177.2415,1155.1429,232.3545,115.9401,1458.976,1182.6409,907.3663,335.9731,153.7599,1083.9457,711.613,1259.2643,275.6282,30.2989,1144.3081,1374.0298,1118.6175,325.3986,202.9192,1189.7186,881.3931,124.4839,948.1346,57.5744,547.4528,439.9557,1454.0821,1793.4598,232.8164,773.3686,939.7315,157.2023,151.5847,196.0139,142.7542,170.0119,196.5995,185.3106,260.9485,183.7584,586.7993,1504.0115,619.3764,661.9514,469.5312,1113.1412
4,ENSG00000000460,C1orf112,1040.8861,349.1418,298.18,1967.3975,2115.2425,980.9359,253.4457,145.7038,1024.4449,1294.0961,599.8307,117.7714,67.3678,785.8567,466.6687,1315.1359,300.5638,246.0739,1469.0028,1906.7889,1471.1647,229.3009,67.8212,1176.1864,1307.8388,678.5934,177.1994,92.6689,560.3419,386.5186,92.7886,354.358,49.4149,211.9013,110.9184,673.9635,749.7606,86.1525,266.8906,338.4995,83.2406,63.1939,50.6896,71.372,161.8204,204.1681,186.2225,127.6457,172.0572,441.0758,1258.3932,466.5542,450.2848,289.2767,467.7875


## Process gene expression matrix

This involves updating Entrez gene ids, sorting and subsetting

In [8]:
# expr_df = (expr_df
#     .dropna(axis='rows')
#     .reindex(probe_map_df.id)
#     .rename(index=ensembl_to_entrez)
#     .rename(index=old_to_new_entrez)
#     .groupby(level=0).mean()
#     .transpose()
#     .sort_index(axis='rows')
#     .sort_index(axis='columns')
# )

expr_df.index.rename('sample_id', inplace=True)

print(expr_df.shape)
expr_df.head(2)

(27990, 57)


Unnamed: 0_level_0,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,GSM2667755,GSM2667756,GSM2667757,GSM2667758,GSM2667759,GSM2667760,GSM2667761,GSM2667762,GSM2667763,GSM2667764,GSM2667765,GSM2667766,GSM2667767,GSM2667768,GSM2667769,GSM2667770,GSM2667771,GSM2667772,GSM2667773,GSM2667774,GSM2667775,GSM2667776,GSM2667777,GSM2667778,GSM2667779,GSM2667780,GSM2667781,GSM2667782,GSM2667783,GSM2667784,GSM2667785,GSM2667786,GSM2668081,GSM2668082,GSM2668083,GSM2668084,GSM2668085,GSM2668086,GSM2668087,GSM2668088,GSM2668089,GSM2671001,GSM2671002,GSM2671003,GSM2671004,GSM2671005,GSM2671006
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,ENSG00000000003,TSPAN6,1766.0677,420.8206,300.8009,3142.8439,2207.8392,4367.5056,228.5759,650.3287,4880.0185,8629.5404,2951.193,370.2859,321.4122,3721.4916,3499.9075,2462.5672,300.7646,370.7972,3470.2847,3503.2877,3362.5934,226.824,131.7482,4486.6134,7225.4738,3185.4922,531.7498,796.0883,3175.4423,2064.0788,1267.623,1845.9171,409.791,2540.874,2906.8171,2829.6631,1920.6238,1165.686,1874.826,2539.2382,265.0806,300.3062,331.6881,183.7755,278.5022,254.7605,143.2263,174.3319,202.3253,1053.8788,2405.3001,940.577,973.7475,760.01,1472.2712
1,ENSG00000000005,TNMD,43.928,18.0091,0.0,115.8917,63.5936,39.7803,11.7588,1.0153,67.1228,61.4995,54.1517,11.8111,5.713,79.6294,79.6162,69.2199,13.824,5.965,138.7558,84.4619,181.6344,46.6972,12.8469,168.0059,403.5948,58.7409,18.8835,12.094,67.2416,10.9099,133.5243,147.7444,15.2011,202.6725,343.3962,676.6289,376.7828,141.2399,380.3029,662.0918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9978,11.0025,4.9979,8.0054,3.9978,11.0011


## Stratify Balanced Training and Testing Sets in TARGET Gene Expression

Output training and testing gene expression datasets

In [9]:
#strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code

In [10]:
# cancertype_count_df = (
#     pd.DataFrame(strat.value_counts()) #not using value_counts, what number should we use here? 
#     .reset_index()
#     .rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')
# )

# file = os.path.join('data', 'target_sample_counts.tsv') #change which file - do we have a file that works for this?
# cancertype_count_df.to_csv(file, sep='\t', index=False)

# cancertype_count_df

In [11]:
train_df, test_df = train_test_split(expr_df,
                                     test_size=0.1,
                                     random_state=123) #if no stratify defined, should just randomize on its own

In [12]:
print(train_df.shape)
test_df.shape

(25191, 57)


(2799, 57)

In [13]:
#save train dataframe to file 
train_file = os.path.join('data', 'train_grein_human_expression_matrix_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [14]:
#save test dataframe to file 
test_file = os.path.join('data', 'test_grein_human_expression_matrix_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')

## Sort genes based on median absolute deviation and output to file

In [15]:
# function to calculate median absolute deviation
def mad(df): 
    """Function to calculate median absolute deviation for a dataframe
  argument1 (dataframe): Dataframe for which to calculate the median absolute deviation, row by row

  Returns: The median absolute deviation for each row of the data frame
  """
    row_medians = df.median(axis='columns', numeric_only=True)
    abs_row_median_diffs = abs(df.sub(row_medians, axis='rows'))
    return abs_row_median_diffs.median(axis='columns', numeric_only=True)

In [16]:
# add gene_id as a column in dataframe
train_df['gene_id'] = train_df['gene'] + train_df['gene_symbol']
col = train_df.pop('gene_id')
train_df.insert(0, col.name, col)

train_df.head()

Unnamed: 0_level_0,gene_id,gene,gene_symbol,GSM2667747,GSM2667748,GSM2667749,GSM2667750,GSM2667751,GSM2667752,GSM2667753,GSM2667754,GSM2667755,GSM2667756,GSM2667757,GSM2667758,GSM2667759,GSM2667760,GSM2667761,GSM2667762,GSM2667763,GSM2667764,GSM2667765,GSM2667766,GSM2667767,GSM2667768,GSM2667769,GSM2667770,GSM2667771,GSM2667772,GSM2667773,GSM2667774,GSM2667775,GSM2667776,GSM2667777,GSM2667778,GSM2667779,GSM2667780,GSM2667781,GSM2667782,GSM2667783,GSM2667784,GSM2667785,GSM2667786,GSM2668081,GSM2668082,GSM2668083,GSM2668084,GSM2668085,GSM2668086,GSM2668087,GSM2668088,GSM2668089,GSM2671001,GSM2671002,GSM2671003,GSM2671004,GSM2671005,GSM2671006
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
27744,ENSG00000283046HTN1,ENSG00000283046,HTN1,125.4665,140.2465,17.6274,69.7745,73.7733,27.6203,98.1395,12.7103,3.626,24.9714,51.9728,66.2,3.0588,29.6436,78.6089,192.38,153.2949,4.0605,61.9086,132.972,178.1983,117.1625,6.5244,18.5502,57.2569,106.832,148.316,14.3281,19.252,147.4773,28.1115,281.7885,1.5309,18.0041,14.8531,305.5563,848.8558,13.7288,23.7737,30.6058,5.7421,4.749,7.7266,1.089,3.8178,8.0805,5.0509,7.7629,7.2611,58.678,185.7149,86.5948,123.6247,97.2255,128.9266
9537,ENSG00000153037SRP19,ENSG00000153037,SRP19,894.5558,357.1081,187.4946,1421.0479,456.0143,802.667,124.856,164.7368,3532.883,1244.874,1010.8748,139.985,138.333,3899.5298,930.8268,1137.9553,254.7177,245.631,1621.9018,456.3314,1035.1774,116.1225,123.1838,2368.7324,921.9137,1162.6473,218.2426,274.7572,2619.1004,674.3049,181.9868,715.1966,68.2822,985.3323,772.8154,1533.7694,953.2965,784.0392,2934.4142,1425.9365,598.988,577.2314,684.2199,376.1889,521.701,656.0729,793.1388,850.7232,926.3904,642.685,1674.6958,614.6459,749.6716,533.6487,943.2409
9769,ENSG00000155115GTF3C6,ENSG00000155115,GTF3C6,851.6143,187.0203,243.4404,2046.5828,476.0532,1977.8905,94.7822,881.8821,10161.3111,1382.629,1326.2597,141.7311,477.2282,5214.7134,767.6003,982.4279,117.6392,369.838,2667.1936,459.5399,1652.4284,63.1625,275.7449,3911.4982,1060.2774,1230.0665,147.4217,466.3677,4542.0178,614.1263,202.8739,289.3384,166.5569,1629.0443,566.4231,1570.2533,625.5507,2881.6804,3669.3315,1164.9276,777.0025,705.8804,790.7663,536.9289,669.2589,755.729,827.731,785.4842,994.0243,268.1323,567.9704,218.9266,288.9725,250.9841,536.0405
15239,ENSG00000184083FAM120C,ENSG00000184083,FAM120C,632.3881,285.1058,122.8606,832.8702,957.455,1495.592,184.7934,91.7322,383.1359,1374.5838,1628.9039,232.164,120.3054,898.8325,1279.4379,824.3853,188.6935,166.3629,881.8608,1010.1709,1574.6568,261.7398,58.3872,756.5876,2166.1719,1840.3923,257.8297,213.4336,876.4116,1554.1902,507.1274,1001.3734,76.3392,493.3627,725.1371,2403.1399,1715.6976,344.4401,526.2655,1218.9705,80.7898,72.3151,55.019,147.1572,131.1924,162.856,110.0482,141.0302,109.8994,159.4169,277.8073,134.6019,252.2688,188.7017,352.3888
203,ENSG00000008196TFAP2B,ENSG00000008196,TFAP2B,23.2873,2.0018,19.1983,54.7994,19.094,974.568,88.7088,88.8378,620.9386,1124.1208,6430.8975,744.3863,499.9203,5383.4356,7004.764,18.7766,6.8329,7.8788,36.7517,19.2128,1565.6738,132.576,22.7705,1405.099,2319.8399,7436.2614,813.1225,864.8631,4407.3383,5406.7102,439.1032,1307.2588,149.5145,998.4512,1370.776,4782.1691,2161.576,1078.2573,2003.782,3485.0708,51.0392,55.8615,39.7806,27.1175,30.9595,31.9595,120.6547,156.4436,155.0772,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Determine most variably expressed genes and subset
# create dataframe to save median absolute deviation data for rats
train_df_mad = mad(train_df.drop(['gene_id','gene', 'gene_symbol'], axis = 1))
train_df_mad.columns = ['gene_id', 'median_abs_deviation']

train_df_mad = train_df_mad.sort_values(ascending=False)
train_df_mad

sample_id
26434    2.318332e+06
25781    2.305465e+06
27829    4.295272e+05
26724    1.809434e+05
9917     1.733389e+05
             ...     
8967     0.000000e+00
26441    0.000000e+00
26418    0.000000e+00
27924    0.000000e+00
19966    0.000000e+00
Length: 25191, dtype: float64

In [18]:
# Save to tsv file
file = os.path.join('data', 'grein_mad_human_genes.tsv')
train_df_mad.to_csv(file, sep='\t', index=False)