# Process GREIN Mice Data

Retrieve the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms.

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
random.seed(1234)

## Read Phenotype Information: skipping until phenotype data found

In [3]:
# path = os.path.join('download', 'TARGET_phenotype.gz')
# pheno_df = pd.read_table(path)

# print(pheno_df.shape)
# pheno_df.head(3)

## Read Probe Mapping Info 
(where chromosomes start and end, saved in a file in downloads)

In [4]:
# path = os.path.join('download', 'gencode.v23.annotation.gene.probemap')
# probe_map_df = pd.read_table(path)

# # Inner merge gene df to get ensembl to entrez mapping
# probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')

# # Mapping to rename gene expression index
# ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))

# print(probe_map_df.shape)
# probe_map_df.head(3)

## Read Gene Expression Data

In [5]:
# reading in rat gene expression data

file = os.path.join('download', 'grein_count_matrix_mice.pkl')
expr_df = pd.read_pickle(file)

print(expr_df.shape)
expr_df.head(4)

(23735, 109)


Unnamed: 0,gene,gene_symbol,GSM2668159,GSM2668160,GSM2668161,GSM2668162,GSM2668163,GSM2668164,GSM2668165,GSM2668166,...,GSM2684036,GSM2684037,GSM2684038,GSM2684039,GSM2684040,GSM2684041,GSM2684042,GSM2684043,GSM2684044,GSM2684045
0,ENSMUSG00000000001,Gnai3,1643.7329,1287.4175,540.1979,877.4265,2050.4781,1490.6809,1252.3796,2130.0521,...,1066.6858,1024.1555,1200.6371,1309.4173,717.883,902.5204,595.6157,565.6663,761.7363,941.8854
1,ENSMUSG00000000003,Pbsn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0775,8.3259,6.8965,2.6833,5.6937,6.3878,3.0077,4.0,3.0179,4.1877
2,ENSMUSG00000000028,Cdc45,254.4606,89.381,94.1421,96.0356,132.1116,138.3019,0.0,77.1044,...,43.0341,51.0001,66.3696,66.9355,39.9999,65.4094,32.5245,41.5892,45.5967,43.6638
3,ENSMUSG00000000031,H19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,44.1487,42.8114,32.186,49.2632,28.1929,40.5748,23.3903,17.3259,32.0964,45.8233


In [6]:
# Clean up x and y from expr_df

# create a list of cleaned columns
cleaned_column = ['gene', 'gene_symbol']
for column in expr_df.drop(['gene', 'gene_symbol'], axis = 1).columns:
    cleaned_column.append(column.split("_")[0])

# set colums to cleaned column
expr_df.columns = cleaned_column

In [7]:
# Check that it worked

pd.set_option('display.max_columns', None) 
expr_df.head()

Unnamed: 0,gene,gene_symbol,GSM2668159,GSM2668160,GSM2668161,GSM2668162,GSM2668163,GSM2668164,GSM2668165,GSM2668166,GSM2668167,GSM2668168,GSM2668169,GSM2668170,GSM2668171,GSM2668172,GSM2668173,GSM2668174,GSM2668175,GSM2668176,GSM2668177,GSM2668178,GSM2668179,GSM2668180,GSM2668181,GSM2668182,GSM2668183,GSM2668184,GSM2668185,GSM2668186,GSM2668542,GSM2668543,GSM2668544,GSM2668545,GSM2668546,GSM2668547,GSM2668548,GSM2668549,GSM2668550,GSM2668551,GSM2668552,GSM2668553,GSM2670763,GSM2670764,GSM2670765,GSM2670766,GSM2670767,GSM2670768,GSM2670769,GSM2670770,GSM2670771,GSM2670772,GSM2670773,GSM2670774,GSM2670775,GSM2670776,GSM2670777,GSM2670778,GSM2670779,GSM2670780,GSM2670781,GSM2683998,GSM2683999,GSM2684000,GSM2684001,GSM2684002,GSM2684003,GSM2684004,GSM2684005,GSM2684006,GSM2684007,GSM2684008,GSM2684009,GSM2684010,GSM2684011,GSM2684012,GSM2684013,GSM2684014,GSM2684015,GSM2684016,GSM2684017,GSM2684018,GSM2684019,GSM2684020,GSM2684021,GSM2684022,GSM2684023,GSM2684024,GSM2684025,GSM2684026,GSM2684027,GSM2684028,GSM2684029,GSM2684030,GSM2684031,GSM2684032,GSM2684033,GSM2684034,GSM2684035,GSM2684036,GSM2684037,GSM2684038,GSM2684039,GSM2684040,GSM2684041,GSM2684042,GSM2684043,GSM2684044,GSM2684045
0,ENSMUSG00000000001,Gnai3,1643.7329,1287.4175,540.1979,877.4265,2050.4781,1490.6809,1252.3796,2130.0521,2338.1267,1722.0743,402.069,1122.3573,1425.2469,1792.9385,940.3199,1349.1471,1318.6913,2050.7322,1714.2953,1688.6046,1325.5167,1394.2698,2381.2034,2036.7515,1892.083,2437.9253,2022.948,1471.3884,380.2329,450.1123,455.3374,483.285,374.8241,372.9131,542.9858,495.0347,366.7709,388.4958,529.9966,423.5318,918.1818,527.3237,396.89,836.5907,265.4515,1359.2019,728.6851,208.2501,311.7179,555.3363,275.2549,124.6632,709.2186,288.5279,135.3879,303.7142,338.0358,204.8242,267.8283,878.9797,821.9529,1527.866,1003.9972,1228.9966,1307.9967,1203.9988,1344.2266,1686.9964,1808.4329,1717.4778,340.9912,364.9849,439.9758,518.9706,603.9064,677.906,551.9998,2060.8463,1197.9477,1378.9682,2064.8308,1436.2164,1490.1157,773.5258,1310.8914,1183.0885,1406.8733,1412.3281,1617.9342,1785.1985,1441.6078,1387.8009,773.9299,1407.8683,1104.7982,929.7197,1104.4064,1066.6858,1024.1555,1200.6371,1309.4173,717.883,902.5204,595.6157,565.6663,761.7363,941.8854
1,ENSMUSG00000000003,Pbsn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.7205,24.1178,38.3254,49.0156,21.1259,64.9452,66.889,62.7784,29.2323,47.8099,37.4875,22.5241,36.6138,104.0,6.8214,4.4684,10.9547,42.7898,22.9079,3.238,4.0,5.0,9.0,10.0,5.0,3.0,3.3788,9.0,8.5983,5.245,1.0,1.1044,2.3806,1.0,2.3046,3.3894,1.0,4.3233,0.0,3.29,6.0108,7.0,5.0,1.2668,4.0029,4.1959,5.0,2.0,3.0,2.0,2.005,4.0,2.0135,3.0088,1.0,4.6277,8.0,10.0775,8.3259,6.8965,2.6833,5.6937,6.3878,3.0077,4.0,3.0179,4.1877
2,ENSMUSG00000000028,Cdc45,254.4606,89.381,94.1421,96.0356,132.1116,138.3019,0.0,77.1044,69.1572,107.8632,95.0045,121.2276,86.1667,42.0082,14.0057,114.2047,142.8226,286.6273,213.6906,1.0001,99.276,62.816,232.9229,94.955,115.0746,156.9445,75.9055,0.0,18.5334,12.2678,24.775,16.3993,14.5611,11.9198,23.7429,15.2613,15.6228,13.9495,20.6969,19.8446,307.6745,131.1603,176.1366,136.5733,60.9621,283.03,104.9266,9.4557,160.2284,136.9521,116.9283,8.3645,106.9402,5.0179,560.8443,71.551,103.7415,16.0798,33.562,41.7121,53.2873,92.9999,63.5882,68.0,71.0002,68.9999,77.3518,107.9998,90.0298,113.722,15.0,14.0,25.0,24.0,25.3175,26.0,22.7093,149.1077,48.6238,63.4592,103.0859,74.6911,80.1894,43.3289,82.2376,53.2918,65.0001,58.0,73.9605,92.5469,83.4316,81.6527,33.9615,62.3856,47.9151,44.1138,64.3947,43.0341,51.0001,66.3696,66.9355,39.9999,65.4094,32.5245,41.5892,45.5967,43.6638
3,ENSMUSG00000000031,H19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0125,0.0,0.0,4.3586,0.7966,1.5774,4.4046,5.2231,2.1322,5.8048,0.7886,0.0,1.8432,11.7232,8.9101,62.8456,29.4068,65.2544,48.7197,31.6144,89.4463,83.3443,35.9953,12.9037,53.8086,141.5783,35.8635,77.9501,20.5761,59.9009,93.7005,37.404,53.9057,39.4706,25.3927,20.4678,33.0,15.0,32.3586,23.0,43.8909,27.0144,33.0,53.6633,37.7132,9.0,7.2338,13.2128,16.0402,17.0074,17.5595,15.2454,41.9047,36.6375,22.4593,26.8662,75.1182,26.3289,16.6896,24.7786,42.0369,29.7566,33.2289,30.1475,44.0252,29.9961,21.4491,18.9255,17.502,28.0914,19.8776,43.8708,44.1487,42.8114,32.186,49.2632,28.1929,40.5748,23.3903,17.3259,32.0964,45.8233
4,ENSMUSG00000000037,Scml2,9.0502,35.249,0.0,0.0,33.0767,6.0215,0.0,0.0,52.1539,0.9956,0.0,0.0,0.0,20.0066,19.0607,0.0,0.0,5.9708,6.0156,0.0,0.9833,0.0,2.9969,6.9853,0.0,12.0057,0.0,79.7357,10.8582,7.8907,15.3003,8.1114,5.9653,10.4362,9.7124,50.3787,34.7158,7.2128,13.0845,14.1594,230.7525,159.7686,283.9823,250.714,182.2083,786.7576,396.3978,232.2167,377.8486,225.6909,195.0414,157.7761,210.3321,82.8281,53.4121,95.5271,119.7845,37.8276,73.0635,53.3216,50.362,90.0675,61.0001,100.2063,115.0,61.9999,72.2475,101.0,166.3183,131.287,29.0196,33.0,28.1781,42.1033,35.0,36.5808,36.0,153.6101,59.0785,86.4744,168.5623,90.1483,167.8826,60.7228,81.771,78.3964,73.2413,56.44,117.0074,118.421,107.3256,101.5772,67.7745,92.9825,49.8698,121.426,142.9899,135.1294,109.8084,116.7191,152.9527,85.867,129.9611,94.4892,64.7652,115.7004,77.5016


## Process gene expression matrix

This involves updating Entrez gene ids, sorting and subsetting

In [8]:
# expr_df = (expr_df
#     .dropna(axis='rows')
#     .reindex(probe_map_df.id)
#     .rename(index=ensembl_to_entrez)
#     .rename(index=old_to_new_entrez)
#     .groupby(level=0).mean()
#     .transpose()
#     .sort_index(axis='rows')
#     .sort_index(axis='columns')
# )

expr_df.index.rename('sample_id', inplace=True)

print(expr_df.shape)
expr_df.head(2)

(23735, 109)


Unnamed: 0_level_0,gene,gene_symbol,GSM2668159,GSM2668160,GSM2668161,GSM2668162,GSM2668163,GSM2668164,GSM2668165,GSM2668166,GSM2668167,GSM2668168,GSM2668169,GSM2668170,GSM2668171,GSM2668172,GSM2668173,GSM2668174,GSM2668175,GSM2668176,GSM2668177,GSM2668178,GSM2668179,GSM2668180,GSM2668181,GSM2668182,GSM2668183,GSM2668184,GSM2668185,GSM2668186,GSM2668542,GSM2668543,GSM2668544,GSM2668545,GSM2668546,GSM2668547,GSM2668548,GSM2668549,GSM2668550,GSM2668551,GSM2668552,GSM2668553,GSM2670763,GSM2670764,GSM2670765,GSM2670766,GSM2670767,GSM2670768,GSM2670769,GSM2670770,GSM2670771,GSM2670772,GSM2670773,GSM2670774,GSM2670775,GSM2670776,GSM2670777,GSM2670778,GSM2670779,GSM2670780,GSM2670781,GSM2683998,GSM2683999,GSM2684000,GSM2684001,GSM2684002,GSM2684003,GSM2684004,GSM2684005,GSM2684006,GSM2684007,GSM2684008,GSM2684009,GSM2684010,GSM2684011,GSM2684012,GSM2684013,GSM2684014,GSM2684015,GSM2684016,GSM2684017,GSM2684018,GSM2684019,GSM2684020,GSM2684021,GSM2684022,GSM2684023,GSM2684024,GSM2684025,GSM2684026,GSM2684027,GSM2684028,GSM2684029,GSM2684030,GSM2684031,GSM2684032,GSM2684033,GSM2684034,GSM2684035,GSM2684036,GSM2684037,GSM2684038,GSM2684039,GSM2684040,GSM2684041,GSM2684042,GSM2684043,GSM2684044,GSM2684045
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1
0,ENSMUSG00000000001,Gnai3,1643.7329,1287.4175,540.1979,877.4265,2050.4781,1490.6809,1252.3796,2130.0521,2338.1267,1722.0743,402.069,1122.3573,1425.2469,1792.9385,940.3199,1349.1471,1318.6913,2050.7322,1714.2953,1688.6046,1325.5167,1394.2698,2381.2034,2036.7515,1892.083,2437.9253,2022.948,1471.3884,380.2329,450.1123,455.3374,483.285,374.8241,372.9131,542.9858,495.0347,366.7709,388.4958,529.9966,423.5318,918.1818,527.3237,396.89,836.5907,265.4515,1359.2019,728.6851,208.2501,311.7179,555.3363,275.2549,124.6632,709.2186,288.5279,135.3879,303.7142,338.0358,204.8242,267.8283,878.9797,821.9529,1527.866,1003.9972,1228.9966,1307.9967,1203.9988,1344.2266,1686.9964,1808.4329,1717.4778,340.9912,364.9849,439.9758,518.9706,603.9064,677.906,551.9998,2060.8463,1197.9477,1378.9682,2064.8308,1436.2164,1490.1157,773.5258,1310.8914,1183.0885,1406.8733,1412.3281,1617.9342,1785.1985,1441.6078,1387.8009,773.9299,1407.8683,1104.7982,929.7197,1104.4064,1066.6858,1024.1555,1200.6371,1309.4173,717.883,902.5204,595.6157,565.6663,761.7363,941.8854
1,ENSMUSG00000000003,Pbsn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.7205,24.1178,38.3254,49.0156,21.1259,64.9452,66.889,62.7784,29.2323,47.8099,37.4875,22.5241,36.6138,104.0,6.8214,4.4684,10.9547,42.7898,22.9079,3.238,4.0,5.0,9.0,10.0,5.0,3.0,3.3788,9.0,8.5983,5.245,1.0,1.1044,2.3806,1.0,2.3046,3.3894,1.0,4.3233,0.0,3.29,6.0108,7.0,5.0,1.2668,4.0029,4.1959,5.0,2.0,3.0,2.0,2.005,4.0,2.0135,3.0088,1.0,4.6277,8.0,10.0775,8.3259,6.8965,2.6833,5.6937,6.3878,3.0077,4.0,3.0179,4.1877


## Stratify Balanced Training and Testing Sets in TARGET Gene Expression

Output training and testing gene expression datasets

In [9]:
#strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code

In [10]:
# cancertype_count_df = (
#     pd.DataFrame(strat.value_counts()) #not using value_counts, what number should we use here? 
#     .reset_index()
#     .rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')
# )

# file = os.path.join('data', 'target_sample_counts.tsv') #change which file - do we have a file that works for this?
# cancertype_count_df.to_csv(file, sep='\t', index=False)

# cancertype_count_df

In [11]:
train_df, test_df = train_test_split(expr_df,
                                     test_size=0.1,
                                     random_state=123) #if no stratify defined, should just randomize on its own

In [12]:
print(train_df.shape)
test_df.shape

(21361, 109)


(2374, 109)

In [13]:
#save train dataframe to file 
train_file = os.path.join('data', 'train_grein_mice_expression_matrix_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [14]:
#save test dataframe to file 
test_file = os.path.join('data', 'test_grein_mice_expression_matrix_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')

## Sort genes based on median absolute deviation and output to file

In [15]:
# function to calculate median absolute deviation
def mad(df): 
    """Function to calculate median absolute deviation for a dataframe
  argument1 (dataframe): Dataframe for which to calculate the median absolute deviation, row by row

  Returns: The median absolute deviation for each row of the data frame
  """
    row_medians = df.median(axis='columns', numeric_only=True)
    abs_row_median_diffs = abs(df.sub(row_medians, axis='rows'))
    return abs_row_median_diffs.median(axis='columns', numeric_only=True)

In [16]:
# add gene_id as a column in dataframe
train_df['gene_id'] = train_df['gene'] + train_df['gene_symbol']
col = train_df.pop('gene_id')
train_df.insert(0, col.name, col)

train_df.head()

Unnamed: 0_level_0,gene_id,gene,gene_symbol,GSM2668159,GSM2668160,GSM2668161,GSM2668162,GSM2668163,GSM2668164,GSM2668165,GSM2668166,GSM2668167,GSM2668168,GSM2668169,GSM2668170,GSM2668171,GSM2668172,GSM2668173,GSM2668174,GSM2668175,GSM2668176,GSM2668177,GSM2668178,GSM2668179,GSM2668180,GSM2668181,GSM2668182,GSM2668183,GSM2668184,GSM2668185,GSM2668186,GSM2668542,GSM2668543,GSM2668544,GSM2668545,GSM2668546,GSM2668547,GSM2668548,GSM2668549,GSM2668550,GSM2668551,GSM2668552,GSM2668553,GSM2670763,GSM2670764,GSM2670765,GSM2670766,GSM2670767,GSM2670768,GSM2670769,GSM2670770,GSM2670771,GSM2670772,GSM2670773,GSM2670774,GSM2670775,GSM2670776,GSM2670777,GSM2670778,GSM2670779,GSM2670780,GSM2670781,GSM2683998,GSM2683999,GSM2684000,GSM2684001,GSM2684002,GSM2684003,GSM2684004,GSM2684005,GSM2684006,GSM2684007,GSM2684008,GSM2684009,GSM2684010,GSM2684011,GSM2684012,GSM2684013,GSM2684014,GSM2684015,GSM2684016,GSM2684017,GSM2684018,GSM2684019,GSM2684020,GSM2684021,GSM2684022,GSM2684023,GSM2684024,GSM2684025,GSM2684026,GSM2684027,GSM2684028,GSM2684029,GSM2684030,GSM2684031,GSM2684032,GSM2684033,GSM2684034,GSM2684035,GSM2684036,GSM2684037,GSM2684038,GSM2684039,GSM2684040,GSM2684041,GSM2684042,GSM2684043,GSM2684044,GSM2684045
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1
17917,ENSMUSG00000068758Il3ra,ENSMUSG00000068758,Il3ra,11.9828,40.9656,0.0,0.0,0.9989,14.9433,0.0,0.0,62.9205,21.0187,38.9969,2.9937,40.9097,134.9636,0.0,15.9725,0.0,13.9706,42.0143,0.0,0.0,20.0796,5.002,70.0605,43.0638,55.0124,25.0226,71.0897,12.1558,7.0235,4.9887,5.058,10.9767,13.8727,7.0361,2.9982,4.9582,4.8966,15.767,4.9247,236.3051,92.1212,57.3227,89.2178,65.9174,356.7376,108.3991,200.7319,73.5101,65.6579,145.7582,200.5719,222.9326,158.4543,109.5373,283.6776,162.3846,120.5198,318.137,37.2894,35.9853,44.9998,41.6827,40.9999,67.9999,28.0,33.676,41.9998,81.008,114.8578,7.0,25.0,6.3528,10.0,8.0,12.1235,19.0946,216.9183,188.9999,176.0055,176.0426,136.031,113.0738,113.9998,62.7575,144.9999,107.9998,138.1323,185.6192,53.0587,150.0,149.2843,81.9999,51.7913,29.9999,232.768,337.3443,335.9585,196.9172,356.9148,304.7092,121.8106,232.6756,151.5955,186.0761,268.1759,247.3839
579,ENSMUSG00000003418St8sia6,ENSMUSG00000003418,St8sia6,18.1422,30.3469,0.0,0.0,0.0,21.2393,0.0,0.0,0.0,0.995,5.0013,0.0,0.0,92.0425,20.0743,57.4555,58.5812,4.0431,14.9298,0.0,0.9758,14.7924,5.9921,0.0,18.976,12.9837,46.8235,0.0,74.9506,100.9342,74.1113,91.6142,138.2701,155.7855,87.0024,93.3763,64.6162,81.0797,100.1781,118.1372,537.8632,508.8233,492.5071,634.3279,353.7334,1389.554,926.4721,513.4449,398.4491,348.852,1582.3708,842.3449,1061.305,808.4869,580.079,355.7784,684.8252,174.0424,297.7976,313.8647,265.1567,369.7389,353.8806,378.9996,458.9342,250.1668,360.2319,553.833,667.6954,622.9958,92.5467,76.1629,124.9472,180.3255,191.6521,125.8883,144.5774,228.2233,148.6981,138.2763,155.6492,272.964,262.6524,160.9906,123.7229,127.9342,180.2145,152.5422,136.5346,196.2626,176.7026,166.2312,87.5508,186.6364,120.3753,191.781,230.1643,243.7221,209.7705,216.6055,202.3894,111.3832,100.852,85.2136,117.732,117.7262,125.3183
13157,ENSMUSG00000045106Ccdc73,ENSMUSG00000045106,Ccdc73,83.0175,79.8824,62.4192,16.5262,40.5543,42.2349,38.0589,29.7406,70.4162,28.6647,44.277,30.766,51.5988,34.8641,31.6716,67.0162,0.0,45.0593,52.3139,65.4666,38.326,26.3535,134.9575,102.3826,13.0159,78.062,56.1034,41.2064,126.1848,193.5476,119.321,196.1594,132.0996,168.9149,254.4494,224.2651,109.1272,110.7577,201.9134,201.198,177.2203,136.6687,206.2644,191.2369,98.7972,352.5961,197.5441,143.2142,227.7984,165.1048,158.0401,76.3973,224.349,122.127,130.9573,104.9055,186.0259,71.7623,101.8724,170.7236,161.6358,260.0168,230.3318,247.3692,295.1478,204.0003,241.2253,293.0005,347.1572,357.4046,46.8349,64.2621,108.189,97.4026,114.6374,131.4163,131.5553,658.2776,288.4371,440.2014,570.7147,524.4639,465.1324,196.0059,300.6355,292.2468,309.3377,288.8055,437.0729,388.8956,408.3262,336.7752,169.3997,449.4964,242.8592,456.0769,589.5169,498.9208,445.9103,448.3959,581.5629,348.9253,498.6724,291.2903,315.4466,384.1025,350.173
5179,ENSMUSG00000025860Xiap,ENSMUSG00000025860,Xiap,1022.3717,762.1216,617.6816,544.2932,1531.803,571.7379,355.7479,629.0973,983.7529,567.432,556.1348,585.1108,756.0624,637.4608,696.3762,590.2674,739.6453,723.2553,686.002,383.8713,723.7527,482.5543,930.4777,674.4708,1193.2866,747.1166,899.8367,746.6897,1547.0179,1308.6642,1508.5193,1514.9264,1542.8546,1289.7644,1650.3835,1440.2789,1333.9408,1221.0353,1837.5549,1405.3872,1501.9851,1407.2131,1160.8417,1745.9413,903.834,3385.4658,2657.8556,1312.3054,1828.2741,1157.53,933.7541,978.0021,1047.7926,566.3269,629.587,427.1914,613.4494,373.3501,505.4499,2391.7467,2245.3734,3768.6767,2236.0426,3220.3086,3109.1466,2815.6632,2910.313,4021.3059,3868.8881,3948.6026,780.0998,706.207,1182.9588,1125.4732,1248.7681,1444.9424,1502.9623,4533.3781,2539.117,2975.8486,4128.1258,2970.8298,3532.0412,2016.789,3173.6356,2514.683,2963.2316,3164.2443,3540.1693,3868.1553,3456.0366,2893.8253,1685.7996,2962.5104,2109.248,2788.2836,3620.5334,3329.8773,2722.2506,3351.1653,3488.7358,2143.5809,2642.7959,1864.8358,1883.3946,2403.6687,2524.7038
19871,ENSMUSG00000079179Rab10os,ENSMUSG00000079179,Rab10os,932.0895,845.8073,384.748,687.5476,515.6809,804.5711,654.3081,537.8268,599.2562,1379.82,348.4017,501.9284,1060.5993,464.064,528.9542,659.3803,246.6213,876.5924,767.127,755.0076,674.5297,1185.1232,1171.7755,543.1042,529.0792,1100.4784,584.7283,815.3015,363.7911,245.1179,327.7957,291.8318,262.9547,242.1732,361.1099,308.5972,308.4677,214.1519,376.1679,391.4498,1674.6391,772.2581,598.6678,822.9616,757.5468,4025.9555,1343.3488,877.6311,615.981,544.6845,723.6468,400.462,695.1648,237.5455,388.8723,889.5764,901.1219,460.9277,650.2279,610.8754,513.8968,1044.0696,669.0002,823.2131,980.9799,823.8113,864.2595,1039.7488,1476.1574,1297.014,210.269,260.5468,367.5713,346.7173,368.816,460.9727,371.1256,2408.5914,963.5675,1135.4691,1865.5139,1180.2075,1565.1426,893.0549,1057.8691,1037.1987,1108.1403,1110.9105,1491.9617,1526.151,1334.8485,977.9853,607.8587,1356.0389,626.4626,1485.9413,1822.9591,1705.5778,1425.3282,1974.7249,1732.3232,1185.3404,1543.3531,1088.037,1186.2525,1429.4938,1170.0054


In [17]:
# Determine most variably expressed genes and subset
# create dataframe to save median absolute deviation data for rats
train_df_mad = mad(train_df.drop(['gene_id','gene', 'gene_symbol'], axis = 1))
train_df_mad.columns = ['gene_id', 'median_abs_deviation']

train_df_mad = train_df_mad.sort_values(ascending=False)
train_df_mad

sample_id
17238    78640.4733
17247    60054.9391
18525    59895.6883
17244    48540.4072
4557     44518.6377
            ...    
19459        0.0000
21414        0.0000
19538        0.0000
20793        0.0000
18860        0.0000
Length: 21361, dtype: float64

In [18]:
# Save to tsv file
file = os.path.join('data', 'grein_mad_mice_genes.tsv')
train_df_mad.to_csv(file, sep='\t', index=False)