### This notebook modifies the meta_file so it matches the samples in the expression_file, then filters it again by only keeping the samples that satisfy the "controls" (so GSEA compares the samples differed only by the class_name).

### Dataframe samp_to_class (cls file for GSEA) is made, according to the class_name specified by the user.

### Expression_file is filtered so its samples match those in the meta_file.

### Benefits:
### 1. Correctly matches the samples in expression_file and meta_file; 
### 2. Keep other conditions the same (by specifying controls) when comparing results for a certain class;
### 3. User can specify the class name and the controls


In [14]:
import pandas as pd
import numpy as np
import gseapy as gp

gmt_file = '../../gseapy_databases/h.all.v6.1.symbols.gmt'
expression_file = '../../gseapy_databases/exp_ALCvsCHOWinNTinAlbCre.tsv'
meta_file = '../../gseapy_databases/dHEP_metadata.csv'
output_dir = '/../../gseapy_databases/mouse_liver_output'

# ONLY compare the samples differed by the class_name, 
# while keeping all other parameters the same
class_name = 'Tissue'  #'Treatment' 
class_A = 'Liver'  #'DEN_HFD_alcohol'
class_B = 'liver_tumor'  #'DEN_only'

# User types in the controls (attributes that remain constant)
controls = {
    'mouse genotype': 'Alb-Cre;IL-17RA-flox/flox',
    'Model': 'DEN ',  ### There is a space at the end
    'Treatment': 'DEN_HFD_alcohol'}
print controls

{'mouse genotype': 'Alb-Cre;IL-17RA-flox/flox', 'Model': 'DEN ', 'Treatment': 'DEN_HFD_alcohol'}


In [15]:
df_expression = pd.read_table(expression_file, index_col='Unnamed: 0')
df_expression.head()

Unnamed: 0,dHEP_520_A_NT,dHEP_550_A_NT,F_F_158_NC_NT,dHEP_549_A_NT,F_F_503_A_NT,dHEP_165_NC_NT,F_F_595_A_NT,F_F_523_A_NT,F_F_157_NC_NT,dHEP_518_A_NT,dHEP_164_NC_NT,dHEP_155_NC_NT,F_F_159_NC_NT,F_F_581_A_NT
Gnai3,6.482501,6.648272,6.644257,6.734103,6.557685,6.672892,6.707925,6.59124,6.632168,6.483054,6.81548,6.667075,6.768433,6.708067
Cdc45,0.779113,0.881496,0.026606,0.036682,0.688223,0.222011,0.281181,0.493432,0.192387,1.325266,1.160785,0.731233,0.930572,0.931043
Apoh,10.437362,10.285868,10.683067,10.462445,10.204836,10.558875,10.289569,10.032467,10.643058,10.364403,10.533293,10.551628,10.83181,10.346315
Narf,5.990224,6.089289,5.152588,6.280477,6.093053,6.277727,6.377708,5.902462,5.912509,6.144837,6.152632,5.551067,5.925389,6.137725
Cav2,1.732318,2.221725,2.06925,2.199621,1.634452,2.148011,2.390303,2.233464,1.226334,1.180876,2.263471,2.416598,1.768019,2.586876


In [16]:
focal_samples = list(df_expression)  # header
len(focal_samples)

14

In [17]:
df_meta = pd.read_csv(meta_file)
df_meta.head()

Unnamed: 0,Sample_name,Sample_Name2,mouse genotype,short_genotype,Model,Treatment,Tissue
0,dHEP_518_A_NT,dHEP-518-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
1,dHEP_520_A_NT,dHEP-520-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
2,dHEP_549_A_NT,dHEP-549-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
3,dHEP_550_A_NT,dHEP-550-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
4,F_F_503_A_NT,F/F-503-A-NT,IL-17RA-flox/flox,IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver


In [18]:
# keep the samples that are in the expression file
df_meta = df_meta[df_meta['Sample_name'].isin(focal_samples)]
df_meta.shape

(14, 7)

In [19]:
#df_meta['Model'] == 'DEN'  #this would print as False, because it is actually 'DEN ' with a space


In [20]:
# filter the samples: ONLY keep the samples that match the controls
for key, value in controls.iteritems(): 
    df_meta = df_meta[df_meta[key]==value]

df_meta   


Unnamed: 0,Sample_name,Sample_Name2,mouse genotype,short_genotype,Model,Treatment,Tissue
0,dHEP_518_A_NT,dHEP-518-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
1,dHEP_520_A_NT,dHEP-520-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
2,dHEP_549_A_NT,dHEP-549-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
3,dHEP_550_A_NT,dHEP-550-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver


In [21]:
# extract only the COLUMNS with sample_name and class name
samp_to_class = df_meta[['Sample_name', class_name]]

# only keep the rows with class_A and class_B
samp_to_class = samp_to_class[(samp_to_class[class_name] == class_A) | (samp_to_class[class_name] == class_B)]

print(len(samp_to_class))
samp_to_class

4


Unnamed: 0,Sample_name,Tissue
0,dHEP_518_A_NT,Liver
1,dHEP_520_A_NT,Liver
2,dHEP_549_A_NT,Liver
3,dHEP_550_A_NT,Liver


In [22]:
# Filter expression file
real_focal_samples = samp_to_class['Sample_name'].tolist()
df_expression = df_expression[real_focal_samples]
df_expression.shape

(12651, 4)

In [23]:
cap_gene = [str(g).upper() for g in df_expression.index.tolist()] # cap the genes
df_expression['Name'] = cap_gene                                  # create a new column
df_expression = df_expression[['Name'] + real_focal_samples]           # put the 'Name' column at front
df_expression.index = range(0,len(df_expression))                 # number the rows
df_expression.head()
df_expression.shape

(12651, 5)

In [24]:
cls_list = samp_to_class[class_name].tolist()
cls_list

['Liver', 'Liver', 'Liver', 'Liver']

In [25]:
gs_res = gp.gsea(data=df_expression, 
                 gene_sets=gmt_file,
                 cls=samp_to_class[class_name].tolist(),  # we only need Treatment column here, since the Sample_name is in the expression file
                 permutation_num=100, # reduce number to speed up test
                 weighted_score_type = 1,  # default: 1
                 outdir=output_dir,
                 method='log2_ratio_of_classes',
                 processes=4,    ## 1 is default
                 format='png')

IndexError: list index out of range

In [None]:
#access the dataframe results throught res2d attribute
gs_res.res2d.head()

In [None]:
#gp.gsea?