# GSEApy Analysis

### 1. Import Required Packages
### 2. Import DESeq2 LRT Data
### 3. Prep Data
### 4. Run GSEApy Prerank


## <br> 1. Import Required Packages

In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import gseapy as gp

from scipy import sparse
from anndata import AnnData
from anndata.experimental.multi_files import AnnCollection

In [2]:
# Record the installed GSEApy version alongside the results.
print(gp.__version__)

1.0.5


Set figure parameters.

In [3]:
sc.set_figure_params(figsize=(6,6))
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'
pd.options.display.max_colwidth = 200
#plt.rcParams['font.sans-serif']=['Arial']
plt.rcParams['pdf.fonttype'] = 'truetype'

## <br> 2. Import DESeq2 LRT Data

In [4]:
# Load the combined LRT results table (tab-separated, header on the first row).
# The second file column becomes the row index; the leftover 'Unnamed: 0'
# column is discarded.
LRT_MASTER = pd.read_csv(
    '../03_Differential_Gene_Expression_Analysis---Deseq2/Results/All_Celltypes_LRT.txt',
    sep='\t',
    header=0,
    index_col=1,
).drop(columns='Unnamed: 0')

In [5]:
# Preview the loaded table; the rendered output abbreviates the middle rows with "...".
LRT_MASTER

Unnamed: 0_level_0,padj_B_Cell,padj_Cholangiocyte,padj_Hepatocyte,padj_HSC,padj_LSEC,padj_Macrophage,padj_Neutrophil,padj_pDC,padj_PF,padj_T_Cell
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Gm42418,0.541213,0.458789,0.013423,0.713926,0.294861,0.218712,0.999981,0.999893,0.856809,0.846682
Malat1,0.993333,0.999986,0.174310,0.771468,0.825937,0.999975,0.999981,0.999893,0.999968,0.999548
Cmss1,0.999953,0.999986,0.104654,0.999946,0.682680,0.999975,0.999981,0.999893,0.999968,0.999548
Foxp1,0.999953,0.999986,0.000830,0.810051,0.951791,0.999975,0.999981,0.999893,0.999968,0.999548
Camk1d,0.946550,0.999986,0.026259,0.819480,0.580589,0.417520,0.999981,0.999893,0.999968,0.999548
...,...,...,...,...,...,...,...,...,...,...
Klrc3,,,,,,,,,,
Cnga1,,,,,,,,,,
Klra4,,,,,,,,,,
Klra9,,,,,,,,,,


In [6]:
# List the padj_* columns available in the table (one per cell type).
print(LRT_MASTER.columns)

Index(['padj_B_Cell', 'padj_Cholangiocyte', 'padj_Hepatocyte', 'padj_HSC',
       'padj_LSEC', 'padj_Macrophage', 'padj_Neutrophil', 'padj_pDC',
       'padj_PF', 'padj_T_Cell'],
      dtype='object')


## <br> 3. Prep Data

In [7]:
# padj columns (one per cell type) that each become their own ranking table.
columns_to_process = ['padj_B_Cell', 'padj_Cholangiocyte', 'padj_Hepatocyte', 'padj_HSC',
                      'padj_LSEC', 'padj_Macrophage', 'padj_Neutrophil', 'padj_pDC',
                      'padj_PF', 'padj_T_Cell']

for padj_column in columns_to_process:
    # Single-column frame ordered by ascending padj, with genes lacking a value removed.
    ranked = LRT_MASTER[[padj_column]].sort_values(by=padj_column).dropna()

    # Only the second '_'-separated token is used as the label, so
    # 'padj_B_Cell' -> 'B' and 'padj_T_Cell' -> 'T'. Later cells rely on the
    # resulting names (e.g. subset_B_DataFrame), so the scheme is kept as-is.
    cell_type = padj_column.split('_')[1]
    new_df_name = f"subset_{cell_type}_DataFrame"

    # Publish the frame as a notebook-level variable under that name.
    globals()[new_df_name] = ranked

    # Report what was created, then show the frame itself.
    print(f"Created DataFrame {new_df_name} for {cell_type} subset")
    print(ranked)

Created DataFrame subset_B_DataFrame for B subset
          padj_B_Cell
index                
Fmo3         0.000007
Arhgef26     0.000007
Sds          0.000007
Zbtb16       0.000054
Fhit         0.000663
...               ...
Rubcnl       0.999953
Fign         0.999953
Usp4         0.999953
Bbox1        0.999953
Supt20       0.999955

[5234 rows x 1 columns]
Created DataFrame subset_Cholangiocyte_DataFrame for Cholangiocyte subset
         padj_Cholangiocyte
index                      
Fgb                0.041805
Npas2              0.041805
Shank2             0.116947
Fmo3               0.116947
Cyp2c40            0.116947
...                     ...
Gpx1               0.999986
Gcc2               0.999986
Cyp2c67            0.999986
Trmt1l             0.999986
Slc28a3            0.999986

[5574 rows x 1 columns]
Created DataFrame subset_Hepatocyte_DataFrame for Hepatocyte subset
         padj_Hepatocyte
index                   
Hmox1       1.357855e-29
Exoc3       3.811516e-25
Hspa4l  

In [8]:
# Report the (rows, columns) shape of every per-cell-type ranking table,
# in the same order as the padj columns.
for ranked_frame in (
    subset_B_DataFrame,
    subset_Cholangiocyte_DataFrame,
    subset_Hepatocyte_DataFrame,
    subset_HSC_DataFrame,
    subset_LSEC_DataFrame,
    subset_Macrophage_DataFrame,
    subset_Neutrophil_DataFrame,
    subset_pDC_DataFrame,
    subset_PF_DataFrame,
    subset_T_DataFrame,
):
    print(ranked_frame.shape)

(5234, 1)
(5574, 1)
(16245, 1)
(8675, 1)
(9001, 1)
(10137, 1)
(1386, 1)
(2875, 1)
(6374, 1)
(5095, 1)


## <br> 4. Run GSEApy Prerank

In [10]:
# Preranked GSEA for the B-cell table (genes ranked by the padj_B_Cell column).
# NOTE(review): the ranking metric is an adjusted p-value, which is unsigned and
# contains many tied values (the run logs "The order of those genes will be
# arbitrary"). gseapy's prerank appears to sort rankings in descending order by
# default (ascending=False), which would put the largest padj values first --
# confirm this ordering is intended, or consider a signed statistic.
pre_res_B_Cell = gp.prerank(
    rnk=subset_B_DataFrame,  # single-column frame: gene index -> padj
    gene_sets="TZ_mGSKB-parsed_Hep-Secretome_Symbol.gmt",
    outdir="./Results/B_Cell_LRT_GSEA",  # results are written here; outdir=None would skip writing to disk
    min_size=10,  # gene sets outside [min_size, max_size] are filtered out before testing
    max_size=1000,
    permutation_num=1000,  # reduce to speed up test runs
    seed=6,
    threads=8,
    verbose=True,  # log progress while running
)

The order of those genes will be arbitrary, which may produce unexpected results.
2023-07-05 15:17:31,314 [INFO] Parsing data files for GSEA.............................
2023-07-05 15:17:32,456 [INFO] 18120 gene_sets have been filtered out when max_size=1000 and min_size=10
2023-07-05 15:17:32,457 [INFO] 4813 gene_sets used for further statistical testing.....
2023-07-05 15:17:32,458 [INFO] Start to run GSEA...Might take a while..................
2023-07-05 15:18:28,900 [INFO] Start to generate gseapy reports, and produce figures...
2023-07-05 15:18:28,901 [INFO] Congratulations. GSEApy runs successfully................



In [9]:
# Preranked GSEA for the hepatocyte table (genes ranked by the padj_Hepatocyte column).
# NOTE(review): the ranking metric is an adjusted p-value, which is unsigned, and
# the run logs "The order of those genes will be arbitrary". gseapy's prerank
# appears to sort rankings in descending order by default (ascending=False),
# which would put the largest padj values first -- confirm this ordering is
# intended, or consider a signed statistic.
pre_res_Hepatocyte = gp.prerank(
    rnk=subset_Hepatocyte_DataFrame,  # single-column frame: gene index -> padj
    gene_sets="TZ_mGSKB-parsed_Hep-Secretome_Symbol.gmt",
    outdir="./Results/Hepatocyte_LRT_GSEA",  # results are written here; outdir=None would skip writing to disk
    min_size=10,  # gene sets outside [min_size, max_size] are filtered out before testing
    max_size=1000,
    permutation_num=1000,  # reduce to speed up test runs
    seed=6,
    threads=8,
    verbose=True,  # log progress while running
)

The order of those genes will be arbitrary, which may produce unexpected results.
2023-07-05 14:59:27,933 [INFO] Parsing data files for GSEA.............................
2023-07-05 14:59:28,316 [INFO] 14719 gene_sets have been filtered out when max_size=1000 and min_size=10
2023-07-05 14:59:28,318 [INFO] 8214 gene_sets used for further statistical testing.....
2023-07-05 14:59:28,318 [INFO] Start to run GSEA...Might take a while..................
2023-07-05 15:03:19,454 [INFO] Start to generate gseapy reports, and produce figures...
2023-07-05 15:03:19,455 [INFO] Congratulations. GSEApy runs successfully................



In [None]:
# `pre_res` is not assigned anywhere in the visible notebook (only
# `pre_res_B_Cell` and `pre_res_Hepatocyte` are), so on a fresh kernel this cell
# and the ones below would raise NameError. Bind it explicitly to the result
# being inspected; swap in `pre_res_B_Cell` to look at the B-cell run instead.
pre_res = pre_res_Hepatocyte

# First five rows of the enrichment results table.
pre_res.res2d.head(5)

In [None]:
#pre_res.res2d['Term'] = pre_res.res2d['Term'].str.replace('_Ensembl', '')

In [None]:
#pre_res.res2d.head(5)

In [None]:
# Keep only the result rows whose Term contains 'KEGG'.
kegg_mask = pre_res.res2d['Term'].str.contains('KEGG')
subset = pre_res.res2d[kegg_mask]

In [None]:
# Display the rows of res2d whose Term contains 'KEGG'.
subset

In [None]:
# Plot the term stored under index label 0 of the results table
# (the first row when res2d carries a default integer index).
terms = pre_res.res2d['Term']
pre_res.plot(terms=terms[0])