# DEG Analysis Using LRT
This notebook shows how to perform differentially expressed gene (DEG) analysis using the likelihood ratio test (LRT) on a sample scRNA-seq dataset that contains 17 clusters, each with 20 genes.

In [1]:
import datetime
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from lrtest import LRTest
from models.mast import MAST
from models.nb import NB
from models.nbh import NBH
from models.poi import Poi
from models.poih import PoiH
from models.zinb import ZINB
from models.zipoi import ZIPoi
from tensorzinb.utils import correct_pvalues_for_multiple_testing,normalize_features

# Model description

`model_classes` defines supported models and `methods` defines the supported methods for each model. You can choose the model from `model_classes` and its corresponding supported methods in `methods` for DEG analysis.

In [2]:
# Every model class that can be picked for the DEG analysis below.
model_classes = [Poi, NB, PoiH, NBH, ZIPoi, ZINB, MAST]

# Fitting methods supported by each model class, keyed by the class itself.
methods = {
    Poi: ["stan", "statsmodels"],
    NB: ["stan", "statsmodels", "tensorflow"],
    PoiH: ["stan", "statsmodels"],
    NBH: ["stan", "statsmodels"],
    ZIPoi: ["stan", "statsmodels"],
    ZINB: ["stan", "statsmodels", "tensorflow"],
    MAST: ["statsmodels"],
}

## Load data

In [3]:
# Gene selection table; its `cluster` and `gene_id` columns drive the analysis loop.
df_g = pd.read_csv("./data/model_sel_genes.csv")

# Per-cell metadata (tab-separated), indexed by the `cell` column.
df_m = pd.read_csv("./data/meta.zip", sep="\t").set_index("cell")

# Short aliases for the verbose metadata column names.
columns = {
    "post-mortem interval (hours)": "PMI",
    "RNA Integrity Number": "RIN",
    "RNA ribosomal percent": "ribo_pct",
    "RNA mitochondr. percent": "mito_pct",
}
df_feature = df_m.rename(columns=columns)

# Count table indexed by `cell`; its columns are selected by gene id later on.
df = pd.read_csv("./data/model_sel_count.zip").set_index("cell")


## Generate additional features

In [4]:
# Derived per-cell covariates: natural logs of the `genes` and `UMIs` columns,
# plus `scaler` = 10000 / UMIs (passed to LRTest through `scaler_col`).
df_feature = df_feature.assign(
    genes_log=np.log(df_feature["genes"]),
    UMIs_log=np.log(df_feature["UMIs"]),
    scaler=10000.0 / df_feature["UMIs"],
)

In [5]:
# Covariates handed to LRTest as its feature list (alongside `conditions`).
exog_features = [
    "UMIs",
    "genes",
    "UMIs_log",
    "genes_log",
    "sex",
    "age",
    "Capbatch",
    "PMI",
    "RIN",
    "ribo_pct",
    "mito_pct",
]

# Covariates handed to LRTest as `infl_features`: the list above without
# "genes", "PMI" and "RIN" (presumably the zero-inflation part -- TODO confirm).
exog_features_infl = [
    name for name in exog_features if name not in ("genes", "PMI", "RIN")
]

# Column(s) whose effect is being tested.
conditions = ["diagnosis"]

# Columns passed to normalize_features: every covariate except "sex" and
# "Capbatch" (presumably because those two are categorical -- TODO confirm).
features_to_norm = [
    name for name in exog_features if name not in ("sex", "Capbatch")
]

# Name of the df_feature column holding the per-cell 10000 / UMIs factor.
scaler_col = "scaler"

## DEG analysis

Choose the models to use for DEG analysis. Each entry of `models_to_run` must come from `model_classes`, and `method` must be one of the methods listed for that model in `methods`.

In [6]:
# Model class(es) to fit, and the single fitting method applied to all of them.
models_to_run = [MAST]
method = "statsmodels"

# Distinct cluster labels from the gene selection table, in order of appearance.
clusters = df_g["cluster"].unique()

In [7]:
# Results are pickled to a file stamped with today's date (MM_DD_YYYY), so runs
# on different days do not overwrite each other. `datetime` is imported in the
# notebook's top imports cell.
suffix = datetime.date.today().strftime("%m_%d_%Y")
pickle_name = "lrtest_{}.pickle".format(suffix)

In [8]:
# Run the likelihood ratio test cluster by cluster, collecting one result
# table per (cluster, model) pair in `dfrs`.
dfrs = []
for cluster in clusters:
    print(cluster)

    # Genes to test in this cluster.
    gene_ids = df_g.loc[df_g["cluster"] == cluster, "gene_id"].values

    # Covariates of this cluster's cells. The slice is copied before being
    # handed to normalize_features for the columns in `features_to_norm`.
    df_feature1 = normalize_features(
        df_feature.loc[df_feature["cluster"] == cluster].copy(),
        features_to_norm,
    )

    # Counts of the selected genes, restricted to this cluster's cells through
    # a merge on the cell id, then re-indexed by cell.
    df_data = (
        df[gene_ids]
        .merge(df_feature1.reset_index()[["cell"]], on="cell")
        .set_index("cell")
    )

    for model_class in models_to_run:
        lrtest = LRTest(
            model_class,
            df_data,
            df_feature1,
            conditions,
            exog_features,
            infl_features=exog_features_infl,
            scaler_col=scaler_col,
        )

        # Tag every result row with its gene id column name and cluster label.
        dfr = (
            lrtest.run(method=method)
            .rename(columns={"subject": "gene_id"})
            .assign(cluster=cluster)
        )
        dfrs.append(dfr)

        # Checkpoint: rewrite the pickle with everything gathered so far, so a
        # partial run still leaves its results on disk.
        with open(pickle_name, "wb") as output_file:
            pickle.dump(dfrs, output_file)

# Final write; also produces the file when the loops above did not iterate.
with open(pickle_name, "wb") as output_file:
    pickle.dump(dfrs, output_file)

The `newton, Singular matrix` error message that may appear while the cell above runs can be safely ignored: when it occurs, the logit model is solved using another method such as `bfgs`.

## Calculate FDR 

In [9]:
# Stack the per-cluster result tables and add an `fdr` column holding the
# p-values corrected for multiple testing across all stacked rows.
df_deg = pd.concat(dfrs).assign(
    fdr=lambda frame: correct_pvalues_for_multiple_testing(frame["pvalue"])
)

In [10]:
# Genes whose raw (uncorrected) LRT p-value is below 0.01.
df_deg.loc[df_deg["pvalue"] < 0.01]

Unnamed: 0,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,model,method,gene_id,cluster,fdr
0,-14626.855370,29319.710740,33,0.285879,-14539.241421,29148.482842,35,0.281256,87.613949,-171.227898,0.000000e+00,mast,statsmodels,ENSG00000205542,Neu-NRGN-II,0.000000e+00
1,-11617.525948,23301.051896,33,0.286814,-11577.748596,23225.497191,35,0.288291,39.777353,-75.554705,0.000000e+00,mast,statsmodels,ENSG00000167996,Neu-NRGN-II,0.000000e+00
2,-10005.156379,20076.312758,33,0.291184,-9994.799306,20059.598612,35,0.275999,10.357073,-16.714146,3.176730e-05,mast,statsmodels,ENSG00000176884,Neu-NRGN-II,3.816566e-05
3,-13150.258869,26366.517739,33,0.287466,-13130.155081,26330.310161,35,0.257161,20.103789,-36.207577,1.857956e-09,mast,statsmodels,ENSG00000117632,Neu-NRGN-II,2.938164e-09
4,-15138.365127,30342.730254,33,0.292162,-15120.040634,30310.081268,35,0.294834,18.324493,-32.648987,1.100965e-08,mast,statsmodels,ENSG00000034510,Neu-NRGN-II,1.656320e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,-15865.385485,31796.770969,33,0.279884,-15617.177379,31304.354758,35,0.278395,248.208106,-492.416211,0.000000e+00,mast,statsmodels,ENSG00000107317,AST-PP,0.000000e+00
16,-11166.688207,22399.376414,33,0.279628,-11158.002793,22386.005587,35,0.277253,8.685414,-13.370828,1.690335e-04,mast,statsmodels,ENSG00000152661,AST-PP,1.928570e-04
17,-12395.808251,24857.616503,33,0.286410,-12327.492551,24724.985102,35,0.281698,68.315700,-132.631401,0.000000e+00,mast,statsmodels,ENSG00000087250,AST-PP,0.000000e+00
18,-12920.783495,25907.566991,33,0.288469,-12853.650313,25777.300625,35,0.283972,67.133183,-130.266365,0.000000e+00,mast,statsmodels,ENSG00000145934,AST-PP,0.000000e+00


In [11]:
# Genes whose multiple-testing-corrected value (`fdr`) is below 0.05.
df_deg.loc[df_deg["fdr"] < 0.05]

Unnamed: 0,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,model,method,gene_id,cluster,fdr
0,-14626.855370,29319.710740,33,0.285879,-14539.241421,29148.482842,35,0.281256,87.613949,-171.227898,0.000000e+00,mast,statsmodels,ENSG00000205542,Neu-NRGN-II,0.000000e+00
1,-11617.525948,23301.051896,33,0.286814,-11577.748596,23225.497191,35,0.288291,39.777353,-75.554705,0.000000e+00,mast,statsmodels,ENSG00000167996,Neu-NRGN-II,0.000000e+00
2,-10005.156379,20076.312758,33,0.291184,-9994.799306,20059.598612,35,0.275999,10.357073,-16.714146,3.176730e-05,mast,statsmodels,ENSG00000176884,Neu-NRGN-II,3.816566e-05
3,-13150.258869,26366.517739,33,0.287466,-13130.155081,26330.310161,35,0.257161,20.103789,-36.207577,1.857956e-09,mast,statsmodels,ENSG00000117632,Neu-NRGN-II,2.938164e-09
4,-15138.365127,30342.730254,33,0.292162,-15120.040634,30310.081268,35,0.294834,18.324493,-32.648987,1.100965e-08,mast,statsmodels,ENSG00000034510,Neu-NRGN-II,1.656320e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,-15865.385485,31796.770969,33,0.279884,-15617.177379,31304.354758,35,0.278395,248.208106,-492.416211,0.000000e+00,mast,statsmodels,ENSG00000107317,AST-PP,0.000000e+00
16,-11166.688207,22399.376414,33,0.279628,-11158.002793,22386.005587,35,0.277253,8.685414,-13.370828,1.690335e-04,mast,statsmodels,ENSG00000152661,AST-PP,1.928570e-04
17,-12395.808251,24857.616503,33,0.286410,-12327.492551,24724.985102,35,0.281698,68.315700,-132.631401,0.000000e+00,mast,statsmodels,ENSG00000087250,AST-PP,0.000000e+00
18,-12920.783495,25907.566991,33,0.288469,-12853.650313,25777.300625,35,0.283972,67.133183,-130.266365,0.000000e+00,mast,statsmodels,ENSG00000145934,AST-PP,0.000000e+00
