# DEG Analysis Using LRT
This notebook shows how to perform differentially expressed gene (DEG) analysis using a likelihood ratio test (LRT) on a sample scRNA-seq dataset that contains 17 clusters, each with 20 genes.

In [1]:
from models.poi import Poi
from models.nb import NB
from models.poih import PoiH
from models.nbh import NBH
from models.zipoi import ZIPoi
from models.zinb import ZINB
from models.mast import MAST

import numpy as np
import pandas as pd
import time
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from lrtest import LRTest
from tensorzinb.utils import correct_pvalues_for_multiple_testing,normalize_features

# Model description

`model_classes` lists the supported models, and `methods` maps each model to the fitting methods it supports. For DEG analysis, choose a model from `model_classes` and one of its supported methods from `methods`.

In [2]:
# Every model class that can be selected for DEG analysis in this notebook.
model_classes = [
    Poi,
    NB,
    PoiH,
    NBH,
    ZIPoi,
    ZINB,
    MAST,
]

# Fitting methods supported by each model class; the `method` used for a run
# must come from the list that belongs to the chosen model.
methods = {
    Poi: ["stan", "statsmodels"],
    NB: ["stan", "statsmodels", "tensorflow"],
    PoiH: ["stan", "statsmodels"],
    NBH: ["stan", "statsmodels"],
    ZIPoi: ["stan", "statsmodels"],
    ZINB: ["stan", "statsmodels", "tensorflow"],
    MAST: ["statsmodels"],
}

## Load data

In [3]:
# Gene table: its `cluster` and `gene_id` columns decide which genes are
# tested for which cluster further down.
df_g = pd.read_csv("./data/model_sel_genes.csv")

# Tab-separated per-cell metadata, indexed by the `cell` column.
df_m = pd.read_csv("./data/meta.zip", sep="\t").set_index("cell")

# Short aliases for the verbose metadata column names.
columns = {
    "post-mortem interval (hours)": "PMI",
    "RNA Integrity Number": "RIN",
    "RNA ribosomal percent": "ribo_pct",
    "RNA mitochondr. percent": "mito_pct",
}
df_feature = df_m.rename(columns=columns)

# Per-cell table indexed by `cell`; its columns are later selected by gene id.
df = pd.read_csv("./data/model_sel_count.zip").set_index("cell")


## Generate additional features

In [4]:
# Add the derived columns in one step:
#   genes_log / UMIs_log - natural log of the `genes` / `UMIs` columns
#   scaler               - 10000 divided by `UMIs` (passed to LRTest via `scaler_col`)
df_feature = df_feature.assign(
    genes_log=lambda frame: np.log(frame["genes"]),
    UMIs_log=lambda frame: np.log(frame["UMIs"]),
    scaler=lambda frame: 10000.0 / frame["UMIs"],
)

In [5]:
# Covariates handed to LRTest as the main feature list.
exog_features = [
    "UMIs", "genes", "UMIs_log", "genes_log",
    "sex", "age", "Capbatch",
    "PMI", "RIN", "ribo_pct", "mito_pct",
]

# Covariates handed to LRTest as `infl_features`: the list above, in the same
# order, without "genes", "PMI" and "RIN".
exog_features_infl = [
    name for name in exog_features if name not in ("genes", "PMI", "RIN")
]

# Condition column(s) handed to LRTest.
conditions = ["diagnosis"]

# Columns passed to normalize_features: every covariate above except "sex"
# and "Capbatch".
features_to_norm = [
    name for name in exog_features if name not in ("sex", "Capbatch")
]

# Name of the column holding the per-cell scaling factor.
scaler_col = "scaler"

## DEG analysis

Define the models to use for DEG analysis. Each model is selected from `model_classes`, and the method must be one of those listed for that model in `methods`.

In [6]:
# Model classes to fit, and the single fitting method applied to all of them.
models_to_run = [MAST]
method = "statsmodels"

# Distinct cluster labels found in the gene table; one pass is run per label.
clusters = df_g["cluster"].unique()

In [7]:
import datetime

# Results are written to a pickle file whose name carries today's date
# formatted as MM_DD_YYYY.
suffix = "{:%m_%d_%Y}".format(datetime.date.today())
pickle_name = "lrtest_" + suffix + ".pickle"

In [8]:
# Accumulates one result table per (cluster, model) combination.
dfrs = []
for cluster in clusters:
    print(cluster)

    # Gene ids listed for this cluster in the gene table.
    gene_ids = df_g.loc[df_g["cluster"] == cluster, "gene_id"].values

    # Metadata rows of this cluster's cells, copied so the shared frame is not
    # touched, then passed through normalize_features for `features_to_norm`.
    df_feature1 = normalize_features(
        df_feature.loc[df_feature["cluster"] == cluster].copy(),
        features_to_norm,
    )

    # Inner join on `cell`: keep the selected gene columns only for cells that
    # belong to this cluster.
    # NOTE(review): row order here follows the merge result, not `df_feature1`;
    # presumably LRTest pairs the two frames by index - confirm.
    df_data = (
        df[gene_ids]
        .merge(df_feature1.reset_index()[["cell"]], on="cell")
        .set_index("cell")
    )

    for model_class in models_to_run:
        lrtest = LRTest(
            model_class,
            df_data,
            df_feature1,
            conditions,
            exog_features,
            infl_features=exog_features_infl,
            scaler_col=scaler_col,
        )

        # Run the test, then label the rows with gene id and cluster.
        dfr = (
            lrtest.run(method=method)
            .rename(columns={"subject": "gene_id"})
            .assign(cluster=cluster)
        )
        dfrs.append(dfr)

        # Checkpoint everything collected so far after each fit.
        with open(pickle_name, "wb") as checkpoint_file:
            pickle.dump(dfrs, checkpoint_file)

# Final write once all clusters are done (also runs if the loops were empty).
with open(pickle_name, "wb") as checkpoint_file:
    pickle.dump(dfrs, checkpoint_file)

Neu-NRGN-II
L5/6
Oligodendrocytes
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
OPC
AST-FB
Endothelial
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
Microglia
_logit_fit error, newton, Singular matrix
Neu-NRGN-I
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
IN-VIP
L5/6-CC
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
IN-SV2C
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
L2/3
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit error, newton, Singular matrix
_logit_fit e

There is no need to worry about the `newton, Singular matrix` error message. The logit model will be solved using another method such as `bfgs`.

## Calculate FDR 

In [9]:
# Stack all per-cluster result tables. Each table keeps its own row labels,
# so index values can repeat across clusters in the combined frame.
df_deg = pd.concat(dfrs)

# Multiple-testing correction over the pooled p-values of all clusters.
df_deg = df_deg.assign(
    fdr=correct_pvalues_for_multiple_testing(df_deg["pvalue"])
)

In [10]:
# Rows whose raw p-value is above 0.01.
df_deg.loc[df_deg["pvalue"] > 0.01]

Unnamed: 0,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,model,method,gene_id,cluster,fdr
11,-10236.138084,20538.276167,33,0.280787,-10233.675946,20537.351892,35,0.287397,2.462138,-0.924275,0.085253,mast,statsmodels,ENSG00000198712,L5/6,0.087836
13,-13857.833111,27781.666223,33,0.357099,-13857.074997,27784.149995,35,0.370808,0.758114,2.483772,0.468549,mast,statsmodels,ENSG00000138670,L5/6,0.471141
15,-7895.845935,15857.691871,33,0.282677,-7891.977956,15853.955912,35,0.290355,3.867979,-3.735958,0.020901,mast,statsmodels,ENSG00000198763,L5/6,0.022069
16,-7916.010734,15898.021467,33,0.277937,-7913.182387,15896.364773,35,0.269749,2.828347,-1.656694,0.05911,mast,statsmodels,ENSG00000171867,L5/6,0.061318
18,-21727.829234,43521.658468,33,0.319515,-21724.1395,43518.279001,35,0.294187,3.689734,-3.379467,0.024979,mast,statsmodels,ENSG00000109846,Oligodendrocytes,0.026293
19,-16479.043248,33024.086495,33,0.285981,-16477.110007,33024.220014,35,0.305958,1.93324,0.133519,0.144679,mast,statsmodels,ENSG00000189058,Oligodendrocytes,0.148165
18,-4239.416028,8544.832055,33,0.27343,-4236.58842,8543.176841,35,0.284419,2.827607,-1.655215,0.059154,mast,statsmodels,ENSG00000198763,AST-FB,0.061318
0,-13143.294921,26352.589841,33,0.00501,-13142.065143,26354.130286,35,0.001364,1.229778,1.540444,0.292358,mast,statsmodels,ENSG00000251562,Endothelial,0.296721
6,-4465.361613,8996.723225,33,0.322558,-4462.05085,8994.101701,35,0.327885,3.310762,-2.621524,0.036488,mast,statsmodels,ENSG00000198712,Endothelial,0.03829
9,-3666.808245,7399.616489,33,0.054606,-3664.316681,7398.633362,35,0.277728,2.491563,-0.983127,0.08278,mast,statsmodels,ENSG00000198763,Endothelial,0.085548


In [11]:
# Rows whose FDR-corrected value is above 0.05.
df_deg.loc[df_deg["fdr"] > 0.05]

Unnamed: 0,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,model,method,gene_id,cluster,fdr
11,-10236.138084,20538.276167,33,0.280787,-10233.675946,20537.351892,35,0.287397,2.462138,-0.924275,0.085253,mast,statsmodels,ENSG00000198712,L5/6,0.087836
13,-13857.833111,27781.666223,33,0.357099,-13857.074997,27784.149995,35,0.370808,0.758114,2.483772,0.468549,mast,statsmodels,ENSG00000138670,L5/6,0.471141
16,-7916.010734,15898.021467,33,0.277937,-7913.182387,15896.364773,35,0.269749,2.828347,-1.656694,0.05911,mast,statsmodels,ENSG00000171867,L5/6,0.061318
19,-16479.043248,33024.086495,33,0.285981,-16477.110007,33024.220014,35,0.305958,1.93324,0.133519,0.144679,mast,statsmodels,ENSG00000189058,Oligodendrocytes,0.148165
18,-4239.416028,8544.832055,33,0.27343,-4236.58842,8543.176841,35,0.284419,2.827607,-1.655215,0.059154,mast,statsmodels,ENSG00000198763,AST-FB,0.061318
0,-13143.294921,26352.589841,33,0.00501,-13142.065143,26354.130286,35,0.001364,1.229778,1.540444,0.292358,mast,statsmodels,ENSG00000251562,Endothelial,0.296721
9,-3666.808245,7399.616489,33,0.054606,-3664.316681,7398.633362,35,0.277728,2.491563,-0.983127,0.08278,mast,statsmodels,ENSG00000198763,Endothelial,0.085548
16,-3891.692832,7849.385664,33,0.300953,-3890.725022,7851.450044,35,0.291064,0.96781,2.06438,0.379914,mast,statsmodels,ENSG00000198840,Endothelial,0.384437
5,-6068.045755,12202.091509,33,0.290021,-6065.184018,12200.368036,35,0.277407,2.861737,-1.723473,0.057169,mast,statsmodels,ENSG00000176884,Neu-NRGN-I,0.059625
6,-8363.370944,16792.741887,33,0.336966,-8361.265578,16792.531155,35,0.316988,2.105366,-0.210732,0.121801,mast,statsmodels,ENSG00000111640,Neu-NRGN-I,0.125113
