# Likelihood-ratio test (LRT) using TensorZINB

This notebook shows how to use TensorZINB to perform differentially expressed gene (DEG) analysis on a sample dataset that contains 17 clusters, each with 20 genes. TensorZINB is applied to each cluster separately.

In [1]:
from tensorzinb.lrtest import LRTest
import numpy as np
import pandas as pd
import time
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from tensorzinb.utils import correct_pvalues_for_multiple_testing,normalize_features

Load the gene list, the cell metadata, and the count matrix.

In [2]:
# Gene table; later cells read its `cluster` and `gene_id` columns.
df_g = pd.read_csv('model_sel_genes.csv')

# Tab-separated cell metadata, indexed by the `cell` column.
df_m = pd.read_csv('meta.zip', sep='\t').set_index('cell')

# Shorten the verbose metadata column names to the identifiers used in the
# feature lists further down.
columns = {
    "post-mortem interval (hours)": "PMI",
    "RNA Integrity Number": "RIN",
    "RNA ribosomal percent": "ribo_pct",
    "RNA mitochondr. percent": "mito_pct",
}
df_feature = df_m.rename(columns=columns)

# Count table, indexed by the `cell` column; its remaining columns are
# selected by gene id in the per-cluster loop.
df = pd.read_csv('model_sel_count.zip').set_index('cell')

Create additional features: log-transformed versions of the `genes` and `UMIs` columns.

In [3]:
# Add natural-log versions of the `genes` and `UMIs` columns as
# `genes_log` and `UMIs_log` (in that order).
for raw_col in ('genes', 'UMIs'):
    df_feature[raw_col + '_log'] = np.log(df_feature[raw_col])

Perform DEG analysis for each cluster.

In [4]:
# Feature columns passed to LRTest as `nb_features`
# (presumably the negative-binomial part of the model -- confirm in the
# TensorZINB documentation).
nb_features = [
    "UMIs", "genes", "UMIs_log", "genes_log",
    "sex", "age", "Capbatch",
    "PMI", "RIN", "ribo_pct", "mito_pct",
]

# Feature columns passed to LRTest as `infl_features`
# (presumably the zero-inflation part of the model -- confirm as above).
infl_features = [
    "UMIs", "UMIs_log", "genes_log",
    "sex", "age", "Capbatch",
    "ribo_pct", "mito_pct",
]

# Column(s) passed to LRTest as the condition under test.
conditions = ["diagnosis"]

# Columns handed to normalize_features before fitting.
features_to_norm = [
    "UMIs", "genes", "UMIs_log", "genes_log",
    "age", "PMI", "RIN", "ribo_pct", "mito_pct",
]

In [5]:
# Distinct cluster labels, in order of first appearance in the gene table.
clusters = df_g['cluster'].unique()

In [6]:
# Run the likelihood-ratio test one cluster at a time and collect one result
# frame per cluster in `dfrs`.
dfrs = []

for cluster in clusters:
    # Gene ids listed for this cluster in the gene table.
    gene_ids = df_g.loc[df_g.cluster == cluster, 'gene_id'].values

    # Metadata rows of the cells in this cluster, taken as an independent copy
    # and normalized per cluster.
    df_feature1 = df_feature[df_feature.cluster == cluster].copy()
    df_feature1 = normalize_features(df_feature1, features_to_norm)

    # Counts for this cluster's genes, restricted to cells that also have
    # metadata (same inner-join semantics as merging on `cell`).
    df_data = df.loc[df.index.isin(df_feature1.index), gene_ids]

    # The counts and the features are handed to LRTest as two separate frames,
    # so make the feature rows follow exactly the same cells, in the same
    # order, as the count rows instead of relying on the two input files
    # happening to share an ordering.
    df_feature1 = df_feature1.loc[df_data.index]

    lrtest = LRTest(df_data, df_feature1, conditions, nb_features, infl_features=infl_features)
    dfr = lrtest.run()

    # Rename the result's `test` column to `gene_id` and tag the rows with the
    # cluster they belong to. The rename map is inlined so the notebook-level
    # `columns` dict from the data-loading cell is not overwritten.
    dfr = dfr.rename(columns={"test": "gene_id"})
    dfr['cluster'] = cluster

    dfrs.append(dfr)

Metal device set to: Apple M2


In [7]:
# Stack the per-cluster result frames row-wise under a fresh 0..n-1 index.
df_deg = pd.concat(dfrs, ignore_index=True)

In [8]:
# Multiple-testing correction over the p-values of all clusters together;
# the corrected values are stored in the `fdr` column.
df_deg = df_deg.assign(fdr=correct_pvalues_for_multiple_testing(df_deg['pvalue']))

In [9]:
# Show the rows whose corrected p-value is below 0.05.
df_deg.loc[df_deg['fdr'].lt(0.05)]

Unnamed: 0,gene_id,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,cluster,fdr
33,ENSG00000138670,-13832.290595,27732.581189,34,0.318333,-13831.321845,27734.643689,36,0.104632,0.96875,2.0625,0.379557,L5/6,0.382936
36,ENSG00000171867,-7795.76613,15659.53226,34,0.318333,-7794.957536,15661.915072,36,0.104632,0.808594,2.382812,0.445484,L5/6,0.446798
37,ENSG00000010404,-7615.503141,15299.006281,34,0.318333,-7612.956266,15297.912531,36,0.104632,2.546875,-1.09375,0.078326,L5/6,0.0807
73,ENSG00000168702,-28234.145237,56536.290475,34,2.279833,-28231.223362,56534.446725,36,0.885801,2.921875,-1.84375,0.053833,OPC,0.056144
98,ENSG00000198763,-4246.842923,8561.685845,34,0.936911,-4244.706936,8561.413872,36,0.471034,2.135986,-0.271973,0.118128,AST-FB,0.12134
100,ENSG00000251562,-13034.244623,26136.489246,34,0.403177,-13032.619623,26137.239246,36,0.153348,1.625,0.75,0.196912,Endothelial,0.200449
106,ENSG00000198712,-4374.425106,8816.850213,34,0.403177,-4371.643368,8815.286736,36,0.153348,2.781738,-1.563477,0.061931,Endothelial,0.064197
109,ENSG00000198763,-3576.352047,7220.704094,34,0.403177,-3574.92767,7221.855339,36,0.153348,1.424377,1.151245,0.240658,Endothelial,0.243523
110,ENSG00000114933,-3904.755967,7877.511934,34,0.403177,-3903.85579,7879.71158,36,0.153348,0.900177,2.199646,0.406498,Endothelial,0.408903
116,ENSG00000198840,-3807.207168,7682.414336,34,0.403177,-3806.483047,7684.966094,36,0.153348,0.724121,2.551758,0.48475,Endothelial,0.48475


In [10]:
# Write the full result table (row index included, the to_csv default) to disk.
df_deg.to_csv(path_or_buf='df_deg_zinb_sel.csv')