# LRT using TensorZINB

This notebook shows how to use TensorZINB to perform differentially expressed gene (DEG) analysis with a likelihood-ratio test (LRT). It uses a sample dataset that contains 17 clusters, each with 20 genes. TensorZINB is applied to each cluster separately.

In [1]:
from tensorzinb.lrtest import LRTest
import numpy as np
import pandas as pd
import time
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from tensorzinb.utils import correct_pvalues_for_multiple_testing,normalize_features

load data

In [2]:
# Gene table; its `cluster` and `gene_id` columns drive the per-cluster analysis.
df_g = pd.read_csv('model_sel_genes.csv')

# Per-cell metadata (tab-separated), indexed by the `cell` column.
df_m = pd.read_csv('meta.zip', sep='\t').set_index('cell')

# Map the verbose metadata column names to the short names used in the
# feature lists.
columns = {
    "post-mortem interval (hours)": "PMI",
    "RNA Integrity Number": "RIN",
    "RNA ribosomal percent": "ribo_pct",
    "RNA mitochondr. percent": "mito_pct",
}
df_feature = df_m.rename(columns=columns)

# Count table, indexed by the `cell` column; columns are selected by gene ID
# when the per-cluster data is built.
df = pd.read_csv('model_sel_count.zip').set_index('cell')

create additional features

In [3]:
# Add natural-log versions of the `genes` and `UMIs` columns.
# Keys are the new column names, values are the columns they derive from.
log_sources = {'genes_log': 'genes', 'UMIs_log': 'UMIs'}
for log_col, raw_col in log_sources.items():
    df_feature[log_col] = np.log(df_feature[raw_col])

perform DEG analysis for each cluster

In [4]:
# Feature columns handed to LRTest as its `nb_features` argument.
# NOTE(review): presumably the covariates of the negative-binomial part of
# the model -- confirm against the TensorZINB documentation.
nb_features = [
    "UMIs", "genes", "UMIs_log", "genes_log",
    "sex", "age", "Capbatch",
    "PMI", "RIN", "ribo_pct", "mito_pct",
]

# Feature columns handed to LRTest as `infl_features`. Compared with
# `nb_features`, this list omits "genes", "PMI" and "RIN".
# NOTE(review): presumably the covariates of the zero-inflation part -- confirm.
infl_features = [
    "UMIs", "UMIs_log", "genes_log",
    "sex", "age", "Capbatch",
    "ribo_pct", "mito_pct",
]

# Column(s) handed to LRTest as the conditions argument.
conditions = ["diagnosis"]

# Columns passed to `normalize_features` for each cluster. "sex" and
# "Capbatch" are used as features but are not in this list.
features_to_norm = [
    "UMIs", "genes", "UMIs_log", "genes_log",
    "age", "PMI", "RIN", "ribo_pct", "mito_pct",
]

In [5]:
# Distinct cluster labels from the gene table, in order of first appearance.
clusters = df_g['cluster'].unique()

In [6]:
# Run the TensorZINB likelihood-ratio test on each cluster separately and
# collect the per-cluster result tables in `dfrs`.
dfrs = []

for cluster in clusters:
    # Genes assigned to this cluster in the gene table.
    gene_ids = df_g[df_g.cluster == cluster].gene_id.values

    # Metadata rows for this cluster's cells. Normalization is applied to
    # the cluster subset, not to the whole dataset.
    df_feature1 = df_feature[df_feature.cluster == cluster].copy()
    df_feature1 = normalize_features(df_feature1, features_to_norm)

    # Counts for this cluster's genes, restricted to the cluster's cells.
    df_data = pd.merge(
        df[gene_ids], df_feature1.reset_index()[['cell']], on='cell'
    ).set_index('cell')

    # The inner merge follows the row order of `df` and drops cells that are
    # missing from the count table, while `df_feature1` still has the
    # metadata's own rows and order. Re-select the metadata by the merged
    # index so both frames passed to LRTest hold the same cells in the same
    # order. This is a no-op when they already agree.
    df_feature1 = df_feature1.loc[df_data.index]

    lrtest = LRTest(df_data, df_feature1, conditions, nb_features, infl_features=infl_features)
    dfr = lrtest.run()

    # Label the results: the `test` column is renamed to `gene_id`, and each
    # row is tagged with the cluster it came from.
    dfr = dfr.rename(columns={"test": "gene_id"})
    dfr['cluster'] = cluster

    dfrs.append(dfr)

Metal device set to: Apple M2


In [7]:
# Stack the per-cluster result tables row-wise into a single frame with a
# fresh 0..n-1 index.
df_deg = pd.concat(dfrs, ignore_index=True)

In [8]:
# Correct the p-values for multiple testing over all clusters and genes at
# once, and store the adjusted values in an `fdr` column.
# NOTE(review): the correction method is whatever the helper defaults to
# (presumably Benjamini-Hochberg) -- confirm in tensorzinb.utils.
raw_pvalues = df_deg['pvalue']
df_deg['fdr'] = correct_pvalues_for_multiple_testing(raw_pvalues)

In [9]:
# Display the rows whose adjusted p-value is below the 0.05 threshold.
is_significant = df_deg['fdr'] < 0.05
df_deg[is_significant]

Unnamed: 0,gene_id,llf0,aic0,df0,cpu_time0,llf1,aic1,df1,cpu_time1,llfd,aicd,pvalue,cluster,fdr
0,ENSG00000205542,-14512.756078,29093.512155,34,3.164311,-14428.763158,28929.526316,36,1.603692,83.992920,-163.985840,0.000000e+00,Neu-NRGN-II,0.000000e+00
1,ENSG00000167996,-11492.078389,23052.156779,34,3.164311,-11458.832296,22989.664591,36,1.603692,33.246094,-62.492188,3.663736e-15,Neu-NRGN-II,9.092483e-15
2,ENSG00000176884,-9914.683911,19897.367822,34,3.164311,-9903.139478,19878.278955,36,1.603692,11.544434,-19.088867,9.689831e-06,Neu-NRGN-II,1.238550e-05
3,ENSG00000117632,-13051.690271,26171.380543,34,3.164311,-13030.478357,26132.956715,36,1.603692,21.211914,-38.423828,6.134551e-10,Neu-NRGN-II,1.069614e-09
4,ENSG00000034510,-15046.573941,30161.147882,34,3.164311,-15015.494351,30102.988702,36,1.603692,31.079590,-58.159180,3.175238e-14,Neu-NRGN-II,7.344088e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,ENSG00000183166,-6794.552717,13657.105433,34,0.617688,-6784.383283,13640.766566,36,0.738243,10.169434,-16.338867,3.832402e-05,AST-PP,4.772955e-05
336,ENSG00000152661,-11049.212130,22166.424260,34,0.617688,-11040.328829,22152.657659,36,0.738243,8.883301,-13.766602,1.386856e-04,AST-PP,1.666188e-04
337,ENSG00000087250,-12157.679663,24383.359327,34,0.617688,-12058.530738,24189.061475,36,0.738243,99.148926,-194.297852,0.000000e+00,AST-PP,0.000000e+00
338,ENSG00000145934,-12833.751948,25735.503896,34,0.617688,-12745.074702,25562.149404,36,0.738243,88.677246,-173.354492,0.000000e+00,AST-PP,0.000000e+00


In [10]:
# Write the full result table (all rows, not only the significant ones) to
# CSV. The row index is written as the first, unnamed column.
output_csv = 'df_deg_zinb_sel.csv'
df_deg.to_csv(output_csv)