# Porting DESeq2 into Python using rpy2

I will use a small example of [ERCC transcripts](https://www.thermofisher.com/order/catalog/product/4456740) from [samples A and B in the MAQC data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3272078/).

In [10]:
import pandas as pd
# Load the HSA21 gene table and collect its "hgnc_symbol" column as a plain
# list of strings. Non-string entries are skipped (presumably NaN for genes
# without a symbol -- verify against the CSV).
genesHSA21 = pd.read_csv("./dataSaveOriginal/HSA21genes.csv")
genesHSA21List = [
    gene
    for gene in genesHSA21["hgnc_symbol"].tolist()
    if isinstance(gene, str)  # isinstance instead of comparing type objects
]

# Last expression of the cell, so the notebook displays the list.
genesHSA21List



['PRDM15',
 'PFKL',
 'HUNK',
 'COL6A1',
 'IFNAR1',
 'SOD1',
 'COL6A2',
 'SIK1',
 'DNMT3L',
 'TRPM2',
 'TMEM50B',
 'APP',
 'DOP1B',
 'URB1',
 'CXADR',
 'BTG3',
 'C21orf91',
 'CHODL',
 'TMPRSS15',
 'NCAM2',
 'MRPL39',
 'JAM2',
 'ATP5PF',
 'GABPA',
 'ADAMTS1',
 'ADAMTS5',
 'HSPA13',
 'SAMSN1',
 'USP25',
 'N6AMT1',
 'RWDD2B',
 'USP16',
 'CCT8',
 'MAP3K7CL',
 'BACH1',
 'CLDN17',
 'CLDN8',
 'TIAM1',
 'SCAF4',
 'VPS26C',
 'DYRK1A',
 'KCNJ6',
 'KCNJ15',
 'ERG',
 'ETS2',
 'LCA5L',
 'MX1',
 'C2CD2',
 'MIS18A',
 'CFAP298',
 'SYNJ1',
 'PAXBP1',
 'IFNAR2',
 'IFNGR2',
 'GART',
 'SON',
 'DONSON',
 'KCNE2',
 'RCAN1',
 'CLIC6',
 'RUNX1',
 'CBR1',
 'CBR3',
 'MORC3',
 'CHAF1B',
 'CLDN14',
 'SIM2',
 'HLCS',
 'ABCG1',
 'TFF3',
 'TFF2',
 'TFF1',
 'TMPRSS3',
 'UBASH3A',
 'RSPH1',
 'SLC37A1',
 'PDE9A',
 'WDR4',
 'NDUFV3',
 'PKNOX1',
 'CBS',
 'U2AF1',
 'CRYAA',
 'HSF2BP',
 'RRP1B',
 'PDXK',
 'CSTB',
 'RRP1',
 'AGPAT3',
 'TRAPPC10',
 'GATD3A',
 'ICOSLG',
 'AIRE',
 'CFAP410',
 'LRRC3',
 'ITGB2',
 'SLX9',
 'FTCD'

In [3]:
from itertools import groupby
from json import load
import pandas as pd
import scanpy as sc
import numpy as np
import sys
import tests.loadScanpy as loadScanpy
import classifyClusters as classify
import os
from scipy.sparse import csr_matrix
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import diffxpy.api as de
from diffexpr.diffexpr.py_deseq import py_DESeq2
import pandas as pd 
import numpy as np


####################################################################################################
# Global Settings
####################################################################################################
# Display limits: never truncate printed NumPy arrays or hide DataFrame columns.
np.set_printoptions(threshold=sys.maxsize)
pd.set_option("display.max_columns", sys.maxsize)

# scanpy verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3

# File locations used by the notebook.
# Other input path noted by the author: ./dataSaveOriginal/rawDataset5000.h5ad
inputData = "./dataSaveOriginal/rawDataset.h5ad"
results_file = './output/savedData.h5ad'
outputDirectory = "./outputPDFs/"

In [4]:
# Read the dataset at `inputData` with scanpy; every later cell works on this
# `adata` object.
adata = sc.read(inputData)
# De-duplicate the variable names in place (presumably by suffixing repeated
# names -- see the AnnData documentation for the exact scheme).
adata.var_names_make_unique()

In [5]:
# Totals of adata.X over all observations.
# NOTE(review): this presumably relies on adata.X being a scipy sparse matrix
# whose rows iterate as 1 x n_vars matrices, so zip(*adata.X) yields a single
# tuple holding every row and sum() adds those rows together -- confirm the
# matrix type before reusing this pattern.
array = [sum(x) for x in zip(*adata.X)]
# The first (presumably only) element is converted to a dense array and its
# first row is taken, leaving a 1-D array.
array = array[0].toarray()[0]

def getColSample(string):
    """Return the per-column totals of ``adata.X`` for one sample.

    Selects the observations of the module-level ``adata`` whose
    ``obs["sample"]`` equals ``string`` and sums their ``X`` values down the
    observation axis.

    Args:
        string: Sample label to match against ``adata.obs["sample"]``.

    Returns:
        A 1-D ``numpy.ndarray`` with one total per column of ``adata.X``.
        A label that matches no observation gives all zeros.
    """
    subset = adata[adata.obs["sample"] == string]
    # A single matrix-level reduction replaces adding the rows together one
    # by one in Python. scipy sparse matrices return a 1 x n_vars np.matrix
    # from sum(axis=0); np.asarray(...).ravel() flattens that (and is a no-op
    # for an already 1-D dense result).
    return np.asarray(subset.X.sum(axis=0)).ravel()

cols = getColSample("CON_DS2U")
rows = adata.var["features"].values
print("Test 1")

# Sample labels noted by the original author:
# CON_DS2U CON_H9 CON_IMR CON_ihtc DS_2DS3 DS_DS1 DS_DSP
#
# Count-table column -> sample label. The "N_"/"S_" prefix and the trailing
# replicate number are parsed later when the design table is built
# (presumably N = control samples, S = DS samples -- confirm).
sample_columns = {
    "N_1": "CON_DS2U",
    "N_2": "CON_H9",
    "N_3": "CON_IMR",
    "S_1": "DS_2DS3",
    "S_2": "DS_DSP",
}

# One row per feature: the feature id plus one integer count column per sample.
df = pd.DataFrame(rows, columns=['id'])
for column, label in sample_columns.items():
    df[column] = getColSample(label)
    # Cast each column from its own values. The hand-written casts assigned
    # the S_2 values to S_1, clobbering S_1 and leaving S_2 uncast.
    df[column] = df[column].astype(int)

df.head(15)


: 

: 

In [None]:
# Design table: one row per count column of df (every column except 'id').
column_names = pd.DataFrame({'samplename': df.columns})
sample_df = column_names[column_names['samplename'] != "id"]

# Split each column name into its group letter (before the underscore) and
# its replicate number (after it).
names = sample_df['samplename']
sample_df = sample_df.assign(
    sample=names.str.extract('([NS])_', expand=False),
    replicate=names.str.extract('_([123])', expand=False),
)

# Index the table by sample name.
sample_df.index = sample_df.samplename
sample_df

In [None]:
# Build the DESeq2 wrapper from the count table (df) and the per-sample design
# table (sample_df); the model uses both the replicate and the sample group.
dds = py_DESeq2(count_matrix = df,
               design_matrix = sample_df,
               design_formula = '~ replicate + sample',
               gene_column = 'id') # <- telling DESeq2 this should be the gene ID column
    
# Run the analysis, then request the contrast on the 'sample' factor between
# levels 'S' and 'N' (in DESeq2's convention the first level is the numerator
# -- confirm against the wrapper's documentation).
dds.run_deseq() 
dds.get_deseq_result(contrast = ['sample','S','N'])
# The result table is read back from the wrapper's `deseq_result` attribute.
res = dds.deseq_result 
res.head()

In [None]:
# Order the result table by raw p-value, smallest first, and show the top 50.
# (Sort on 'padj' instead to order by the adjusted p-value.)
res = res.sort_values(by='pvalue', ascending=True)
res.head(50)

In [None]:
# Last expression of the cell, so the notebook displays the returned table.
dds.normalized_count() #DESeq2 normalized count

In [None]:
# The order of the coefficients listed here is what the `coef` index passed
# to lfcShrink in the next cell refers to.
dds.comparison # show coefficients for GLM

In [None]:
# From the last cell we see the arrangement of the GLM coefficients, so we can
# pick the matching "coef" index for lfcShrink.
# The comparison of interest is the 'sample' effect of 'S' versus 'N'
# (presumably listed as 'sample_S_vs_N'). coef=4 assumes it is the 4th
# coefficient shown by dds.comparison -- TODO confirm before relying on it.
lfc_res = dds.lfcShrink(coef=4, method='apeglm')
lfc_res.head()