# Differential gene expression analysis

## Setup

In [1]:
import os
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [2]:
%matplotlib inline

# Lift pandas' display limits so wide or long frames are shown in full.
for _option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_option, None)

## Import data

In [None]:
# Load the tab-separated raw count table; lines starting with '#' are skipped.
file_path = "../datasets/E-MTAB-6863-raw-counts.tsv"
df = pd.read_table(file_path, comment='#')
df.head()

In [None]:
# Load the experiment design for all patients; lines starting with '#' are
# skipped. The disease filter below is disabled, so every row is kept.
file_path = "../datasets/E-MTAB-6863-experiment-design.tsv"
edf = pd.read_table(file_path, comment='#')
# edf = edf[edf["Sample Characteristic[disease]"] == "non-alcoholic fatty liver disease"]
edf.head()

In [None]:
# Keep the gene identifier plus one count column per run listed in `edf`.
# NOTE(review): the disease filter on `edf` is commented out where `edf` is
# loaded, so this keeps every run in the design table, not only the
# non-alcoholic fatty liver disease patients.
sample_columns = list(edf["Run"])
counts_df = df.loc[:, ["Gene ID", *sample_columns]]
print(counts_df.shape)
counts_df.head()

## EDA

In [None]:
# Per-sample summary statistics, rounded for display.
summary_df = counts_df.describe().round()
# Library size per sample from the exact column totals. Rebuilding it as
# count * mean from the rounded summary is off by up to 0.5 reads per gene.
summary_df.loc["sum"] = counts_df.sum(numeric_only=True)
summary_df

In [None]:
# Keep genes with at least MIN_CPM counts per million (CPM) in at least
# MIN_SAMPLES samples (29%).
MIN_CPM = 10
MIN_SAMPLES = 20

counts_df1 = counts_df[counts_df.columns[1:]]  # every column except "Gene ID"
# Raw count equivalent to MIN_CPM for each sample, from its exact library size.
threshold = counts_df1.sum(axis=0) / 1e6 * MIN_CPM
# `>=` makes the cut-off inclusive, matching the rule stated above; the
# comparison aligns `threshold` with the sample columns.
filtered_df = counts_df[(counts_df1 >= threshold).sum(axis=1) >= MIN_SAMPLES]
filtered_df.shape

In [None]:
# Move the gene identifier into the row index so only count columns remain.
filtered_df = filtered_df.set_index("Gene ID", drop=True)
filtered_df.head()

### PCA analysis

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca = PCA(n_components=3)
# PCA treats rows as observations. `filtered_df` is genes x samples at this
# point, so transpose it: each sample becomes one observation and each gene is
# standardised across samples. Without the transpose the projection has one
# row per gene and cannot be matched to the per-sample stage labels.
samples_by_genes = filtered_df.T
normalized_df = pd.DataFrame(
    StandardScaler().fit_transform(samples_by_genes),
    index=samples_by_genes.index,
)

pca.fit(normalized_df)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
# Project the standardised matrix onto the three components (this call
# refits the PCA model), then report the shapes before and after.
projected = pca.fit_transform(normalized_df)
for _matrix in (normalized_df, projected):
    print(_matrix.shape)

In [None]:
# Stage label for each row of the experiment design, used to group the points.
# NOTE(review): the indexing below assumes row i of `projected` belongs to
# row i of `edf` — confirm the PCA input has one row per sample in that order.
y = edf["Sample Characteristic[disease staging]"]

# Share of the retained variance carried by each of the three components.
ex_variance = np.var(projected, axis=0)
ex_variance_ratio = ex_variance / np.sum(ex_variance)

Xax, Yax, Zax = projected[:, 0], projected[:, 1], projected[:, 2]

# Per-stage plot styling; every value in `y` must be a key of these dicts.
cdict = {'early': 'red', 'advanced': 'green'}
labl = {'early': 'Early', 'advanced': 'Advanced'}
marker = {'early': '*', 'advanced': 'o'}
alpha = {'early': .3, 'advanced': .5}

fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_facecolor('white')

# One scatter call per stage so each group gets its own legend entry.
for stage in np.unique(y):
    rows = np.flatnonzero(y == stage)
    ax.scatter(
        Xax[rows], Yax[rows], Zax[rows],
        c=cdict[stage], s=40, label=labl[stage],
        marker=marker[stage], alpha=alpha[stage],
    )

axis_labels = (
    (ax.set_xlabel, "Component 1"),
    (ax.set_ylabel, "Component 2"),
    (ax.set_zlabel, "Component 3"),
)
for set_label, text in axis_labels:
    set_label(text, fontsize=14)

ax.legend()
plt.show()

In [None]:
# Preview the filtered counts; rows are still genes here (transposed next).
filtered_df.head()

In [None]:
# Flip to samples x genes so rows line up with the per-run metadata.
# NOTE: re-running this cell flips the frame back to genes x samples.
filtered_df = filtered_df.transpose()
filtered_df.head()

In [None]:
# Sample sheet indexed by "Run", with the two characteristics renamed to the
# short column names "condition" and "stage".
metadata = (
    edf[["Run", "Sample Characteristic[disease]", "Sample Characteristic[disease staging]"]]
    .rename(
        columns={
            "Sample Characteristic[disease]": "condition",
            "Sample Characteristic[disease staging]": "stage",
        }
    )
    .set_index("Run")
)
metadata

In [None]:
# samples_to_keep = ~metadata.condition.isna()
# counts_df = counts_df.loc[samples_to_keep]
# metadata = metadata.loc[samples_to_keep]

In [None]:
# counts_df = counts_df.fillna(0)
# genes_to_keep = counts_df.columns[counts_df.sum(axis=0) >= 10]
# counts_df = counts_df[genes_to_keep]

In [None]:
# counts_df = counts_df.round(0)
# counts_df.head()

In [None]:
# Check the dimensions of the filtered matrix after the transpose.
filtered_df.shape

In [None]:
# Orient the unfiltered counts as samples x genes. "Gene ID" must become the
# column labels before transposing: a bare `.T` would leave the gene IDs as a
# row of strings inside the count matrix and number the columns 0..n-1.
counts_df = counts_df.set_index("Gene ID").T
counts_df.head(5)

## Differential expression analysis

In [None]:
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    # Fit on the CPM-filtered matrix: it is samples x genes and indexed by run
    # like `metadata`, with gene IDs as column names, so the result index is
    # the gene ID that the gene-list step looks up.
    counts=filtered_df,
    metadata=metadata,
    design_factors="stage",
    refit_cooks=True,
    # Share the inference engine with the fit itself; otherwise only the
    # DeseqStats step receives it.
    inference=inference,
)

In [None]:
# Run the DESeq2 fitting pipeline on the dataset; results are stored in `dds`.
dds.deseq2()

In [None]:
# Print the dataset's summary representation to inspect what the fit added.
print(dds)

In [None]:
# Set up the statistical tests on the fitted dataset, reusing `inference`.
stat_res = DeseqStats(dds, inference=inference)

In [None]:
# Run the tests and print their summary, then show the first ten result rows.
stat_res.summary()
stat_res.results_df.head(10)

In [None]:
# Rank genes by adjusted p-value, smallest first.
results_df = stat_res.results_df.sort_values("padj", ascending=True)
results_df.head(n=10)

In [None]:
# Number of genes with an adjusted p-value below 0.05.
int((results_df["padj"] < 0.05).sum())

## Preparing gene list

In [None]:
# Gene names for the top-ranked results; rows with a missing name are dropped.
# NOTE(review): `[:99]` selects 99 genes — confirm whether 100 was intended.
top_gene_ids = results_df.index[:99].tolist()
has_name = df['Gene Name'].notna()
gene_list = df.loc[df['Gene ID'].isin(top_gene_ids) & has_name, 'Gene Name']

In [None]:
# Join the names with `','` between them; the outermost quotes are not part of
# the string. Presumably meant for pasting as a quoted, comma-separated list.
"','".join(gene_list.tolist())