# DGE analysis

In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from scipy.stats import mannwhitneyu
from scipy.stats import false_discovery_control

In [2]:
test = False

In [3]:
path_data = '/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/in/protein_coding/'
path_metadata = path_data + 'GSE85217_Cavalli_subgroups_information_protein_coding.csv'
path_expr_mat = path_data + 'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_protein_coding.csv'

In [4]:
metadata = pd.read_csv(path_metadata,index_col=0)
expr_mat = pd.read_csv(path_expr_mat,index_col=0)

In [5]:
expr_mat_wnt = expr_mat.loc[metadata[metadata['Subgroup']=='WNT']['Study_ID'].to_list(),:]
expr_mat_grp3 = expr_mat.loc[metadata[metadata['Subgroup']=='Group3']['Study_ID'].to_list(),:]
expr_mat_grp4 = expr_mat.loc[metadata[metadata['Subgroup']=='Group4']['Study_ID'].to_list(),:]
expr_mat_shh = expr_mat.loc[metadata[metadata['Subgroup']=='SHH']['Study_ID'].to_list(),:]

In [6]:
metadata['Subgroup'].unique()

array(['SHH', 'Group3', 'Group4', 'WNT'], dtype=object)

In [7]:
# order
expr_mat_order_vars = pd.concat([expr_mat.mean(axis=0),expr_mat.median(axis=0),expr_mat.std(axis=0)],axis=1)
expr_mat_order_vars.columns = ['mean','median','sd']
expr_mat_order_vars.sort_values(by=['mean','median','sd'],inplace=True)

expr_mat_wnt=expr_mat_wnt[expr_mat_order_vars.index.tolist()]
expr_mat_grp3=expr_mat_grp3[expr_mat_order_vars.index.tolist()]
expr_mat_grp4=expr_mat_grp4[expr_mat_order_vars.index.tolist()]
expr_mat_shh=expr_mat_shh[expr_mat_order_vars.index.tolist()]

In [131]:
def trimmed_mean(expr_mat:np.ndarray,threshold=0.05):

    t_index=int(np.floor(expr_mat.shape[0]*(threshold/2)))

    trim_expr_mat=np.zeros((expr_mat.shape[0]-(2*t_index),expr_mat.shape[1]))

    for i in range(expr_mat.shape[1]):
        trim_expr_mat[:,i]=expr_mat[:,i][t_index:-t_index]

    return trim_expr_mat

In [204]:
expr_mat_wnt_trim=trimmed_mean(np.array(expr_mat_wnt))
expr_mat_grp3_trim=trimmed_mean(np.array(expr_mat_grp3))
expr_mat_grp4_trim=trimmed_mean(np.array(expr_mat_grp4))
expr_mat_shh_trim=trimmed_mean(np.array(expr_mat_shh))

In [177]:

def deg_annwhitneyu(expr_mat,expr_mat_grp1,expr_mat_grp2):

    results=[]
    for i,gene in enumerate(expr_mat.columns):
        grp1 = expr_mat_grp1[:,i]
        grp2 = expr_mat_grp2[:,i]

        stat, p_value = mannwhitneyu(grp1,grp2,alternative='two-sided')
            
        lfc = np.mean(np.log2(grp1)) - np.mean(np.log2(grp2))
    
        all_grp = np.concatenate([grp1,grp2])
    
        results.append({
            'gene':gene,
            'lfc':lfc.round(4),
            'mean':np.mean(all_grp).round(4),
            'std':np.std(all_grp).round(4),
            'statistic': stat,
            'pvalue':p_value
            })

    results_df = pd.DataFrame(results)
    results_df.sort_values(by = ['pvalue'],inplace=True)
    results_df['padj'] = false_discovery_control(results_df['pvalue'])
    return results_df

In [178]:
%%time
res_wnt_vs_grp3 =deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_grp3_trim)

CPU times: user 18.2 s, sys: 23.3 ms, total: 18.3 s
Wall time: 18.3 s


In [257]:
tt_wnt_vs_grp3=res_wnt_vs_grp3[(res_wnt_vs_grp3['padj'] <= 0.05) & (abs(res_wnt_vs_grp3['lfc']) >= 0.7)]

In [258]:
tt_wnt_vs_grp3

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
4527,SLC46A2,0.9259,6.4001,2.1489,9384.0,2.024644e-31,4.358966e-28
3613,CPSF6,1.4505,6.6367,3.5635,9381.0,2.209892e-31,4.358966e-28
10990,CD300A,0.7002,8.9179,2.2582,9379.0,2.342643e-31,4.358966e-28
2756,NFKBIB,-0.8230,5.0759,1.4130,8.0,2.556752e-31,4.358966e-28
6187,NTS,0.8494,6.9027,2.1424,9371.0,2.957601e-31,4.358966e-28
...,...,...,...,...,...,...,...
1001,BCL3,0.7380,4.4051,1.6844,8472.0,5.782037e-21,8.101848e-20
11719,HOPX,-0.7408,6.6920,2.3447,974.0,2.469654e-20,3.203043e-19
8768,NR4A2,-0.7399,8.4201,2.8172,985.0,3.187450e-20,4.062495e-19
1816,TMEM101,0.8012,5.2705,2.3827,8354.0,8.982613e-20,1.084335e-18


In [259]:
%%time
res_wnt_vs_grp4 =deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_grp4_trim)

CPU times: user 19.3 s, sys: 169 ms, total: 19.5 s
Wall time: 20.1 s


In [260]:
tt_wnt_vs_grp4 = res_wnt_vs_grp4[(res_wnt_vs_grp4['padj']<=0.05) & (res_wnt_vs_grp4['lfc'] >= 0.7)]

In [261]:
tt_wnt_vs_grp4

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
4094,ELMOD3,1.1967,5.2793,2.1637,21080.0,3.617677e-38,2.827778e-35
4527,SLC46A2,0.9165,5.7496,1.7343,21080.0,3.617711e-38,2.827778e-35
10990,CD300A,0.7508,8.0086,1.9361,21079.0,3.675778e-38,2.827778e-35
6187,NTS,0.9003,6.0805,1.8086,21077.0,3.794707e-38,2.827778e-35
3613,CPSF6,1.3920,5.6809,2.8545,21075.0,3.917424e-38,2.827778e-35
...,...,...,...,...,...,...,...
1816,TMEM101,0.8826,4.5315,1.9051,19537.0,2.870213e-28,2.888386e-27
1847,PHGDH,0.7581,4.6176,1.4802,19432.0,1.189956e-27,1.115764e-26
2260,SLA2,0.8964,4.9311,2.1199,19332.0,4.540512e-27,4.015863e-26
1001,BCL3,0.7384,4.0064,1.3327,19013.0,2.945061e-25,2.199232e-24


In [262]:
%%time
res_wnt_vs_ssh = deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_shh_trim)

CPU times: user 17.6 s, sys: 105 ms, total: 17.7 s
Wall time: 18.2 s


In [263]:
tt_wnt_vs_ssh = res_wnt_vs_ssh[(res_wnt_vs_ssh['padj']<=0.05) & (abs(res_wnt_vs_ssh['lfc'])>=0.7)]

In [264]:
tt_wnt_vs_ssh

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
7616,USP21,0.7267,5.4839,1.3979,14478.0,2.550057e-35,4.219836e-32
4094,ELMOD3,0.7359,6.9496,2.0039,14475.0,2.719055e-35,4.219836e-32
10523,STOX1,-1.0034,7.5247,1.9125,11.0,2.837857e-35,4.219836e-32
2700,JPH1,1.2418,5.4594,2.5101,14471.0,2.961814e-35,4.219836e-32
4527,SLC46A2,0.7791,6.4075,1.7559,14469.0,3.091150e-35,4.219836e-32
...,...,...,...,...,...,...,...
11187,IL12A,-0.7760,8.0730,1.8408,996.0,9.637033e-27,2.251233e-25
9637,LEMD2,0.7965,5.4278,1.8856,13459.0,1.646230e-26,3.711916e-25
1816,TMEM101,0.8862,4.7763,2.1963,13401.0,4.768619e-26,1.002709e-24
1001,BCL3,0.7652,4.1242,1.5296,13284.0,3.955524e-25,7.229752e-24


In [265]:
# set genes
tt_genes_wnt_vs_grp3 = tt_wnt_vs_grp3['gene'].tolist()
tt_genes_wnt_vs_grp4 = tt_wnt_vs_grp4['gene'].tolist()
tt_genes_wnt_vs_ssh = tt_wnt_vs_ssh['gene'].tolist()

In [273]:
m = len(set(tt_genes_wnt_vs_grp3 + tt_genes_wnt_vs_grp4 + tt_genes_wnt_vs_ssh))
cross_tt_genes = pd.DataFrame(data = np.zeros((m,3)),index=list(set(tt_genes_wnt_vs_grp3 + tt_genes_wnt_vs_grp4 + tt_genes_wnt_vs_ssh)))

for i,lg in enumerate([tt_genes_wnt_vs_grp3,tt_genes_wnt_vs_grp4,tt_genes_wnt_vs_ssh]):
    for g in lg:
        cross_tt_genes.loc[g,i]=1

cross_tt_genes.columns = ['G3','G4','SSH']
cross_tt_genes['all'] = cross_tt_genes.sum(axis=1) / 3

In [274]:
cross_tt_genes

Unnamed: 0,G3,G4,SSH,all
SULT2B1,1.0,1.0,1.0,1.000000
NTS,1.0,1.0,0.0,0.666667
ADAM2,1.0,0.0,0.0,0.333333
ABAT,0.0,0.0,1.0,0.333333
LDLR,0.0,0.0,1.0,0.333333
...,...,...,...,...
DLL1,1.0,0.0,0.0,0.333333
DHCR7,0.0,0.0,1.0,0.333333
PKIA,1.0,0.0,0.0,0.333333
DOCK5,0.0,0.0,1.0,0.333333


In [275]:
cross_tt_genes[cross_tt_genes['all']==1]

Unnamed: 0,G3,G4,SSH,all
SULT2B1,1.0,1.0,1.0,1.0
PRPSAP1,1.0,1.0,1.0,1.0
KIF3B,1.0,1.0,1.0,1.0
SAYSD1,1.0,1.0,1.0,1.0
JPH1,1.0,1.0,1.0,1.0
SMG6,1.0,1.0,1.0,1.0
PCDHB7,1.0,1.0,1.0,1.0
RMDN2,1.0,1.0,1.0,1.0
ELMOD3,1.0,1.0,1.0,1.0
EIF2B3,1.0,1.0,1.0,1.0


In [286]:
path_dge='/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/dge/'

In [290]:
res_wnt_vs_grp3.to_csv(path_dge + 'res/res_wnt_vs_grp3.csv',index=False)
res_wnt_vs_grp4.to_csv(path_dge + 'res/res_wnt_vs_grp4.csv',index=False)
res_wnt_vs_ssh.to_csv(path_dge + 'res/res_wnt_vs_ssh.csv',index=False)

tt_wnt_vs_grp3.to_csv(path_dge + 'tt/tt_wnt_vs_grp3.csv',index=False)
tt_wnt_vs_grp4.to_csv(path_dge + 'tt/tt_wnt_vs_grp4.csv',index=False)
tt_wnt_vs_ssh.to_csv(path_dge + 'tt/tt_wnt_vs_ssh.csv',index=False)

cross_tt_genes.to_csv(path_dge + 'cross_tt_genes_wnt_vs_all.csv')