# DEG analysis

In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from scipy.stats import mannwhitneyu
from scipy.stats import false_discovery_control

In [2]:
test = False

In [3]:
path_data = '/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/in/protein_coding/'
path_metadata = path_data + 'GSE85217_Cavalli_subgroups_information_protein_coding.csv'
path_expr_mat = path_data + 'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_protein_coding.csv'

In [4]:
metadata = pd.read_csv(path_metadata,index_col=0)
expr_mat = pd.read_csv(path_expr_mat,index_col=0)

In [5]:
expr_mat_wnt = expr_mat.loc[metadata[metadata['Subgroup']=='WNT']['Study_ID'].to_list(),:]
expr_mat_grp3 = expr_mat.loc[metadata[metadata['Subgroup']=='Group3']['Study_ID'].to_list(),:]
expr_mat_grp4 = expr_mat.loc[metadata[metadata['Subgroup']=='Group4']['Study_ID'].to_list(),:]
expr_mat_shh = expr_mat.loc[metadata[metadata['Subgroup']=='SHH']['Study_ID'].to_list(),:]

In [6]:
metadata['Subgroup'].unique()

array(['SHH', 'Group3', 'Group4', 'WNT'], dtype=object)

In [7]:
# order
expr_mat_order_vars = pd.concat([expr_mat.mean(axis=0),expr_mat.median(axis=0),expr_mat.std(axis=0)],axis=1)
expr_mat_order_vars.columns = ['mean','median','sd']
expr_mat_order_vars.sort_values(by=['mean','median','sd'],inplace=True)

expr_mat_wnt=expr_mat_wnt[expr_mat_order_vars.index.tolist()]
expr_mat_grp3=expr_mat_grp3[expr_mat_order_vars.index.tolist()]
expr_mat_grp4=expr_mat_grp4[expr_mat_order_vars.index.tolist()]
expr_mat_shh=expr_mat_shh[expr_mat_order_vars.index.tolist()]

In [131]:
def trimmed_mean(expr_mat:np.ndarray,threshold=0.05):

    t_index=int(np.floor(expr_mat.shape[0]*(threshold/2)))

    trim_expr_mat=np.zeros((expr_mat.shape[0]-(2*t_index),expr_mat.shape[1]))

    for i in range(expr_mat.shape[1]):
        trim_expr_mat[:,i]=expr_mat[:,i][t_index:-t_index]

    return trim_expr_mat

In [204]:
expr_mat_wnt_trim=trimmed_mean(np.array(expr_mat_wnt))
expr_mat_grp3_trim=trimmed_mean(np.array(expr_mat_grp3))
expr_mat_grp4_trim=trimmed_mean(np.array(expr_mat_grp4))
expr_mat_shh_trim=trimmed_mean(np.array(expr_mat_shh))

In [177]:

def deg_annwhitneyu(expr_mat,expr_mat_grp1,expr_mat_grp2):

    results=[]
    for i,gene in enumerate(expr_mat.columns):
        grp1 = expr_mat_grp1[:,i]
        grp2 = expr_mat_grp2[:,i]

        stat, p_value = mannwhitneyu(grp1,grp2,alternative='two-sided')
            
        lfc = np.mean(np.log2(grp1)) - np.mean(np.log2(grp2))
    
        all_grp = np.concatenate([grp1,grp2])
    
        results.append({
            'gene':gene,
            'lfc':lfc.round(4),
            'mean':np.mean(all_grp).round(4),
            'std':np.std(all_grp).round(4),
            'statistic': stat,
            'pvalue':p_value
            })

    results_df = pd.DataFrame(results)
    results_df.sort_values(by = ['pvalue'],inplace=True)
    results_df['padj'] = false_discovery_control(results_df['pvalue'])
    return results_df

In [178]:
%%time
res_wnt_vs_grp3 =deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_grp3_trim)

CPU times: user 18.2 s, sys: 23.3 ms, total: 18.3 s
Wall time: 18.3 s


In [198]:
tt_wnt_vs_grp3=res_wnt_vs_grp3[(res_wnt_vs_grp3['padj'] <= 0.05) & (abs(res_wnt_vs_grp3['lfc']) >= 0.9)]

In [199]:
tt_wnt_vs_grp3

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
4527,SLC46A2,0.9259,6.4001,2.1489,9384.0,2.024644e-31,4.358966e-28
3613,CPSF6,1.4505,6.6367,3.5635,9381.0,2.209892e-31,4.358966e-28
4094,ELMOD3,1.1342,6.2717,2.6194,9343.0,6.6667500000000005e-31,5.142526e-28
2700,JPH1,1.1418,6.1679,2.6542,9337.0,7.930122e-31,5.142526e-28
13484,SYNDIG1L,-0.9798,8.2637,2.5466,68.0,1.4531309999999999e-30,7.347221000000001e-28
10603,IKBIP,-1.1777,8.133,2.9067,109.0,4.703751e-30,1.3617360000000002e-27
4153,RMDN2,1.0725,6.8221,2.7991,9158.0,1.26939e-28,1.1827209999999999e-26
2742,ADAM2,-0.9101,6.25,2.2387,238.0,1.771409e-28,1.503565e-26
1031,SMG6,1.1082,4.9036,2.1868,9135.0,2.402402e-28,1.928106e-26
5021,TTPAL,-0.9491,6.2529,2.2772,266.0,3.8416470000000003e-28,2.70786e-26


In [193]:
%%time
res_wnt_vs_grp4 =deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_grp4_trim)

CPU times: user 18.7 s, sys: 103 ms, total: 18.8 s
Wall time: 19.1 s


In [213]:
tt_wnt_vs_grp4 = res_wnt_vs_grp4[(res_wnt_vs_grp4['padj']<=0.05) & (res_wnt_vs_grp4['lfc'] >= 0.9)]

In [214]:
tt_wnt_vs_grp4

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
4094,ELMOD3,1.1967,5.2793,2.1637,21080.0,3.6176769999999996e-38,2.827778e-35
4527,SLC46A2,0.9165,5.7496,1.7343,21080.0,3.6177109999999997e-38,2.827778e-35
6187,NTS,0.9003,6.0805,1.8086,21077.0,3.7947069999999994e-38,2.827778e-35
3613,CPSF6,1.392,5.6809,2.8545,21075.0,3.917424e-38,2.827778e-35
6978,DNA2,0.9279,5.9806,1.9313,21054.0,5.470472e-38,2.827778e-35
2700,JPH1,1.1985,5.2055,2.1963,21053.0,5.558062e-38,2.827778e-35
4799,ZNF639,0.93,5.6553,1.7997,21043.0,6.514387999999999e-38,2.827778e-35
9602,PRPSAP1,0.966,7.2783,2.4335,20982.0,1.710274e-37,3.192127e-35
5278,BFSP1,0.9213,5.2905,1.7317,20950.0,2.831423e-37,3.558374e-35
3703,SAYSD1,1.0696,5.426,2.0046,20908.0,5.474627e-37,5.346666e-35


In [205]:
%%time
res_wnt_vs_ssh = deg_annwhitneyu(expr_mat=expr_mat,expr_mat_grp1=expr_mat_wnt_trim,expr_mat_grp2=expr_mat_shh_trim)

CPU times: user 17.7 s, sys: 4.23 ms, total: 17.7 s
Wall time: 17.7 s


In [211]:
tt_wnt_vs_ssh = res_wnt_vs_ssh[(res_wnt_vs_ssh['padj']<=0.05) & (abs(res_wnt_vs_ssh['lfc'])>=0.9)]

In [212]:
tt_wnt_vs_ssh

Unnamed: 0,gene,lfc,mean,std,statistic,pvalue,padj
10523,STOX1,-1.0034,7.5247,1.9125,11.0,2.837857e-35,4.219836e-32
2700,JPH1,1.2418,5.4594,2.5101,14471.0,2.961814e-35,4.219836e-32
3613,CPSF6,1.2873,6.457,3.1729,14393.0,1.5549929999999998e-34,6.463988e-32
7295,DET1,-1.1271,8.3773,2.4529,96.0,1.7283439999999999e-34,7.00498e-32
8384,ADAM33,-1.0448,9.1633,2.6889,101.0,1.92088e-34,7.242166000000001e-32
3703,SAYSD1,1.0424,5.8127,2.2152,14368.0,2.6358469999999998e-34,8.557321e-32
13434,ABAT,0.9411,6.4118,2.1812,14336.0,5.165703e-34,1.350748e-31
7509,TM4SF5,-1.0485,9.1721,2.5143,187.0,1.168165e-33,2.2280350000000003e-31
7013,SECISBP2L,-1.1661,9.1383,2.8255,223.0,2.471203e-33,4.088076e-31
1031,SMG6,1.1722,4.4188,2.0427,14172.0,1.549993e-32,1.7572369999999998e-30
