In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import pickle

In [10]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")
from utils import get_tr_data, get_lipids_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
healthy_data = get_tr_data(True)
sz_data = get_tr_data(False)

In [8]:
common_genes = list(set(healthy_data.columns) & set(sz_data.columns) - set(["region", "batch"]))
A = healthy_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
A = np.array(A)
B = sz_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
B = np.array(B)
ttest = ttest_ind(A, B, axis=1)
p_vals_corrected = multipletests(ttest.pvalue.reshape(-1,), alpha=0.05, method="holm-sidak")
p_vals_corrected = p_vals_corrected[1].reshape(35,-1)
p_vals_corrected = pd.DataFrame(p_vals_corrected, index=healthy_data.region.unique(), columns=common_genes)
genes = p_vals_corrected[p_vals_corrected < .05].stack()
genes

1  Cerebellar Grey Matter              ENSG00000173166    0.005742
15 1ary Motor (BA4)                    ENSG00000025772    0.002790
28 2ary Auditory, Wernicke (BA22p)     ENSG00000008441    0.021120
33 Anterior Inferior Temporal (BA20a)  ENSG00000169967    0.048344
                                       ENSG00000115966    0.048291
34 Dorsolateral Prefrontal (BA9)       ENSG00000080371    0.026713
52 Amygdala                            ENSG00000085224    0.032036
                                       ENSG00000048649    0.036455
56 Nucleus Accumbens                   ENSG00000085224    0.004179
                                       ENSG00000102908    0.008513
                                       ENSG00000038219    0.040155
                                       ENSG00000127914    0.009514
                                       ENSG00000106636    0.011221
57 Caudate                             ENSG00000198162    0.032574
63 Substantia Nigra                    ENSG00000089048    0.01

In [17]:
print(pd.DataFrame(genes).to_latex())

\begin{tabular}{llr}
\toprule
                   &                 &         0 \\
\midrule
1  Cerebellar Grey Matter & ENSG00000173166 &  0.005742 \\
15 1ary Motor (BA4) & ENSG00000025772 &  0.002790 \\
28 2ary Auditory, Wernicke (BA22p) & ENSG00000008441 &  0.021120 \\
33 Anterior Inferior Temporal (BA20a) & ENSG00000169967 &  0.048344 \\
                   & ENSG00000115966 &  0.048291 \\
34 Dorsolateral Prefrontal (BA9) & ENSG00000080371 &  0.026713 \\
52 Amygdala & ENSG00000085224 &  0.032036 \\
                   & ENSG00000048649 &  0.036455 \\
56 Nucleus Accumbens & ENSG00000085224 &  0.004179 \\
                   & ENSG00000102908 &  0.008513 \\
                   & ENSG00000038219 &  0.040155 \\
                   & ENSG00000127914 &  0.009514 \\
                   & ENSG00000106636 &  0.011221 \\
57 Caudate & ENSG00000198162 &  0.032574 \\
63 Substantia Nigra & ENSG00000089048 &  0.014087 \\
64 Globus Pallidus & ENSG00000129534 &  0.036828 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(genes).to_latex())


In [11]:
print("\n".join(np.array(genes.index.tolist())[:, 1].tolist()))

ENSG00000173166
ENSG00000025772
ENSG00000008441
ENSG00000115966
ENSG00000169967
ENSG00000080371
ENSG00000085224
ENSG00000048649
ENSG00000106636
ENSG00000085224
ENSG00000038219
ENSG00000102908
ENSG00000127914
ENSG00000198162
ENSG00000089048
ENSG00000129534


# Lipids

In [16]:
healthy_data = get_lipids_data(True)
sz_data = get_lipids_data(False)

In [61]:
healthy_data.head()

Unnamed: 0,batch,region,human,posFT21454,posFT22500,posFT23044,posFT23164,posFT23296,posFT23406,posFT24809,...,negFT02874,negFT02842,negFT02798,negFT02757,negFT02685,negFT03483,negFT03357,negFT03333,negFT03309,negFT04068
0,170426_BM_pos_1-30_MS1_HA_259,25 Posterior Inferior Temporal (BA20p),HA,-0.321858,-0.18667,-0.831546,-0.481312,-0.317817,-0.273785,-0.657452,...,-0.144085,0.107967,-0.19324,0.079825,-1.455671,-0.085955,-0.415611,-0.689455,-0.308895,0.324224
1,170426_BM_pos_1-30_MS2_HA_502,2 Anterior Supramarginal (BA40a),HA,-0.269051,-0.324846,-0.691367,-0.309655,-0.263693,-0.467548,-0.870317,...,-0.573214,-0.176788,0.201614,0.260553,0.048211,-0.159775,-0.767876,-0.631643,-0.277298,-0.049696
2,170426_BM_pos_1-30_MS3_HA_16,21 FEF Lateral (BA8),HA,-1.15597,-1.659437,-1.220121,-1.535495,-1.000576,-1.353224,-1.436105,...,-0.923116,-0.096289,0.226105,0.334437,-0.784905,-0.886897,-0.842676,-0.742599,-0.147094,-1.307362
3,170426_BM_pos_1-30_MS4_HA_556,"65 Hippocampus, CA1",HA,0.163326,0.432867,0.555181,0.311151,-0.064507,0.258723,0.458305,...,0.537205,0.361747,0.38937,-0.121017,-2.0727,0.432871,0.87514,0.703214,-0.013925,1.088353
4,170426_BM_pos_1-30_MS5_HA_98,29 Precuneus (BA7m),HA,-1.429402,-1.037357,-1.572588,-0.774326,-1.081257,-1.344704,-1.608834,...,-1.83227,-1.092563,-0.648667,-0.336176,-0.139349,-1.347401,-1.402256,-0.778622,-0.709377,-2.455449


In [62]:
sz_data.head()

Unnamed: 0,batch,region,human,posFT24227,posFT25123,posFT25655,posFT25793,posFT25962,posFT26104,posFT27598,...,negFT02655,negFT02623,negFT02583,negFT02495,negFT02443,negFT03339,negFT03175,negFT03147,negFT03110,negFT04014
0,Batch1_Sch_Br_x30_pos_1-01_29,1 Cerebellar Grey Matter,H1,2.424921,2.838137,1.524328,1.822651,2.354328,2.474153,1.719755,...,1.452766,0.67005,0.816402,0.309825,0.306229,1.731028,1.567002,0.93733,1.124983,1.957048
1,Batch2_Sch_Br_x30_pos_2-01_167,1 Cerebellar Grey Matter,H2,-0.403839,-1.092249,-0.683415,-0.854029,-0.4377,-1.090015,-0.531587,...,0.46153,0.223532,0.706481,0.707628,0.566704,-1.215483,0.451452,0.604473,1.123945,-1.415834
2,Batch3_Sch_Br_x30_pos_3-01_261,1 Cerebellar Grey Matter,H3,0.013676,-0.25548,-0.156406,-0.032195,-0.040578,-0.152843,0.050003,...,-0.102317,-0.272963,-0.621122,0.100003,0.421818,-0.301936,-0.110691,0.173696,0.691605,0.153238
3,Batch4_Sch_Br_x30_pos_5-01_343,1 Cerebellar Grey Matter,H5,-1.023772,-2.069189,-0.839931,-1.133143,-1.191844,-1.724849,-0.745246,...,-0.173373,-0.087506,0.328943,0.321068,1.433593,-1.591868,-0.062515,1.104718,1.279782,-1.621301
4,Batch1_Sch_Br_x30_pos_1-10_22,10 2ary/3ary Visual Posterior (BA18/19p),H1,0.214234,0.214688,-0.189341,-0.060685,0.180887,0.407153,-0.290135,...,0.42267,0.364077,0.330135,0.670403,1.035297,0.639611,0.407011,0.252742,0.372836,0.825966


In [111]:
A = healthy_data.groupby("region")["human"].apply(pd.Series.tolist)
B = sz_data.groupby("region")["human"].apply(pd.Series.tolist)

all_fours = set(A[A.apply(len) == 4].index) & set(B[B.apply(len) == 4].index)

A = healthy_data[healthy_data.region.isin(all_fours)].groupby("region").apply(pd.Series.tolist)
A = np.array(A.sort_index().tolist())[:, :, 3:].astype(float)

B = sz_data[sz_data.region.isin(all_fours)].groupby("region").apply(pd.Series.tolist)
B = np.array(B.sort_index().tolist())[:, :, 3:].astype(float)

ttest = ttest_ind(A, B, axis=1)
p_vals_corrected = multipletests(ttest.pvalue.reshape(-1,), alpha=0.05, method="holm-sidak")
p_vals_corrected = p_vals_corrected[1].reshape(52, -1)
p_vals_corrected = pd.DataFrame(p_vals_corrected, index=list(all_fours))
lipids = p_vals_corrected[p_vals_corrected < .05].stack()
lipids

31 Supramarginal Posterior (BA40p)  82    0.044054
dtype: float64