In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import multipletests
import pickle

In [3]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("..")
from utils import get_tr_data, get_lipids_data

# Transcriptomis data

In [4]:
healthy_data = get_tr_data(True)
sz_data = get_tr_data(False)

In [5]:
region_names = healthy_data.region.unique().tolist()

In [6]:
common_genes = list(
    set(healthy_data.columns) & set(sz_data.columns) - set(["region", "batch"])
)
A = healthy_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
A = np.array(A)
B = sz_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
B = np.array(B)

In [7]:
sz_var = B.var(axis=1)
h_var = A.var(axis=1)

stattest = mannwhitneyu(sz_var, h_var, axis=1)

(stattest.pvalue < 0.05).mean()

0.9714285714285714

In [8]:
alternative = "greater"

In [9]:
stattest = ttest_ind(A, B, alternative=alternative, equal_var=False, axis=1)

In [10]:
(stattest.pvalue < 0.05).mean()

0.1899233164380939

In [11]:
for method in ["fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky"]:
    p_vals_corrected = multipletests(
        stattest.pvalue.reshape(
            -1,
        ),
        alpha=0.05,
        method=method,
    )
    print(method, (p_vals_corrected[1] < 0.05).mean())

fdr_bh 0.019284757000775903
fdr_by 0.0
fdr_tsbh 0.019980048166547425
fdr_tsbky 0.01800098751498907


In [12]:
p_vals_corrected = multipletests(
    stattest.pvalue.reshape(
        -1,
    ),
    alpha=0.05,
    method="fdr_tsbh",
)

In [13]:
p_vals_corrected = p_vals_corrected[1].reshape(35, -1)

In [14]:
p_vals_corrected = pd.DataFrame(
    p_vals_corrected, index=healthy_data.region.unique(), columns=common_genes
)

In [15]:
(p_vals_corrected < 0.05).mean().mean()

0.019980048166547425

In [16]:
genes = p_vals_corrected[p_vals_corrected < 0.05].stack()
genes

1  Cerebellar Grey Matter   ENSG00000090382    0.041462
                            ENSG00000082212    0.040922
                            ENSG00000146243    0.035675
                            ENSG00000143450    0.028103
                            ENSG00000164975    0.036790
                                                 ...   
75 Cerebellar White Matter  ENSG00000156162    0.021517
                            ENSG00000133773    0.041953
                            ENSG00000213463    0.028968
                            ENSG00000151881    0.036245
                            ENSG00000189403    0.044333
Length: 9914, dtype: float64

In [17]:
len(genes)

9914

In [18]:
for region_name in region_names:
    difference = genes[region_name].index.tolist()
    with open(
        f"../../data/output/{region_name.replace('/', '|')} {alternative}.txt", "w"
    ) as fout:
        fout.write("\n".join(difference))

# Lipids

In [20]:
lipids_meta = pd.read_csv("../../data/input/TL_combined.csv")
lipids_meta.head()

Unnamed: 0.1,Unnamed: 0,polarity_peak,Lipid.class,Lipid.species,fragmentation,adduct,other.mode,MZ.target,RT.data,init_peak,polarity,schiz_peak,healthy_predicted,rt_error,ppm_error,schiz_peak_WO_,corr_all,corr_HC,polarity_schiz_peak
0,0,posFT21454,DG,DG 34:1,DG(16:0_18:1). DG(16:1_18:0),M+NH4,negFT09400,612.557849,9.833337,FT21454,pos,FT24227,FT21454,0.010223,0.330946,FT24227,0.534097,0.97717,posFT24227
1,1,posFT22500,DG,DG 35:1,DG(17:1_18:0),M+NH4,negFT09832,626.573761,10.592758,FT22500,pos,FT25123,FT22500,0.014286,0.299755,FT25123,0.689857,0.87511,posFT25123
2,2,posFT23044,DG,DG 36:4,DG(16:0_20:4),M+NH4,negFT10106,634.542347,8.601461,FT23044,pos,FT25655,FT23044,0.014547,-0.445968,FT25655,0.588468,0.973725,posFT25655
3,3,posFT23164,DG,DG 36:3,DG(16:0_20:3). DG(18:1_18:2),M+NH4,negFT10169,636.557585,9.128819,FT23164,pos,FT25793,FT23164,0.013258,-0.156961,FT25793,0.432344,0.979049,posFT25793
4,4,posFT23296,DG,DG 36:2,DG(18:0_18:2). DG(18:1_18:1),M+NH4,negFT10242,638.573939,10.122875,FT23296,pos,FT25962,FT23296,0.004684,0.119584,FT25962,0.511568,0.976847,posFT25962


In [21]:
columns = [
    "polarity_peak",
    "polarity_schiz_peak",
    "Lipid.class",
    "Lipid.species",
]
lipids_meta = lipids_meta[columns]

In [22]:
lipids_meta.columns = ["h_id", "sz_id", "class", "species"]

In [23]:
print(lipids_meta["class"].unique())

['DG' 'SM' 'Cer' 'HexCer' 'CAR' 'CE' 'PS' 'TG' 'Cholesterol' 'LPC' 'PE'
 'LPE' 'PG' 'PE_P' 'PE_O' 'PC' 'PC_P' 'PC_O' 'PI' 'SulfoHexCer' 'FA']


In [24]:
lipids_meta[lipids_meta["species"].str.endswith("1")]

Unnamed: 0,h_id,sz_id,class,species
0,posFT21454,posFT24227,DG,DG 34:1
1,posFT22500,posFT25123,DG,DG 35:1
5,posFT23406,posFT26104,DG,DG 36:1
11,posFT28338,posFT31217,SM,SM d35:1
13,posFT29177,posFT32121,SM,SM d36:1
...,...,...,...,...
358,negFT01882,negFT01689,FA,FA 19:1
360,negFT02166,negFT01929,FA,FA 20:1
367,negFT03015,negFT02744,FA,FA 22:1
374,negFT03483,negFT03339,FA,FA 24:1


In [25]:
meta_fa_only = lipids_meta[lipids_meta["class"] == "FA"]

In [80]:
healthy_data = get_lipids_data(True)
sz_data = get_lipids_data(False)

### entorhinal cortex

In [70]:
healthy_54 = healthy_data.loc[
    healthy_data.region.str.contains("Entorhinal"), meta_fa_only.h_id
].astype(float)

sz_54 = sz_data.loc[
    sz_data.region.str.contains("Entorhinal"), meta_fa_only.sz_id
].astype(float)

stattest = ttest_ind(healthy_54, sz_54, axis=0, alternative="greater")

In [71]:
stattest.pvalue

array([0.56704359, 0.7266679 , 0.94340542, 0.43483154, 0.78986893,
       0.75880588, 0.09098458, 0.19710794, 0.41651129, 0.84920179,
       0.88522033, 0.77153827, 0.79347765, 0.41765456, 0.2149289 ,
       0.336984  , 0.87945113, 0.87862532, 0.99240367, 0.94484974,
       0.9283771 , 0.2040184 , 0.03317193, 0.04605322, 0.33896465,
       0.9850159 , 0.88304087, 0.93321042, 0.34004906, 0.97774103])

In [72]:
for method in ["fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky"]:
    p_vals_corrected = multipletests(
        stattest.pvalue.reshape(
            -1,
        ),
        alpha=0.05,
        method=method,
    )
    print(method, (p_vals_corrected[1] < 0.05).mean())

fdr_bh 0.0
fdr_by 0.0
fdr_tsbh 0.0
fdr_tsbky 0.0


In [None]:
pvals_corrected = multipletests(stattest.pvalue, method="Bonferroni")
df = pd.DataFrame(
    np.stack([stattest.pvalue, pvals_corrected[1]]).T,
    index=meta_fa_only.species.values,
    columns=["pval", "padj"],
)
df.sort_values("pval")

### prefrontal cortex [Brodmann area (BA) 8].

In [81]:
pd.concat([healthy_data, sz_data], a)

Unnamed: 0,batch,region,human,posFT21454,posFT22500,posFT23044,posFT23164,posFT23296,posFT23406,posFT24809,...,negFT02655,negFT02623,negFT02583,negFT02495,negFT02443,negFT03339,negFT03175,negFT03147,negFT03110,negFT04014
0,170426_BM_pos_1-30_MS1_HA_259,25 Posterior Inferior Temporal (BA20p),HA,-0.321858,-0.186670,-0.831546,-0.481312,-0.317817,-0.273785,-0.657452,...,,,,,,,,,,
1,170426_BM_pos_1-30_MS2_HA_502,2 Anterior Supramarginal (BA40a),HA,-0.269051,-0.324846,-0.691367,-0.309655,-0.263693,-0.467548,-0.870317,...,,,,,,,,,,
2,170426_BM_pos_1-30_MS3_HA_16,21 FEF Lateral (BA8),HA,-1.155970,-1.659437,-1.220121,-1.535495,-1.000576,-1.353224,-1.436105,...,,,,,,,,,,
3,170426_BM_pos_1-30_MS4_HA_556,"65 Hippocampus, CA1",HA,0.163326,0.432867,0.555181,0.311151,-0.064507,0.258723,0.458305,...,,,,,,,,,,
4,170426_BM_pos_1-30_MS5_HA_98,29 Precuneus (BA7m),HA,-1.429402,-1.037357,-1.572588,-0.774326,-1.081257,-1.344704,-1.608834,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,Batch4_Sch_Br_x30_pos_5-08_275,8 Cingulate Posterior (BA23a),H5,,,,,,,,...,0.451108,0.546410,0.471613,0.687231,0.313841,0.757708,0.796724,0.499958,0.595330,0.779982
288,Batch1_Sch_Br_x30_pos_1-09_23,9 2ary Somatosensory Cortex,H1,,,,,,,,...,-0.681754,-0.135017,0.186984,0.407401,0.527567,-0.694895,-0.918294,-0.297782,0.029223,-1.063673
289,Batch2_Sch_Br_x30_pos_2-09_97,9 2ary Somatosensory Cortex,H2,,,,,,,,...,-0.991031,-0.609883,-0.024355,-0.126096,0.386976,-1.307066,-1.151982,-0.760314,-0.678591,-1.469617
290,Batch3_Sch_Br_x30_pos_3-09_232,9 2ary Somatosensory Cortex,H3,,,,,,,,...,1.944500,1.007795,0.392589,-1.421747,-1.449170,2.025034,2.588376,1.943208,0.247463,2.129110


In [76]:
shapes = (
    pd.concat([healthy_data, sz_data])
    .groupby("region")
    .apply(lambda x: x.shape[0] == 4)
)
all_fours = shapes[shapes].index.tolist()

healthy_data = healthy_data[healthy_data.region.isin(all_fours)]
sz_data = sz_data[sz_data.region.isin(all_fours)]

In [77]:
healthy_data.shape, sz_data.shape

((0, 382), (0, 382))

In [72]:
common_genes = list(
    set(healthy_data.columns) & set(sz_data.columns) - set(["region", "batch", "human"])
)
A = healthy_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
A = np.array(A)
B = sz_data.groupby("region")[common_genes].apply(pd.Series.tolist).tolist()
B = np.array(B)

  B = np.array(B)


In [73]:
A.shape

(59, 4, 4)

In [36]:
healthy_ba8 = (
    healthy_data[healthy_data.region.str.contains("BA8")]
    .drop(columns=["batch", "region", "human"])
    .astype(float)
)

sz_ba8 = (
    sz_data[sz_data.region.str.contains("BA8")]
    .drop(columns=["batch", "region", "human"])
    .astype(float)
)

stattest = mannwhitneyu(healthy_ba8, sz_ba8, axis=0, method="exact")

In [40]:
stattest.pvalue.shape

(379,)

In [39]:
for method in ["fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky"]:
    p_vals_corrected = multipletests(
        stattest.pvalue.reshape(
            -1,
        ),
        alpha=0.05,
        method=method,
    )
    print(method, (p_vals_corrected[1] < 0.05).mean())

fdr_bh 0.0
fdr_by 0.0
fdr_tsbh 0.0
fdr_tsbky 0.0


In [None]:
A = healthy_data.groupby("region")["human"].apply(pd.Series.tolist)
B = sz_data.groupby("region")["human"].apply(pd.Series.tolist)

all_fours = set(A[A.apply(len) == 4].index) & set(B[B.apply(len) == 4].index)

A = (
    healthy_data[healthy_data.region.isin(all_fours)]
    .groupby("region")
    .apply(pd.Series.tolist)
)

In [None]:
A = np.array(A.sort_index().tolist())[:, :, 3:].astype(float)

B = sz_data[sz_data.region.isin(all_fours)].groupby("region").apply(pd.Series.tolist)
B = np.array(B.sort_index().tolist())[:, :, 3:].astype(float)

stattest = ttest_ind(A, B, axis=1)
p_vals_corrected = multipletests(
    stattest.pvalue.reshape(
        -1,
    ),
    alpha=0.05,
    method="holm-sidak",
)
p_vals_corrected = p_vals_corrected[1].reshape(52, -1)
p_vals_corrected = pd.DataFrame(p_vals_corrected, index=list(all_fours))
lipids = p_vals_corrected[p_vals_corrected < 0.05].stack()
lipids

In [None]:
A.shape