In [None]:
import pandas as pd
import networkx as nx
import json
import pickle

In [None]:
# GWAS association exports for schizophrenia (scz) and bipolar disorder (bpd),
# read from CSV files located relative to the notebook's working directory.
gwas_paths = {
    "scz": "../gwas/scz-2022-02-11.csv",
    "bpd": "../gwas/bpd-2022-02-11.csv",
}
sc_df = pd.read_csv(gwas_paths["scz"])
bp_df = pd.read_csv(gwas_paths["bpd"])

In [None]:
def _explode_mapped_genes(gwas_df):
    """Return one row per (association, mapped gene) pair.

    Rows whose "Mapped gene" is missing or equal to the placeholder "'-" are
    dropped; the remaining comma-separated gene strings are split on ", " and
    exploded so that every gene gets its own row. The previous row labels are
    kept as an "index" column by ``reset_index()``.
    """
    with_gene = gwas_df[gwas_df["Mapped gene"].notna()]
    with_gene = with_gene[with_gene["Mapped gene"] != "'-"].copy()
    with_gene["Mapped gene"] = with_gene["Mapped gene"].str.split(", ")
    return with_gene.explode("Mapped gene").reset_index()


sc_proc = _explode_mapped_genes(sc_df)
bp_proc = _explode_mapped_genes(bp_df)

In [None]:
def _pvalue_to_float(text):
    """Convert a p-value written as "<mantissa> x 10-<exponent>" to a float.

    The mantissa and exponent are both parsed with ``int``.
    NOTE(review): a non-integer mantissa (e.g. "1.5 x 10-8") would raise
    ValueError here -- confirm the input files never contain one.
    """
    parts = text.split(" x 10-")
    mantissa = int(parts[0])
    exponent = int(parts[1])
    return float(mantissa * 10**-exponent)


sc_proc["pval_num"] = sc_proc["P-value"].map(_pvalue_to_float)
bp_proc["pval_num"] = bp_proc["P-value"].map(_pvalue_to_float)

In [None]:
# Keep associations with p <= 5e-8 (the cutoff commonly used for genome-wide
# significance in GWAS). The cutoff is built with the same arithmetic as the
# parsed p-values so that a value of exactly "5 x 10-8" compares equal.
# NOTE(review): sc_sig / bp_sig are not referenced by any later cell in this
# notebook -- confirm whether the gene sets were meant to be built from them.
pval_cutoff = float(5 * 10**-8)
sc_sig = sc_proc.loc[sc_proc["pval_num"].le(pval_cutoff)]
bp_sig = bp_proc.loc[bp_proc["pval_num"].le(pval_cutoff)]

In [None]:
# Distinct mapped gene symbols per disorder.
# NOTE(review): these are taken from the unfiltered sc_proc / bp_proc frames,
# not from the p-value-filtered sc_sig / bp_sig -- confirm this is intended.
sc_geneset = set(sc_proc["Mapped gene"].unique())
bp_geneset = set(bp_proc["Mapped gene"].unique())

In [None]:
# Load the tab-separated BEL graph tables for schizophrenia and bipolar disorder.
sc_bel_df = pd.read_table("../bel_graphs/sc_bel_extend.tsv")
bp_bel_df = pd.read_table("../bel_graphs/bp_bel_extend.tsv")

# Stack both tables into one combined frame. DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, so pd.concat is used instead; like append's
# default, the original row labels are kept (no ignore_index) and the result
# is a new frame independent of the two inputs.
scbp_bel_df = pd.concat([sc_bel_df, bp_bel_df])

In [None]:
def _genes_in_kg(genes, kg_df):
    """Return the genes whose symbol occurs in ``kg_df["source"]`` or ``kg_df["target"]``.

    Parameters
    ----------
    genes : iterable of str
        Gene symbols to look up.
    kg_df : pandas.DataFrame
        Table with string columns "source" and "target".

    Returns
    -------
    list of str
        Matching symbols, each listed once, in sorted order so that the result
        (and anything written from it) is the same on every run.

    Notes
    -----
    * ``regex=False``: the symbol is matched literally, so characters such as
      "." or "(" in a symbol are not interpreted as regular-expression syntax.
    * ``na=False``: missing cells count as "no match". Without it the result
      contains NaN, which is truthy, and every gene would be reported as found.
    * NOTE(review): this is a substring match, so a short symbol also matches
      any longer node text that contains it -- confirm that is intended.
    """
    found = []
    for gene in sorted(genes):
        in_source = kg_df["source"].str.contains(gene, regex=False, na=False).any()
        # Only scan the target column when the source column had no hit.
        if in_source or kg_df["target"].str.contains(gene, regex=False, na=False).any():
            found.append(gene)
    return found


# Genes reported for both disorders; each KG is checked against this same set.
shared_genes = sc_geneset.intersection(bp_geneset)

sc_kg_set = _genes_in_kg(shared_genes, sc_bel_df)
bp_kg_set = _genes_in_kg(shared_genes, bp_bel_df)
scbp_kg_set = _genes_in_kg(shared_genes, scbp_bel_df)

# Counts are kept for backward compatibility; they equal the list lengths.
sc_count = len(sc_kg_set)
bp_count = len(bp_kg_set)
scbp_count = len(scbp_kg_set)

In [None]:
# Write each gene list to its own JSON file (a JSON array of symbols) in the
# current working directory.
for out_name, gene_list in (
    ("scz_gwas_geneset.json", sc_kg_set),
    ("bpd_gwas_geneset.json", bp_kg_set),
    ("scz_bpd_gwas_geneset.json", scbp_kg_set),
):
    with open(out_name, "w") as handle:
        json.dump(list(gene_list), handle)

In [None]:
from scipy.stats import fisher_exact

# Gene-set partitions used to fill the 2x2 contingency table.
shared_genes_gwas = sc_geneset & bp_geneset
sc_only_genes = sc_geneset - bp_geneset
bp_only_genes = bp_geneset - sc_geneset
all_gwas_genes = sc_geneset | bp_geneset

# NOTE(review): the bottom-right cell is the size of the union of both sets.
# In a standard 2x2 overlap test that cell is the number of genes in NEITHER
# set (background size minus the union); no background gene universe is
# defined in this notebook, so the value is left unchanged -- please confirm.
fisher_table = [
    [len(shared_genes_gwas), len(sc_only_genes)],
    [len(bp_only_genes), len(all_gwas_genes)],
]

oddsratio, pvalue = fisher_exact(fisher_table)

In [None]:
# Display the p-value returned by fisher_exact for the 2x2 table built above.
pvalue

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [None]:
# Two-set Venn diagram of the gene lists found in the SCZ and BPD graphs.
v = venn2([set(sc_kg_set), set(bp_kg_set)], set_labels=('SCZ Overlap', 'BPD Overlap'))

# Region ids: '10' = first set only, '11' = both sets, '01' = second set only.
region_colors = {'10': '#e51e25', '11': '#35b44a', '01': '#1cade4'}
for region_id, color in region_colors.items():
    # matplotlib_venn returns None for a region with no members (e.g. when one
    # set is contained in the other), so guard before styling to avoid
    # AttributeError on an empty region.
    patch = v.get_patch_by_id(region_id)
    if patch is not None:
        patch.set_alpha(0.45)
        patch.set_color(color)

    count_label = v.get_label_by_id(region_id)
    if count_label is not None:
        count_label.set_family("sans-serif")
        count_label.set_fontsize(18)

# Set-name labels ('A' = first set, 'B' = second set): font and manual position.
set_label_positions = {'A': (-0.63, -0.25), 'B': (0.55, -0.25)}
for set_id, (label_x, label_y) in set_label_positions.items():
    set_label = v.get_label_by_id(set_id)
    if set_label is not None:
        set_label.set_family("sans-serif")
        set_label.set_fontsize(18)
        set_label.set_x(label_x)
        set_label.set_y(label_y)

plt.title("Overlap between genes in schizophrenia and bipolar disorder KGs", y=1.1, fontsize=20)

plt.savefig("sc_bp_venn.png", bbox_inches="tight")