In [None]:
import networkx as nx
import pandas as pd
import pybel
import os
from tqdm import tqdm
from pybel.dsl import BaseConcept, CentralDogma

In [None]:
def _bel_term_label(bel_terms):
    """Pull the human-readable label out of a column of BEL term strings.

    Each string has ``UNKNOWN:`` rewritten to ``"! "`` and all double quotes
    removed; the label is then whatever lies between the first ``"! "`` and
    the last closing parenthesis. Strings without that shape yield NaN.
    """
    unquoted = bel_terms.str.replace("UNKNOWN:", "! ").str.replace('"', '')
    captured = unquoted.str.extract(r'(?<=!\ )([\s\S]*)(?=\))')
    return captured[0]


# Automatically extracted BEL statements (tab-separated).
auto_bel_df = pd.read_table("../auto_bel.tsv")

# Add lower-case edge-list columns alongside the raw Subject/Relation/Object ones.
auto_bel_df["source"] = _bel_term_label(auto_bel_df["Subject"])
auto_bel_df["relation"] = auto_bel_df["Relation"]
auto_bel_df["target"] = _bel_term_label(auto_bel_df["Object"])

In [None]:
# An edge belongs to a disorder when the disorder is named on either end.
edge_endpoints = auto_bel_df[["source", "target"]]

touches_schizophrenia = edge_endpoints.eq("Schizophrenia").any(axis=1)
touches_bipolar = edge_endpoints.eq("Bipolar disorder").any(axis=1)

sc_only = auto_bel_df[touches_schizophrenia].copy()
bp_only = auto_bel_df[touches_bipolar].copy()

In [None]:
# Row labels present in both disorder-specific selections.
shared_index = sc_only.index.intersection(bp_only.index)
intersection = sc_only.loc[shared_index, :].copy()

# Keep only the rows that are exclusive to each disorder.
sc_exclusive_index = sc_only.index.difference(intersection.index)
bp_exclusive_index = bp_only.index.difference(intersection.index)
sc_only = sc_only.loc[sc_exclusive_index, :].copy()
bp_only = bp_only.loc[bp_exclusive_index, :].copy()

# Tag both frames as coming from the automatic extraction.
for exclusive_frame in (sc_only, bp_only):
    exclusive_frame["Data_Source"] = "Auto"

In [None]:
# Rows of the automatic extraction that are not exclusive to one disorder:
# everything whose index label is in neither sc_only nor bp_only.
disorder_specific_index = sc_only.index.union(bp_only.index)
common_df = auto_bel_df.loc[
    auto_bel_df.index.difference(disorder_specific_index), :
].copy()

# The rows shared by both disorders (`intersection`) must end up in common_df
# exactly once. Once they have been removed from sc_only/bp_only they are
# already part of the selection above, so blindly appending them again would
# duplicate every such edge. Only add the ones that are still missing.
missing_shared_index = intersection.index.difference(common_df.index)
if len(missing_shared_index) > 0:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    common_df = pd.concat([common_df, intersection.loc[missing_shared_index, :]])

common_df["Data_Source"] = "Auto"

In [None]:
# Base directory for locating the BEL folders: the current working directory.
# (The previous spelling took the dirname of the *string* '__file__', which is
# empty, so it also resolved to the working directory.)
HERE = os.path.abspath(os.curdir)

def get_cached_bel_files(directory):
    """Parse every ``.bel`` script in a folder and merge them into one graph.

    :param directory: Folder name, resolved against the parent directory of
        ``HERE``.
    :return: The result of ``pybel.union`` over the graphs parsed from each
        file whose name ends in ``.bel``; other files are skipped.
    """
    bel_dir = os.path.abspath(os.path.join(HERE, os.pardir, directory))

    # The progress bar runs over every directory entry, not just BEL scripts.
    parsed_graphs = [
        pybel.from_bel_script(
            os.path.join(bel_dir, entry),
            no_identifier_validation=True,
            allow_naked_names=True,
            allow_definition_failures=True,
        )
        for entry in tqdm(os.listdir(bel_dir))
        if entry.endswith(".bel")
    ]

    return pybel.union(parsed_graphs)

In [None]:
DATASET = "schizophrenia"
OUTPUT_NAME = "sc_bel"

# Merge all BEL scripts of this dataset into a single graph.
bel_kg = get_cached_bel_files(directory=DATASET)

# Export the complete graph before any node filtering takes place.
pybel.to_csv(bel_kg, f"./{OUTPUT_NAME}.tsv", sep='\t')

# Predicate selecting every node that is NOT in the HGNC namespace; the nodes
# it selects are then removed from bel_kg.
is_hgnc = pybel.struct.filters.namespace_inclusion_builder("HGNC")
not_hgnc = pybel.struct.filters.invert_node_predicate(is_hgnc)
pybel.struct.mutation.deletion.remove_filtered_nodes(bel_kg, node_predicates=[not_hgnc])

# Flatten the remaining graph into an edge table and replace each node object
# by its `name` attribute.
df = nx.to_pandas_edgelist(bel_kg, source='source', target='target')
for endpoint in ('source', 'target'):
    df[endpoint] = df[endpoint].apply(lambda node: node.name)

edge_columns = ['source', 'relation', 'target']
df = df[edge_columns]
df.to_csv(f"../bel_graphs/processed_{OUTPUT_NAME}.tsv", sep='\t', index=False)

In [None]:
# Manually curated schizophrenia edges. Read them from ../bel_graphs/, which is
# where the export cell writes processed_sc_bel.tsv (the bare file name only
# resolved when the working directory happened to be bel_graphs itself).
sc_bel_df = pd.read_table("../bel_graphs/processed_sc_bel.tsv")
sc_bel_df["Data_Source"] = "Manual"

# Stack manual edges, the disorder-independent automatic edges and the
# schizophrenia-only automatic edges. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
sc_bel_df = pd.concat([sc_bel_df, common_df, sc_only], ignore_index=True)

sc_bel_df.to_csv("../bel_graphs/schizophrenia_kg.tsv", sep='\t', index=False)

In [None]:
DATASET = "bipolar_disorder"
OUTPUT_NAME = "bp_bel"

# Merge all BEL scripts of this dataset into a single graph.
bel_kg = get_cached_bel_files(directory=DATASET)

# Export the complete graph before any node filtering takes place.
pybel.to_csv(bel_kg, f"./{OUTPUT_NAME}.tsv", sep='\t')

# Predicate selecting every node that is NOT in the HGNC namespace; the nodes
# it selects are then removed from bel_kg.
is_hgnc = pybel.struct.filters.namespace_inclusion_builder("HGNC")
not_hgnc = pybel.struct.filters.invert_node_predicate(is_hgnc)
pybel.struct.mutation.deletion.remove_filtered_nodes(bel_kg, node_predicates=[not_hgnc])

# Flatten the remaining graph into an edge table and replace each node object
# by its `name` attribute.
df = nx.to_pandas_edgelist(bel_kg, source='source', target='target')
for endpoint in ('source', 'target'):
    df[endpoint] = df[endpoint].apply(lambda node: node.name)

edge_columns = ['source', 'relation', 'target']
df = df[edge_columns]
df.to_csv(f"../bel_graphs/processed_{OUTPUT_NAME}.tsv", sep='\t', index=False)

In [None]:
# Manually curated bipolar-disorder edges. Read them from ../bel_graphs/, which
# is where the export cell writes processed_bp_bel.tsv (the bare file name only
# resolved when the working directory happened to be bel_graphs itself).
bp_bel_df = pd.read_table("../bel_graphs/processed_bp_bel.tsv")
bp_bel_df["Data_Source"] = "Manual"

# Stack manual edges, the disorder-independent automatic edges and the
# bipolar-only automatic edges. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
bp_bel_df = pd.concat([bp_bel_df, common_df, bp_only], ignore_index=True)

bp_bel_df.to_csv("../bel_graphs/bipolar_disorder_kg.tsv", sep='\t', index=False)

In [None]:
# Automatic edges whose source or target label contains "diabetes".
# na=False treats rows whose label could not be extracted (NaN) as non-matches,
# so the mask is always strictly boolean.
# NOTE(review): the match is case-sensitive, so labels spelled "Diabetes ..."
# are not selected — confirm this is intended.
mentions_diabetes = (
    auto_bel_df["source"].str.contains("diabetes", na=False)
    | auto_bel_df["target"].str.contains("diabetes", na=False)
)
auto_diabetes = auto_bel_df[mentions_diabetes].copy()
auto_diabetes["Data_Source"] = "Auto"

# Manually curated diabetes edges.
diabetes_df = pd.read_table("../../bel_processing/bel_graphs/processed_diabetes_bel.tsv")
diabetes_df["Data_Source"] = "Manual"

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
diabetes_df = pd.concat([diabetes_df, auto_diabetes], ignore_index=True)
diabetes_df.to_csv("../bel_graphs/t2dm_kg.tsv", sep='\t', index=False)