In [None]:
import os
import sys
sys.path.append("../")

import ast
import yaml
import pickle
import argparse
import pandas as pd

import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

from analysis import get_gene_name, get_unique_uniprot, get_gene, get_counts
from vizualization import plot_uniprot_counts, pathogenicity_graph_viz

# Pathogenicity Enrichment Analysis

In [None]:
# Read the YAML run configuration and expose its top-level keys as
# attributes (config.<key>) instead of dictionary lookups.
ConfigPATH = '../config/run.yaml'
with open(ConfigPATH, mode='r') as config_file:
    model_config = yaml.safe_load(config_file)
config = argparse.Namespace(**model_config)

In [None]:
# Parameter tag that is embedded in the name of the results file to load.
param = 'e5i3t11'

# Node feature table, read from the configured feature directory.
node_info_df = pd.read_csv(
    os.path.join(config.Feature_PATH, 'merge_all_features_v021026.csv')
)

# Result table for the selected parameter tag, read from the configured
# save directory.
result_df = pd.read_csv(
    os.path.join(config.SAVEPATH, 'pathogenity', f'MCL_{param}_mut+res_nP1m_bin42.csv')
)

# Rows whose q-value is strictly below 0.05.
sig_df = result_df[result_df['q_value'].lt(0.05)]

In [None]:
# Build a uniprot_id -> gene_name lookup from the mapping table, then add a
# gene_name column to result_df by calling get_gene(row, gene_dict) per row.
gene_map_df = get_gene_name(result_df)
gene_dict = gene_map_df.set_index('uniprot_id')['gene_name'].to_dict()
result_df['gene_name'] = result_df.apply(get_gene, axis=1, args=(gene_dict,))

In [None]:
# Re-select the significant rows (q-value < 0.05) so that sig_df includes the
# gene_name column added to result_df, then print summary statistics.
significant_mask = result_df['q_value'] < 0.05
sig_df = result_df[significant_mask]
detected_pct = round((len(sig_df) / len(result_df)) * 100, 1)
print("Detected Clusters", len(sig_df), "||", detected_pct, '%')
print("Total Node in detected Clusters:", sig_df['cluster_size'].sum())
print("Unique UniProt ID in Detected Clusters:", len(get_unique_uniprot(sig_df)))

In [None]:
# get_counts presumably tallies occurrences keyed by the 'gene_name' column
# (TODO confirm against analysis.get_counts); the resulting mapping is then
# re-ordered by its values, largest first.
uniprot_cnt_dict = get_counts(sig_df, 'gene_name')
ranked_items = sorted(uniprot_cnt_dict.items(), key=lambda kv: kv[1], reverse=True)
sorted_dict = dict(ranked_items)

In [None]:
# Plot the count mapping (already sorted by descending count); top_n=20 is
# passed to the plotting helper — presumably limits the plot to 20 entries.
plot_uniprot_counts(sorted_dict, top_n=20)

In [None]:
# Two-column pathogenicity table. The explicit .copy() makes it an
# independent frame, so adding 'chain_id' below does not assign into a slice
# of node_info_df (which raises pandas' SettingWithCopyWarning).
node_pathg_df = node_info_df[['node_id', 'avg_am_pathogenicity']].copy()

# node_id -> avg_am_pathogenicity lookup.
pathg_dict = node_pathg_df.set_index('node_id').to_dict()['avg_am_pathogenicity']

# chain_id is the portion of node_id before the first underscore.
node_pathg_df['chain_id'] = node_pathg_df['node_id'].apply(lambda x: x.split("_")[0])

In [None]:
# Nodes whose avg_am_pathogenicity is at or above this value are counted.
# NOTE(review): 0.564 looks like the AlphaMissense "likely pathogenic"
# cutoff — confirm before relying on that interpretation.
PATHOGENICITY_THRESHOLD = 0.564

# The threshold filter does not depend on the cluster, so apply it once
# instead of re-evaluating it on every iteration.
pathogenic_node_df = node_pathg_df[
    node_pathg_df['avg_am_pathogenicity'] >= PATHOGENICITY_THRESHOLD
]

uniprots = []      # one "<chain_id> (<gene name>)" label per pathogenic node
pathg_nodes = 0    # running total of pathogenic nodes over all clusters
for nodes_repr in sig_df['nodes']:
    # Each 'nodes' cell holds a Python literal (parsed here) listing node ids.
    nodes_in_clusters = ast.literal_eval(nodes_repr)
    filtered_node = pathogenic_node_df[
        pathogenic_node_df['node_id'].isin(nodes_in_clusters)
    ]
    pathg_nodes += len(filtered_node)
    # Build the labels directly rather than assigning a new column onto the
    # filtered slice, which triggered a chained-assignment warning per cluster.
    # The gene lookup key is chain_id up to its first '_' and then first '-'.
    uniprots.extend(
        f"{chain_id} ({gene_dict[chain_id.split('_')[0].split('-')[0]]})"
        for chain_id in filtered_node['chain_id']
    )

# Number of pathogenic nodes per label (a node is counted once for every
# significant cluster that lists it).
uniprot_counts = dict(Counter(uniprots))

# Pathogenicity Visualization

In [None]:
# Load the pickled object into G.
# NOTE: pickle.load can execute arbitrary code; only use this with a file
# produced by a trusted source.
with open("../mergedG_btw+clos+deg+pgr+spl.pkl", mode='rb') as graph_file:
    G = pickle.load(graph_file)

In [None]:
# Significant rows ordered by ascending observed_mean.
ordered_sig_df = sig_df.sort_values('observed_mean', ascending=True)

In [None]:
# Draw the significant clusters on the loaded graph. saveFileName=None is
# passed as-is — presumably this means the figure is not written to disk
# (TODO confirm against vizualization.pathogenicity_graph_viz).
pathogenicity_graph_viz(
    G,
    ordered_sig_df,
    gene_dict,
    min_prot=3,
    limit_n_clusters=20,
    saveFileName=None,
)

# Partitioning Results