In [None]:
import os
import sys
sys.path.extend(["./", "../"])

import yaml
import pickle
import argparse

import pprint
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from src.utils.graph_ops import load_graph

In [None]:
# Load the run configuration from YAML and expose it with attribute-style
# access (e.g. config.Feature_PATH, which the path cells below rely on).
ConfigPATH = '../config/run.yaml'
# Explicit UTF-8 so parsing does not depend on the platform's locale encoding
# (the file writes further down in this notebook already pass encoding="utf-8").
with open(ConfigPATH, 'r', encoding='utf-8') as f:
    model_config = yaml.safe_load(f)
# safe_load returns None for an empty file and a list/scalar for non-mapping
# YAML; fail here with a clear message instead of an opaque TypeError from
# Namespace(**model_config).
if not isinstance(model_config, dict):
    raise ValueError(
        f"{ConfigPATH} must contain a top-level mapping, "
        f"got {type(model_config).__name__}"
    )
config = argparse.Namespace(**model_config)

# New DataFrame (12/18)

In [None]:
from GraphGenerator import *

In [None]:
# Working directories, all expressed relative to config.Feature_PATH:
# the 021326 data directory plus its processing/ and final/ subdirectories.
DATAPATH, SAVEPATH, FINALPATH = (
    os.path.join(config.Feature_PATH, relative_dir)
    for relative_dir in ('../021326', '../021326/processing', '../021326/final')
)

In [None]:
# Reference Data
incl_uniprot = pd.read_csv(os.path.join(DATAPATH, 'nucleolus_pt.csv'))
excl_pdb = pd.read_csv(os.path.join(DATAPATH, 'invalid_structure_ids.csv'))
bmr_df = pd.read_csv(os.path.join(DATAPATH, 'node_mutation_with_BMR_v021126.csv'))

In [None]:
# IDs to strip out as UBQ. Each base ID is listed twice: as-is, and with a
# trailing underscore (the underscore form is there to remove homodimer nodes).
remove_ubq_pid = [
    base_pid + suffix
    for suffix in ('', '_')
    for base_pid in ('p0cg47', 'p62979', 'p0cg48', 'p62987')
]

## Step 1: Remove UBQ and keep only nucleosome-related nodes

In [None]:
# new_df = pd.read_csv(os.path.join(DATAPATH, 'human_aa_aa_edges_corrected.csv'))

In [None]:
# new_df = remove_ubq_related_connection(new_df, remove_ubq_pid)
# new_df = generate_nodeid_and_only_uniprot(new_df)
# new_df = filter_only_nucleosome_related_connection(new_df, excl_pdb, incl_uniprot)

In [None]:
# new_df[new_df.nodeid_1.str.contains('-')].head(3)

In [None]:
# new_df.to_csv(os.path.join(SAVEPATH, 'step1_human_aa_edges_exclubq_Nucleosome_related_data.csv'), index=False)

## Step 2: Remove invalid (zero/negative) positions and match CDS context

In [None]:
from CDSLoader import build_node_context_df

In [None]:
# Reload the Step 1 result from SAVEPATH (the file the commented-out Step 1
# cells write), so Step 2 can start without re-running Step 1.
new_df = pd.read_csv(os.path.join(SAVEPATH, 'step1_human_aa_edges_exclubq_Nucleosome_related_data.csv'))

In [None]:
# The helper is not defined in this notebook; presumably it comes from the
# `GraphGenerator` star import. Judging by its name it drops entries whose
# position is <= 0 — TODO confirm against GraphGenerator.
new_df = remove_negative_and_zero_position_node(new_df)

### Temporary section: reuse the past BMR

In [None]:
# past_df = pd.read_csv(os.path.join(DATAPATH, 'cleaned_human_aa_aa_edges_exclubq_apply_transcript_v120825.csv'))
# past_df = filter_only_nucleosome_related_connection(past_df, excl_pdb, incl_uniprot)
# past_df = remove_ubq_related_connection(past_df, remove_ubq_pid)
# past_df.to_csv(os.path.join(SAVEPATH, 'step1_past_df_exclubq_Nucleosome_related_data.csv'), index=False)

In [None]:
# Past edge table: read the cached Step 1 output and run it through the same
# position filter that is applied to new_df.
past_df = (
    pd.read_csv(os.path.join(SAVEPATH, 'step1_past_df_exclubq_Nucleosome_related_data.csv'))
    .pipe(remove_negative_and_zero_position_node)
)

In [None]:
# Compare the IDs found in the past and the new edge tables.
past_pid = get_unique_pid_from_edge_df(past_df)
new_pid = get_unique_pid_from_edge_df(new_df)
# Union of both ID collections (kept as a set).
total_pid = set(past_pid).union(set(new_pid))
# IDs that appear only in the new table. sorted() rather than list(): the
# iteration order of a set of strings changes between kernel restarts (hash
# randomisation), which made the saved ID file, the [:10] preview and the
# downstream build order differ from run to run.
# Assumes the IDs are mutually comparable (e.g. all strings) — TODO confirm.
unique_new_elements = sorted(set(new_pid) - set(past_pid))
print("Past", len(past_pid), "||", "New", len(new_pid))
print("Unique New Elements", len(unique_new_elements))

In [None]:
# Dump the new-only IDs as an importable Python module of the form
# `NODE_IDS = [...]`, written into the current working directory.
file_name = "saved_new_node_ids_for_cds.py"
variable_name = "NODE_IDS"

with open(file_name, "w", encoding="utf-8") as f:
    # pformat + "\n" is exactly what pprint.pprint would have sent to the stream.
    f.write(
        f"{variable_name} = "
        + pprint.pformat(unique_new_elements, width=80, compact=True)
        + "\n"
    )

In [None]:
# Dump every ID (past and new) as an importable Python module of the form
# `NODE_IDS = [...]`, written into the current working directory.
file_name = "saved_all_node_ids_for_cds.py"
variable_name = "NODE_IDS"

with open(file_name, "w", encoding="utf-8") as f:
    f.write(f"{variable_name} = ")
    # total_pid is a set; pprint would emit a set literal ({...}), whereas
    # saved_new_node_ids_for_cds.py stores a list under the same NODE_IDS name.
    # Writing a sorted list keeps the two files interchangeable for consumers.
    # Assumes the IDs are mutually comparable (e.g. all strings) — TODO confirm.
    pprint.pprint(sorted(total_pid), stream=f, width=80, compact=True)

In [None]:
# Sanity check: peek at the first ten new-only IDs.
unique_new_elements[:10]

In [None]:
# autoreload mode 2 reloads imported modules before each cell executes, so
# edits to CDSLoader are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Same import as at the start of Step 2, repeated after enabling autoreload.
from CDSLoader import build_node_context_df

In [None]:
# Build the context table for the new-only IDs. build_node_context_df is
# imported from CDSLoader; its output schema is not visible in this notebook.
new_bmr_df = build_node_context_df(unique_new_elements)

In [None]:
# Column names, dtypes and non-null counts of the freshly built table.
new_bmr_df.info()

In [None]:
# Rich display of the table for visual inspection.
new_bmr_df

### Real BMR

# Past

In [None]:
# Load the cleaned edge table (v021226) from Feature_PATH/reference and
# display it. The columns used later in this notebook are nodeid_1, nodeid_2
# and cleaned_total_energy.
edge_df = pd.read_csv(f'{config.Feature_PATH}/reference/cleaned_edge_energy_df_related_nucleosome_v021226.csv')
edge_df

In [None]:
# List the available columns of the edge table.
edge_df.columns

In [None]:
# Load the merged node-feature table (v021226) from Feature_PATH/reference.
node_w_feature_df = pd.read_csv(f'{config.Feature_PATH}/reference/merged_features_v021226.csv')
# Its node_id column defines the usable node set (built in the next cell).

In [None]:
# Node IDs that have a row in the feature table; used to restrict the edges.
usable_nodes = list(node_w_feature_df['node_id'])

In [None]:
# Keep an edge only when both of its endpoints are usable nodes.
edge_df = edge_df[edge_df[['nodeid_1', 'nodeid_2']].isin(usable_nodes).all(axis=1)]

In [None]:
# Inspect the edge table after restricting it to usable nodes.
edge_df

# Graph Generation

In [None]:
# Build the graph from the filtered edge table: nodeid_1 / nodeid_2 are the
# endpoints, and cleaned_total_energy is stored on each edge as its weight.
G = nx.from_pandas_edgelist(
    edge_df,
    'nodeid_1',
    'nodeid_2',
    edge_attr=['cleaned_total_energy'],
)
# Summary of the resulting graph.
print(G)

In [None]:
# Pickle the weighted graph into Feature_PATH/reference.
# NOTE(review): the file name spells "grpah"; it is left untouched because it
# is a runtime path that other code may open by this exact name — confirm
# before renaming.
GraphSAVE = os.path.join(config.Feature_PATH, 'reference/weighted_inter-chain_grpah_nucleosome_v021226.pkl')

with open(GraphSAVE, 'wb') as f:
    pickle.dump(G, f)

## [Action required: run attProcessing first] Merge topological features into the graph

In [None]:
# Pickle file name -> name under which its node attribute is stored on G.
attr_mapping = dict([
    ('graph_with_betweenness.pkl', 'betweenness'),
    ('graph_with_pagerank.pkl', 'pagerank'),
    ('graph_with_closeness.pkl', 'closeness'),
    ('graph_with_degree.pkl', 'degree'),
    ('graph_with_shortest_path_length_per_node.pkl', 'shortest_path'),
])

In [None]:
# Copy the node attributes stored in each per-metric pickle under
# Feature_PATH/graph_att onto G, renaming them according to attr_mapping.
attSAVEPATH = os.path.join(config.Feature_PATH, 'graph_att')

# sorted() only makes the processing (and message) order deterministic.
for file_name in sorted(os.listdir(attSAVEPATH)):
    if not file_name.endswith('.pkl'):
        continue
    # A stray .pkl that is not listed in attr_mapping used to abort the whole
    # merge with a KeyError; report it and carry on with the known files.
    if file_name not in attr_mapping:
        print(f"Skipping {file_name}: no entry in attr_mapping")
        continue

    # NOTE(security): pickle.load can execute arbitrary code — only point this
    # at files produced by the project's own pipeline.
    with open(os.path.join(attSAVEPATH, file_name), 'rb') as f:
        temp_G = pickle.load(f)

    # An empty graph has no first node to inspect (next() would raise
    # StopIteration), and nothing to copy.
    if temp_G.number_of_nodes() == 0:
        print(f"Skipping {file_name}: graph has no nodes")
        continue

    # Attribute names are discovered from the first node only.
    first_node = next(iter(temp_G.nodes()))
    actual_attr_names = temp_G.nodes[first_node].keys()

    # NOTE(review): every attribute found is written under the same target
    # name, so if a pickle carries more than one node attribute the last one
    # wins — presumably each file holds exactly one; confirm.
    for attr_name in actual_attr_names:
        attr_values = nx.get_node_attributes(temp_G, attr_name)
        nx.set_node_attributes(G, attr_values, name=attr_mapping[file_name])

In [None]:
# Spot-check the merge: show which attributes the first node of G now
# carries, followed by the graph summary.
first_node = next(iter(G))
actual_attr_names = G.nodes[first_node].keys()
print(actual_attr_names, G, sep='\n')

In [None]:
# Pickle G, which at this point carries the merged topological node attributes.
# NOTE(review): this is written directly under Feature_PATH, whereas the
# un-annotated graph went to reference/ and FINALPATH is not used in this
# section — confirm the intended location.
FinalattSAVEPATH = os.path.join(config.Feature_PATH, 'final_graph_with_topological_v021226.pkl')
with open(FinalattSAVEPATH, 'wb') as f:
    pickle.dump(G, f)

# Graph EDA

In [None]:
# Re-declare the path and reload the graph from disk so the EDA section can be
# run without re-executing the graph-construction cells. load_graph is
# imported from src.utils.graph_ops.
FinalattSAVEPATH = os.path.join(config.Feature_PATH, 'final_graph_with_topological_v021226.pkl')
G = load_graph(FinalattSAVEPATH)

In [None]:
# Sizes of all connected components, largest first; the printed number is the
# component count.
connected_components = list(nx.connected_components(G))
cc_sizes = sorted(map(len, connected_components), reverse=True)
print(len(cc_sizes))

# Histogram of component sizes with a log-scaled frequency axis.
plt.figure(figsize=(10, 4))
plt.hist(cc_sizes, bins=50, edgecolor='black', color='skyblue', alpha=0.7)

plt.title("Connected Component Size Distribution in Inter-chain Graph", fontsize=10)
plt.xlabel("Connected Component Size", fontsize=8)
plt.ylabel("Frequency (Log Scale)", fontsize=8)
plt.yscale('log')

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Second view of the component-size distribution without the largest
# components. cc_sizes is sorted in descending order, so slicing off the head
# drops the biggest ones (presumably so they do not stretch the x-axis).
N_LARGEST_EXCLUDED = 26  # how many of the largest components are left out

plt.figure(figsize=(10, 4))
plt.hist(cc_sizes[N_LARGEST_EXCLUDED:], bins=50, edgecolor='black', color='skyblue', alpha=0.7)

plt.xlabel("Connected Component Size", fontsize=8)
plt.ylabel("Frequency (Log Scale)", fontsize=8)
plt.yscale('log')
# Say in the title that this is a truncated view; it used to be identical to
# the title of the full-distribution plot.
plt.title(
    "Connected Component Size Distribution in Inter-chain Graph "
    f"(excluding the {N_LARGEST_EXCLUDED} largest)",
    fontsize=10,
)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()