In [None]:
import os
import ast
import sys
import yaml
import pickle
import argparse
sys.path.append("./")
sys.path.append("../")

import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from DriverGenerater import getDriver_df
from src.utils.graph_ops import load_graph
from src.utils.sub_ops import make_bin_cols

In [None]:
# Build a `uniprot_id` -> `gene_first` lookup; the plotting cells use it to label
# proteins by gene name (falling back to the UniProt ID when there is no entry).
protein_dictionary = pd.read_csv('reference/protein_dictionary.csv')
node_to_hgnc = dict(zip(protein_dictionary['uniprot_id'], protein_dictionary['gene_first']))

In [None]:
# Read the YAML run configuration and expose its top-level keys as attributes
# (later cells read `config.Feature_PATH`).
ConfigPATH = '../config/run.yaml'
with open(ConfigPATH, 'r') as config_file:
    model_config = yaml.safe_load(config_file)

config = argparse.Namespace(**model_config)

In [None]:
# Load the full graph from the feature directory, then print its summary line and
# the number of connected components it contains.
FinalattSAVEPATH = os.path.join(
    config.Feature_PATH,
    'final_graph_with_topological_v021226.pkl',
)
G = load_graph(FinalattSAVEPATH)
print(G)
print(nx.number_connected_components(G))

In [None]:
# Node-level feature table; one row per node (see the `node_id` / `uniprot_id` columns used below).
feature_csv = os.path.join(config.Feature_PATH, "final_only_inter-chain_nodes_features_v021226.csv")
feature_df = pd.read_csv(feature_csv)
feature_df

# Split Graph Analysis

In [None]:
# Load the pre-split graph collections (train / val / test / augmented train).
#
# The previous version of this cell called `loadGraph`, a name that is neither defined
# nor imported in this notebook (only `load_graph` is imported), so it raised NameError
# on a fresh kernel. The files are read with `pickle` here, which is how the
# "Connected Components Analysis" section already reads the very same files.
#
# SECURITY: `pickle.load` can execute arbitrary code; only open files you produced yourself.
def _load_split_graphs(path):
    """Unpickle one split file and return whatever object it contains."""
    with open(path, 'rb') as split_file:
        return pickle.load(split_file)


trainG = _load_split_graphs('../DeepResidueCluster_train.pkl')
valG = _load_split_graphs('../DeepResidueCluster_val.pkl')
testG = _load_split_graphs('../DeepResidueCluster_test.pkl')
AugG = _load_split_graphs('../DeepResidueCluster_train_aug.pkl')

In [None]:
# Count the test graphs that contain at least one node whose `is_mut` attribute is non-zero.
# `any(...)` stops at the first such node, mirroring an early `break`.
cnt = sum(
    1
    for g in testG
    if any(val['is_mut'] != 0 for _, val in g.nodes(data=True))
)

print("Total Graph", len(testG))
print("Mutated Graph", cnt)

In [None]:
# 

# Graph Analysis

In [None]:
# Show the column index of the feature table for reference.
feature_df.columns

In [None]:
# Headline counts for the feature table: rows, distinct proteins, rows whose `node_id`
# contains a '-', distinct node ids, and rows flagged `is_mut == 1` (absolute and in percent).
n_nodes = len(feature_df)
n_unique_uniprot = len(feature_df['uniprot_id'].unique())
n_copy_nodes = len(feature_df[feature_df['node_id'].str.contains('-')])
n_unique_nodes = len(feature_df['node_id'].unique())
n_mut_nodes = feature_df[feature_df['is_mut']==1].shape[0]

print("Total Node:", n_nodes)
print("Unique UniProt ID:", n_unique_uniprot)
print("Copy Node:", n_copy_nodes)
print("Unique Node:", n_unique_nodes)
print("Mutation Node", n_mut_nodes, "||", round(n_mut_nodes/n_nodes*100, 2), "%")

In [None]:
# Keep only the rows flagged as mutated and report their share of all nodes and of all proteins.
mut_use_node_df = feature_df[feature_df['is_mut'] == 1]
print(len(mut_use_node_df))

node_share = (len(mut_use_node_df)/len(feature_df))*100
protein_share = (mut_use_node_df['uniprot_id'].nunique()/feature_df['uniprot_id'].nunique())*100
print("Node Info (%)", round(node_share, 2), '%')
print("Protein Info (%)", round(protein_share, 2), '%')

In [None]:
# Build `copy_node_dict`: uniprot_id -> (largest copy index seen for that protein + 1),
# with 1 for proteins that only appear in rows where `from_copy` is False.
# presumably this is the number of copies of the protein in the graph — confirm the
# `<id>-<index>` naming convention of `node_id` with whoever produced the feature table.

# `copy_incl_id` is the part of `node_id` before the first underscore.
feature_df['copy_incl_id'] = feature_df['node_id'].apply(lambda x: x.split('_')[0])

only_copy_df = feature_df[feature_df['from_copy']==True].copy()
copy_node_dict = {}
for row in only_copy_df.itertuples():
    # The integer after the first '-' in `copy_incl_id` is the copy index of this row.
    copy_count = int(row.copy_incl_id.split('-')[1]) + 1
    # FIX: the earlier loop compared the raw copy index against the stored value, which
    # already had +1 applied. With consecutive indices (e.g. 1 then 2) the test `2 > 2`
    # failed and the entry stayed one too low. Compare like with like and keep the maximum.
    if copy_count > copy_node_dict.get(row.uniprot_id, 0):
        copy_node_dict[row.uniprot_id] = copy_count

rm_copy_df = feature_df[feature_df['from_copy']==False].copy()
for uniprot in rm_copy_df['uniprot_id']:
    # Proteins never seen in a `from_copy` row get the default value 1.
    copy_node_dict.setdefault(uniprot, 1)

In [None]:
# Bar chart of the 50 UniProt IDs with the most rows in `feature_df`, labelled by gene name
# where `node_to_hgnc` has an entry (otherwise by the UniProt ID itself).
counts_total = feature_df['uniprot_id'].value_counts().head(50)

top50_ids = counts_total.index
top50_values = counts_total.values
top50_labels = [node_to_hgnc.get(idx, idx) for idx in top50_ids]

fig, ax = plt.subplots(figsize=(14, 6))
colors = plt.cm.turbo(np.linspace(0, 1, 50))

bars = ax.bar(top50_labels, top50_values, color=colors, edgecolor='black', linewidth=0.5)

# Write each bar's count just above it (offset = 1% of the tallest bar).
for bar in bars:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2,
        height + (max(top50_values) * 0.01),
        f'{int(height)}',
        ha='center', va='bottom', fontsize=8,
    )

ax.set_xlabel('Protein (Gene Name)', fontsize=11)
ax.set_ylabel('Total Node Count', fontsize=11)
ax.set_title('Top 50 Proteins by Total Node Frequency', fontsize=14, pad=20)

plt.xticks(rotation=45, ha='right', fontsize=9)
# Emphasise tick labels that start with H1/H2/H3/H4.
for label in ax.get_xticklabels():
    if label.get_text().startswith(('H1', 'H2', 'H3', 'H4')):
        label.set_fontweight('bold')
        label.set_color('darkblue')

ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Stacked bar chart of per-protein node counts divided by `copy_node_dict[uniprot_id]`,
# for the 50 proteins with the highest normalised mutated-node count.
counts_total = feature_df['uniprot_id'].value_counts()
counts_mut = mut_use_node_df['uniprot_id'].value_counts()

# Proteins absent from `counts_mut` come out as NaN after alignment; fillna(0) makes them 0.
df_norm = pd.DataFrame({'raw_total': counts_total, 'raw_mut': counts_mut}).fillna(0)

# Proteins missing from `copy_node_dict` are divided by 1 (left unchanged).
df_norm['copy_num'] = df_norm.index.map(lambda uid: copy_node_dict.get(uid, 1))

for kind in ('total', 'mut'):
    df_norm[f'norm_{kind}'] = df_norm[f'raw_{kind}'] / df_norm['copy_num']
df_norm['norm_other'] = df_norm['norm_total'] - df_norm['norm_mut']

top50_df = df_norm.sort_values(by='norm_mut', ascending=False).head(50)

top50_ids = top50_df.index
top50_labels = [node_to_hgnc.get(idx, idx) for idx in top50_ids]

norm_mut_values = top50_df['norm_mut'].values
norm_other_values = top50_df['norm_other'].values

fig, ax = plt.subplots(figsize=(12, 6))
colors = plt.cm.turbo(np.linspace(0, 1, 50))
bar_style = dict(color=colors, edgecolor='black', linewidth=0.5)

# Solid segment = mutated nodes; translucent segment stacked on top = the remaining nodes.
p1 = ax.bar(top50_labels, norm_mut_values, label='Mutation', **bar_style)
p2 = ax.bar(top50_labels, norm_other_values, bottom=norm_mut_values,
            alpha=0.3, label='Non-Mutation', **bar_style)

norm_mut_counts = top50_df['norm_mut'].values

# Annotate each protein with its normalised mutation count truncated to an integer;
# the text sits 5 data units above the top of the solid (mutation) segment.
for bar, mut_value, mut_count in zip(p1, norm_mut_values, norm_mut_counts):
    ax.text(
        bar.get_x() + bar.get_width()/2,
        mut_value + 5 + 0.01,
        f'{int(mut_count)}',
        ha='center', va='bottom', fontsize=8, fontweight='bold',
    )

ax.set_xlabel('Protein (Gene Name)', fontsize=11)
ax.set_ylabel('Normalized Count', fontsize=11)
plt.xticks(rotation=45, ha='right', fontsize=9)

# Emphasise tick labels that start with H1/H2/H3/H4.
for label in ax.get_xticklabels():
    if label.get_text().startswith(('H1', 'H2', 'H3', 'H4')):
        label.set_fontweight('bold')
        label.set_color('darkblue')

ax.legend(loc='upper right', frameon=True)
ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

# Node Features from Table

In [None]:
# Display the feature table again as a reference for the feature plots below.
feature_df

In [None]:
# Histogram of each selected feature column, one panel per feature on a 2x3 grid.
# Each title also shows how many distinct values the column holds.
use_feat_in_df1 = ['BHAR880101', 'CHOP780201', 'GRAR740102', 'JANJ780101', 'KLEP840101', 'KYTJ820101']

fig, axs = plt.subplots(2, 3, figsize=(15, 6))

# `axs.flat` walks the grid row by row, so feature i lands on panel (i // 3, i % 3).
for panel, feat in zip(axs.flat, use_feat_in_df1):
    column = feature_df[feat]
    panel.hist(column)
    panel.set_title(f'{feat} (Unique: {column.unique().shape[0]})')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Log-scaled histograms of the mutation-related feature columns, one panel per feature.
mut_use_feat_in_df1 = ['unique_patients_count', 'total_mutations_count', 'unique_mutation_types_count', 'DAYM780301_avg', 'HENS920102_avg']

# FIX: this cell used to read from `df1`, a name that is never defined in the visible
# notebook, so it raised NameError under Restart & Run All. The sibling list
# `use_feat_in_df1` is plotted from `feature_df`, so the same table is used here.
# NOTE(review): confirm that these five columns exist in `feature_df`.
fig, axs = plt.subplots(2, 3, figsize=(15, 6))
panels = axs.ravel()  # row-major: feature i lands on panel (i // 3, i % 3)

for panel, feat in zip(panels, mut_use_feat_in_df1):
    panel.hist(feature_df[feat])
    panel.set_title(f"{feat} (Unique: {feature_df[feat].nunique()})")
    panel.set_xlabel('Value')
    panel.set_yscale('log')
    panel.set_ylabel('Frequency (log)')

# Five features on a six-panel grid leave one empty axes; hide it instead of drawing a blank frame.
for panel in panels[len(mut_use_feat_in_df1):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Collect the node ids present in the feature table as a set for fast membership tests.
#
# FIX 1: this cell used to read from `df1`, a name that is never defined in the visible
# notebook (NameError on a fresh kernel); `feature_df` is used instead.
# NOTE(review): presumably `df1` was the earlier name of `feature_df` — confirm that a
# `copyindex` column exists in that table.
# FIX 2: `df.col.fillna(0, inplace=True)` is a chained in-place update; under pandas
# Copy-on-Write it does not write back to the parent frame. Assign the result explicitly.
feature_df['copyindex'] = feature_df['copyindex'].fillna(0)
nodes_in_df = feature_df['node_id'].values
target_set = set(nodes_in_df)

In [None]:
# Nodes that exist in the graph `G` but have no row in the feature table.
# FIX: `nodes_in_G` was never defined in the visible notebook (NameError on a fresh
# kernel); it is derived from `G` here.
# NOTE(review): assumes graph node labels use the same ids as the table's `node_id` column.
nodes_in_G = list(G.nodes())
skip_nodes = [n for n in nodes_in_G if n not in target_set]
skip_nodes

In [None]:
# Distinct `uniprot_id` values among the `df3` rows whose `node_id` contains a ';'.
# NOTE(review): `df3` is not defined anywhere in the visible notebook, so this cell fails
# on Restart & Run All — load or derive `df3` explicitly, or remove the cell.
df3[df3.node_id.str.contains(';')].uniprot_id.unique()

In [None]:
# Rows of `df3` whose `node_id` also appears in `df1`.
# NOTE(review): neither `df3` nor `df1` is defined in the visible notebook, so this cell
# fails on Restart & Run All — define both tables explicitly, or remove the cell.
df3[df3['node_id'].isin(df1['node_id'])]

# Connected Components Analysis

In [None]:
# Sizes of the connected components of `G`, largest first.
# Each component is yielded as a set of nodes, so its length is its node count.
component_sizes = [len(component) for component in nx.connected_components(G)]
num_node_in_originG = np.sort(component_sizes)[::-1]

In [None]:
# Unpickle the four split files into one dict keyed by split name.
# Key order (train, train_aug, val, test) fixes the panel order of the plots further down.
# NOTE: pickle.load can run arbitrary code; only open files from a trusted source.
cc_dict = dict.fromkeys(['train', 'train_aug', 'val', 'test'])

for name in cc_dict:
    with open(f'../DeepResidueCluster_{name}.pkl', 'rb') as f:
        cc_dict[name] = pickle.load(f)

In [None]:
# Peek at the first (node, attribute-dict) pair of the first training graph.
first_train_graph = cc_dict['train'][0]
list(first_train_graph.nodes(data=True))[0]

In [None]:
# For every split, record per graph (a) its node count and (b) the sum of its nodes'
# `is_mut` attribute, and print how many graphs have a positive sum.
num_dict = {'train': [], 'train_aug': [], 'val': [], 'test': []}
mut_dict = {'train': [], 'train_aug': [], 'val': [], 'test': []}
for name, val in cc_dict.items():
    cnt = 0

    for g in val:
        num_dict[name].append(g.number_of_nodes())
        # FIX: the helper `get_node_att_value` used here before is neither defined nor
        # imported in this notebook (NameError on a fresh kernel). networkx's own
        # accessor returns {node: value} for every node carrying the attribute.
        # NOTE(review): assumes the helper returned the per-node `is_mut` values — confirm.
        mut_cnt = sum(nx.get_node_attributes(g, 'is_mut').values())
        if mut_cnt > 0:
            cnt += 1

        mut_dict[name].append(mut_cnt)
    print(f"Total number of {name} graph included mutated Node", cnt)

## Mutation

In [None]:
# Histogram of per-graph mutation counts, one panel per split (log-scaled y axis).
fig, axs = plt.subplots(2, 2, figsize=(8, 5))
for panel, name in zip(axs.flat, mut_dict):
    mut_data = mut_dict[name]
    mut_data.sort()  # sorts the list stored in `mut_dict` in place
    # For splits whose name contains 'train', the two largest values are left out of the plot.
    shown = mut_data[::-1][2:] if 'train' in name else mut_data
    panel.hist(shown)
    panel.set_title(f'{name} ({len(mut_data)})')
    panel.set_xlabel('Mutation Count')
    panel.set_ylabel('Number of Graphs')
    panel.set_yscale('log')

plt.tight_layout()
plt.show()


## Number of Nodes

In [None]:
# Histogram of per-graph node counts, one panel per split (log-scaled y axis).
fig, axs = plt.subplots(2, 2, figsize=(8, 5))
for panel, name in zip(axs.flat, num_dict):
    mut_data = num_dict[name]
    mut_data.sort()  # sorts the list stored in `num_dict` in place
    # For splits whose name contains 'train', the two largest values are left out of the plot.
    shown = mut_data[::-1][2:] if 'train' in name else mut_data
    panel.hist(shown)
    panel.set_title(name)
    panel.set_xlabel('Node Count')
    panel.set_ylabel('Number of Graphs')
    panel.set_yscale('log')

plt.tight_layout()
plt.show()


# Cancer Driver

In [None]:
# Locations of the three reference files, keyed by the `reference_data` names
# passed to `getDriver_df` below.
MutaGenePATH = './reference/MutaGene_Benchmark.csv'
COSMICPATH = './reference/CosmicMutantExport.tsv.gz'
CHEMPATH = './reference/CHASMplus.xlsx'

PATHDict = dict(
    MutaGene=MutaGenePATH,
    COSMIC=COSMICPATH,
    ChemPlus=CHEMPATH,
)

In [None]:
# Build one driver table per reference source from the node feature table.
# FIX: the second argument used to be `feat_df`, a name that is never defined in this
# notebook (NameError on a fresh kernel); the feature table is called `feature_df`.
# Only the COSMIC call passes a score threshold (0.8); the other two pass `score_th=None`.
mutagene = getDriver_df(PATHDict, feature_df, score_th=None, reference_data='MutaGene')
cosmic = getDriver_df(PATHDict, feature_df, score_th=0.8, reference_data='COSMIC')
chemplus = getDriver_df(PATHDict, feature_df, score_th=None, reference_data='ChemPlus')

In [None]:
# Stack all three driver tables, keep the first row of every
# (position, residueType, node_id, mutability, is_driver) combination,
# then show how the `is_driver` labels are distributed.
dedup_cols = ['position', 'residueType', 'node_id', 'mutability', 'is_driver']
non_dup_df = pd.concat([cosmic, chemplus, mutagene], axis=0).drop_duplicates(subset=dedup_cols)
non_dup_df.is_driver.value_counts()

In [None]:
# Same de-duplication as for the combined table, restricted to the COSMIC and
# CHASMplus tables (the MutaGene table is left out of this one).
dedup_cols = ['position', 'residueType', 'node_id', 'mutability', 'is_driver']
trainable_driver_df = pd.concat([cosmic, chemplus], axis=0).drop_duplicates(subset=dedup_cols)
trainable_driver_df.is_driver.value_counts()