In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import configparser
import csv
import json
import os
import zipfile

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import numba
import pandas as pd
from community import community_louvain
from dwave.cloud import Client
from dwave.cloud.config import get_configfile_paths
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score

import algorithm.kcomm.graph_kClusterAlgorithm_functions as QCD
import algorithm.kcomm.graph_kClusterAlgorithm_functions_optimized as QCD_optimized
import algorithm.kcomm.graphFileUtility_functions as GFU

In [None]:
# Export the D-Wave API token from the first D-Wave config file that provides a
# non-empty [defaults] token, so the solver client can pick it up from the
# DWAVE_API_TOKEN environment variable.
dwave_config_paths = get_configfile_paths(only_existing=False)

token_found = False
for config_path in dwave_config_paths:
    token = ""
    try:
        # ConfigParser.read() silently skips paths it cannot open, so candidate
        # locations that do not exist simply yield an empty configuration.
        config = configparser.ConfigParser()
        config.read(config_path)

        # Check if the 'defaults' section and 'token' exist
        if 'defaults' in config and 'token' in config['defaults']:
            # Strip surrounding whitespace so the exported value is exactly the
            # token (the original validated the stripped value but exported the raw one).
            token = config['defaults']['token'].strip()
    except (configparser.Error, OSError, UnicodeError) as e:
        # Malformed or unreadable file: report it and try the next candidate.
        print(f"Error reading {config_path}: {e}")
        continue

    if token:
        os.environ['DWAVE_API_TOKEN'] = token
        print(f"Set DWAVE_API_TOKEN from {config_path}")
        token_found = True
        break  # Exit the loop once a valid token is found

# If no valid token was found, print a message
if not token_found:
    print("You need a valid D-Wave config file with a non-empty token.")

In [None]:
# Initialize and authenticate the Kaggle API
api = KaggleApi()
api.authenticate()

In [None]:
# Location of the downloaded benchmark data.
# data_dir    -- directory the competition archive is downloaded into
# competition -- Kaggle competition slug; also the name of the archive / extraction folder
# data_path   -- data_dir/competition: "<data_path>.zip" is the archive and
#                data_path itself is where it gets extracted
data_dir = "../data"
competition = 'cm4ai-community-detection-benchmark'
data_path = os.path.join(data_dir, competition)
# Create the download directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(data_dir, exist_ok=True)


In [None]:
# Directory for generated artifacts; passed to downstream helpers via args_dict["output_dir"].
output_dir = "../output"
# exist_ok makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Download all files of the CM4AI community-detection benchmark competition
# into data_dir. force=True is passed on every run.
# NOTE(review): force=True presumably re-downloads even when the archive already
# exists — confirm this is intended, since it makes every full re-run hit the network.
# `competition` is re-assigned here to the same value as in the data-directory cell.
competition = 'cm4ai-community-detection-benchmark'
api.competition_download_files(competition, path=data_dir, force=True)

In [None]:
# Extract every file from the downloaded archive ("<data_path>.zip") into data_path.
zip_file_path = data_path + ".zip"
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_path)

In [None]:
# Run configuration shared by the cells below.
# "benchmark_gen" selects the graph to load; the loader cell handles
# 'karate', 'networkx', 'dynbench', 'football' and 'protein'.
args_dict = {
    "benchmark_gen":"protein",
    "output_dir" : output_dir,       # Where result files are written / read back from
    "beta0": 25,                     # A weight on a node's modularity (input to QCD.set_penalty_constant)
    "gamma0": -500,                  # Penalty for nodes being placed in multiple communities (input to QCD.set_penalty_constant)
    "threshold": 0.8,                # Passed to build_mod_resolution and makeQubo; exact semantics live in those helpers — TODO document
    "qsize": 64,                     # Passed to QCD.clusterHybrid and recorded as result['subqubo_size']
    "community_penalty_factor": 1,   # Penalty factor for large communities (passed to makeQubo)
    "resolution": 3                 # Higher modifies the modularity matrix to emphasize smaller communities
}


In [None]:
# Load the benchmark graph selected by args_dict["benchmark_gen"].
#
# Names produced for later cells:
#   run_profile, run_label       -- passed to QCD.clusterHybrid
#   G                            -- the NetworkX graph to cluster
#   gt_arr                       -- ground-truth community label per node ([] if unavailable)
#   gt_arr_named, gt_node_names  -- football-only extras ([] for every other benchmark)
run_profile = "defaults"

# Single definition of the extracted dataset directory, so individual file paths
# cannot drift apart (the dynbench ground-truth path previously pointed at a
# misspelled "scm4ai-..." directory and could never be opened).
benchmark_dir = "../data/cm4ai-community-detection-benchmark"

# Always define the ground-truth containers so later cells never hit a NameError
# or silently reuse values left over from a previous run with another benchmark.
gt_arr = []
gt_arr_named = []
gt_node_names = []

if args_dict["benchmark_gen"] == 'karate':

    print("Using benchmark graph generated by: nx-karate")

    run_label = "zacharys-karate-club"
    input_graph = "zacharys-karate-club"

    G = nx.karate_club_graph()

    # Binary ground truth from the 'club' node attribute: 0 for 'Mr. Hi', 1 otherwise.
    gt_arr = [0 if G.nodes[v]['club'] == 'Mr. Hi' else 1 for v in G.nodes()]

elif args_dict["benchmark_gen"] == 'networkx':

    print("Using benchmark graph generated by: networkx")

    run_label = "LFR_rs11_N1000_ad5_mc20_mu0.1"
    input_graph = f"{benchmark_dir}/{run_label}"

    # NOTE(review): read_edgelist without nodetype yields string node labels, while
    # the graph-export cell indexes gt_arr by node — confirm the two agree.
    G = nx.read_edgelist(f"{input_graph}.edgelist")

    df = pd.read_csv(f"{input_graph}_communities.csv")
    gt_dict = df.set_index('id')['solution'].to_dict()

    # Ground-truth labels ordered by ascending node id.
    sorted_by_keys = dict(sorted(gt_dict.items()))
    gt_arr = list(sorted_by_keys.values())

elif args_dict["benchmark_gen"] == 'dynbench':

    print("Using benchmark graph generated by: Dynbench")

    # n = Number of nodes per community
    # q = Number of communities
    run_label = "stdmerge-n32-q8-pout01.t00100"
    input_graph = f"{benchmark_dir}/{run_label}.graph"
    ground_truth_path = f"{benchmark_dir}/{run_label}.comms"

    edgelist = pd.read_csv(input_graph, sep=' ', names=["source","target"])
    G = nx.from_pandas_edgelist(edgelist)
    # The edge list is unweighted: give every edge an explicit unit weight.
    for source, target in G.edges():
        G[source][target]['weight'] = 1

    # Each non-comment line is space-separated; the second field is the community label.
    with open(ground_truth_path) as ground_truth_file:
        for line in ground_truth_file:
            if line.startswith("#"):
                continue
            fields = line.strip().split(" ")
            gt_arr.append(fields[1])

elif args_dict["benchmark_gen"] == 'football':

    print("Using benchmark graph generated by: Football")

    run_label = "football"
    input_graph = f"{benchmark_dir}/football_adjacency_matrix.csv"
    input_graph_named = f"{benchmark_dir}/football.gml"
    ground_truth_path = f"{benchmark_dir}/football_labels.csv"
    ground_truth_named_path = f"{benchmark_dir}/football_labels_named.csv"
    adj_matrix = np.loadtxt(input_graph, delimiter=",", skiprows=1)

    # Create a graph from the adjacency matrix.
    # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
    # replacement and is also available in NetworkX 2.x.
    G = nx.from_numpy_array(adj_matrix)

    # Integer community id: second column of the labels CSV (header skipped).
    with open(ground_truth_path) as ground_truth_file:
        reader = csv.reader(ground_truth_file)
        next(reader)  # Skip the header
        for row in reader:
            gt_arr.append(int(row[1]))

    # Named community label: second column of the named-labels CSV (header skipped).
    with open(ground_truth_named_path, "r") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header
        for row in reader:
            gt_arr_named.append(row[1])

    # Read in the named graph to get the node names
    G_named = nx.read_gml(input_graph_named)

    # (node id, 'name' attribute) pairs; only the node ids are kept below.
    node_data = [(node, data.get('name', '')) for node, data in G_named.nodes(data=True)]
    gt_node_names = [node_id for node_id, _name in node_data]

elif args_dict["benchmark_gen"] == 'protein':

    print("Using benchmark graph generated by: Protein")

    run_label = "protein"
    edge_list_path = f"{benchmark_dir}/quantum_ppi_cutoff_0.002.id.edgelist.tsv"

    # Create a graph from the tab-separated edge list (integer node ids).
    # No ground truth is loaded for this benchmark, so gt_arr stays empty.
    G = nx.read_edgelist(edge_list_path, delimiter="\t", nodetype=int)

else:
    # Fail loudly instead of leaving G undefined (or stale) for the cells below.
    raise ValueError(
        f"Unknown benchmark_gen {args_dict['benchmark_gen']!r}; expected one of "
        "'karate', 'networkx', 'dynbench', 'football', 'protein'."
    )


In [None]:
# Quick visual overview of the loaded graph: small grey nodes and thin edges.
nx.draw(
    G, 
    node_size=25,
    width=0.5,
    node_color="grey"
)

In [None]:
# Adjacency matrix and problem dimensions used to build the QUBO.
A = nx.adjacency_matrix(G)

num_nodes = nx.number_of_nodes(G)
num_edges = nx.number_of_edges(G)

# Only densify small graphs for display: converting a large sparse adjacency
# matrix to dense allocates O(N^2) memory just to print a truncated view.
DENSE_PRINT_MAX_NODES = 100
if num_nodes <= DENSE_PRINT_MAX_NODES:
    print ('\nAdjacency matrix:\n', A.todense())
else:
    print (f"\nAdjacency matrix: sparse {A.shape[0]} x {A.shape[1]} with {A.nnz} stored entries")

# Upper bound on the number of communities: the number of distinct ground-truth
# labels when available, otherwise args_dict["num_parts"] (default 40).
if len(gt_arr) != 0:
    num_parts = len(np.unique(gt_arr))
else:
    num_parts = args_dict.get("num_parts", 40)

num_blocks = num_parts
print (f"\nQuantum Community Detection: Up to {num_parts} communities")
print (f"Graph has {num_nodes} nodes and {num_edges} edges")

In [None]:
# Derive the three penalty constants for the QUBO from the graph size, the number
# of blocks and the configured beta0 / gamma0; they are passed to makeQubo below.
beta, gamma, GAMMA  = QCD.set_penalty_constant(num_nodes, num_blocks, args_dict["beta0"], args_dict["gamma0"])

In [None]:
# Build the modularity matrix from the adjacency matrix with the optimized,
# resolution-aware builder. `mtotal` and `modularity` are reused later to score
# the partition (QCD.calcModularityMetric).
# Previous non-resolution variant, kept for reference:
# mtotal, modularity = QCD.build_mod(A, args_dict["threshold"], num_edges)
mtotal, modularity = QCD_optimized.build_mod_resolution(A, args_dict["threshold"], num_edges, resolution=args_dict["resolution"])

print ("\nModularity matrix: \n", modularity)

# Value range of the matrix, printed next to the threshold for comparison.
print ("min value = ", modularity.min())
print ("max value = ", modularity.max())

print ("threshold = ", args_dict["threshold"])

In [None]:
# Display the resolution setting in effect before the QUBO is built.
args_dict['resolution']

In [None]:
# Build the QUBO from the (scaled) modularity matrix and the penalty constants.

# TODO: the resolution is currently applied twice — once inside
# build_mod_resolution (resolution=...) when the modularity matrix is built, and
# again here as a plain scalar multiplier. This second scaling predates the
# in-function parameter; test whether it actually improves ARI or whether the
# in-function resolution alone is sufficient, then drop one of the two.
modularity_scaled = args_dict["resolution"] * modularity

# Previous non-optimized call, kept for reference:
# Q = QCD.makeQubo(G, modularity, beta, gamma, GAMMA, num_nodes, num_parts, num_blocks, threshold)
Q = QCD_optimized.makeQubo(
    modularity_scaled, 
    beta, 
    gamma, 
    GAMMA, 
    num_nodes, 
    num_parts, 
    num_blocks, 
    args_dict["threshold"],
    args_dict["community_penalty_factor"]
)

In [None]:
# Summary of the problem instance; later cells pass this dict to the QCD helpers
# and add metric entries to it.
result = {
    'num_clusters': num_parts,
    'nodes': num_nodes,
    'edges': num_edges,
    'size': num_nodes * num_parts,
    'subqubo_size': args_dict["qsize"],
}
result

In [None]:
# Run k-clustering with Hybrid/D-Wave using ocean
ss = QCD.clusterHybrid(Q, num_parts, args_dict["qsize"], run_label, run_profile, result)
result

In [None]:
# Process solution
part_number = QCD.process_solution(ss, G, num_blocks, num_nodes, num_parts, result)

In [None]:
# Score the partition against the (unscaled) modularity matrix and store the
# value in `result` for the report cell below.
mmetric = QCD.calcModularityMetric(mtotal, modularity, part_number)
result['modularity_metric'] = mmetric

In [None]:
# draw graph clusters and save .png
GFU.showClusters(part_number, G, args_dict)

In [None]:
# Display the resolution setting again; it is part of the communities file name read back below.
args_dict['resolution']

In [None]:
# Inspect the node -> community assignment produced by QCD.process_solution.
part_number

In [None]:
# write comms file 
GFU.write_partFile(
    part_num=part_number, 
    Dim=num_nodes, 
    nparts=num_parts, 
    args_dict=args_dict
) 


In [None]:
# Annotate the graph with predicted and ground-truth communities and save it as
# GraphML. Skipped when no ground truth is available.
if len(gt_arr) != 0:
    # Add partition ID as a node attribute
    for node, cluster_id in part_number.items():
        G.nodes[node]['cluster_id'] = cluster_id

    # The named labels (gt_arr_named, gt_node_names) are only loaded for the
    # football benchmark. Referencing them for any other benchmark raised a
    # NameError, or silently reused stale values from an earlier run.
    has_named_labels = args_dict["benchmark_gen"] == 'football'

    # Add gt partition ID as attribute.
    # NOTE(review): gt_arr is indexed by node label, which assumes the labels are
    # the integers 0..N-1 in ground-truth order — confirm for each benchmark.
    for node in G.nodes:
        G.nodes[node]['gt_cluster_id'] = gt_arr[node]
        if has_named_labels:
            G.nodes[node]['gt_cluster_named'] = gt_arr_named[node]
            G.nodes[node]['gt_team_name'] = gt_node_names[node]

    # Save the annotated graph in GraphML format
    graph_path = os.path.join(args_dict['output_dir'], "graph_with_clusters_good_ARI.graphml")
    nx.write_graphml(G, graph_path)
else:
    print("No ground truth available for this graph. Skipping graph saving.")

In [None]:
# Rearrange node and community IDs for metric calculations
columns = ["node_id", "comm_id"]
communities = []

pred_arr=[]

comm_file_name = f"nparts_{num_parts}_resolution_{args_dict['resolution']}.txt"
comm_file_path = os.path.join(args_dict['output_dir'], comm_file_name)
with open(comm_file_path) as comm_file:
    i = 0
    for line in comm_file:
        i += 1
        if i == 1:
            continue
        fields = line.strip().split("  ")
        communities.append(fields)
        pred_arr.append(fields[1])

pred_arr = [int(x) for x in pred_arr]
pred_arr[:10]

In [None]:
# Report the quality of the D-Wave partition: community count, modularity and,
# when ground truth exists, agreement with it (ARI / AMI).
# Unused alternative modularity computation, kept for reference:
# modularity = nx.community.modularity(G, pred_arr)
print("Number of communities (DWave):", len(np.unique(pred_arr)))

print(f"Modularity: \t{round(result['modularity_metric'],4)}")

if len(gt_arr) != 0:
    # Calculate adjusted mutual information and adjusted rand index.
    # NOTE(review): both scores compare gt_arr and pred_arr position by position,
    # so the two lists must list nodes in the same order — confirm per benchmark.
    result['ari_score'] = adjusted_rand_score(gt_arr, pred_arr)
    result['ami_score'] = adjusted_mutual_info_score(gt_arr,pred_arr)

    print(f"ARI: \t\t{round(result['ari_score'],4)}")
    print(f"AMI: \t\t{round(result['ami_score'],4)}")
else:
    print("No ground truth available for this graph. Skipping ARI and AMI calculations.")


In [None]:
# --- Louvain Community Detection ---
# Perform Louvain community detection
louvain_partition = community_louvain.best_partition(G, resolution=1.5)
num_communities_louvain = len(set(louvain_partition.values()))
print("Number of communities (Louvain):", num_communities_louvain)

# --- Calculate Modularity ---
modularity_score = community_louvain.modularity(louvain_partition, G)
print(f"Modularity: \t{round(modularity_score,4)}")

louvain_pred_arr = [louvain_partition[node] for node in G.nodes]

if len(gt_arr) != 0:
    # --- Calculate ARI and AMI ---
    ari_score = adjusted_rand_score(gt_arr, louvain_pred_arr)
    ami_score = adjusted_mutual_info_score(gt_arr, louvain_pred_arr)
    print(f"ARI: \t\t{round(ari_score,4)}")
    print(f"AMI: \t\t{round(ami_score,4)}")
else:
    print("No ground truth available for this graph. Skipping Louvain ARI and AMI calculations.")

In [None]:
# Check the modularity score for the ground truth 
if len(gt_arr) != 0:
    # Convert gt_arr to a dictionary format for modularity calculation
    gt_partition = {node: community_id for node, community_id in enumerate(gt_arr)}

    # Calculate modularity using community_louvain.modularity
    modularity_score = community_louvain.modularity(gt_partition, G)
    print(f"Ground Truth Modularity: {round(modularity_score,4)}")
else:
    print("No ground truth available for this graph. Skipping ground truth modularity calculation.")