In [1]:
import json
import os
import zipfile

import networkx as nx
import numpy as np
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score

import algorithm.kcomm.graph_kClusterAlgorithm_functions_optimized as QCD
import algorithm.kcomm.graphFileUtility_functions as GFU



In [None]:
# Create the Kaggle API client and authenticate it. The download and
# submission cells further down both go through `api`.
# NOTE(review): this cell has no execution count in the saved notebook even
# though later cells use `api` — run it first after a kernel restart.
# NOTE(review): credentials are presumably read from ~/.kaggle/kaggle.json or
# the KAGGLE_USERNAME / KAGGLE_KEY environment variables — confirm for your setup.
api = KaggleApi()
api.authenticate()



In [3]:
# Competition slug; it names both the Kaggle download and the local folder.
competition = 'cm4ai-community-detection-benchmark'

# Root folder for downloaded data (created on demand).
data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)

# <data_dir>/<competition>: the archive is saved as data_path + ".zip" and
# extracted into data_path by the cells below.
data_path = os.path.join(data_dir, competition)


In [4]:
# Directory that receives the files this notebook writes: it is handed to the
# GFU helpers through args_dict, and comm{num_parts}.txt and submission.csv
# are later read from / written to it.
output_dir = "../output"
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Download the competition's data archive into data_dir. `competition` is the
# slug defined once next to data_dir / data_path; it is deliberately not
# redefined here so the download, the extraction path and the submission all
# refer to the same value.
# NOTE(review): force=True presumably re-downloads even when the archive is
# already present — confirm, and drop it if repeated runs should reuse the file.
api.competition_download_files(competition, path=data_dir, force=True)

In [6]:
# Unpack the downloaded archive (<data_path>.zip) into the data_path folder.
zip_file_path = f"{data_path}.zip"
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=data_path)

In [7]:
# --- Run configuration -------------------------------------------------------
run_profile = "defaults"    # forwarded to QCD.clusterHybrid
qsize = 64                  # forwarded to QCD.clusterHybrid; stored as result['subqubo_size']
threshold = 0.2             # forwarded to QCD.build_mod and QCD.makeQubo
beta0, gamma0 = 5, -250     # base values handed to QCD.set_penalty_constant

# Options bundle passed to the GFU helpers (showClusters / write_partFile);
# 'output_dir' is also used below to locate the files they produce.
args_dict = dict(output_dir=output_dir)

In [8]:
# Load the benchmark graph for this run from the extracted competition data.
# The path is derived from data_path (rather than repeating the literal
# "../data/<competition>" string) so that changing data_dir or the competition
# slug in the configuration keeps this cell pointing at the right folder.
run_label = "graph1"
input_graph = os.path.join(data_path, run_label)

# <input_graph>.edgelist is parsed with networkx's default edge-list reader.
G = nx.read_edgelist(f"{input_graph}.edgelist")

In [9]:
# Adjacency matrix of G; it is densified only for the preview printout.
A = nx.adjacency_matrix(G)
print ('\nAdjacency matrix:\n', A.todense())

# TODO: What should we put for num_parts?
# A ground-truth-derived count, len(np.unique(gt_arr)), was considered, but
# gt_arr is not defined in the visible cells, so a fixed upper bound is used.
num_blocks = num_parts = 20

# Problem size, reported here and reused by the QUBO construction below.
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print (
    f"\nQuantum Community Detection: Up to {num_parts} communities",
    f"Graph has {num_nodes} nodes and {num_edges} edges",
    sep="\n",
)


Adjacency matrix:
 [[0 1 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]

Quantum Community Detection: Up to 20 communities
Graph has 500 nodes and 981 edges


  A = nx.adjacency_matrix(G)


In [10]:
# Derive the three constants (beta, gamma, GAMMA) from the problem size and
# the base values beta0 / gamma0 set in the configuration cell. All three are
# passed on to QCD.makeQubo below.
# NOTE(review): presumably these are the QUBO constraint penalty weights —
# confirm against QCD.set_penalty_constant.
beta, gamma, GAMMA  = QCD.set_penalty_constant(num_nodes, num_blocks, beta0, gamma0)

In [11]:
# Build the modularity matrix from the adjacency matrix; mtotal and modularity
# are both needed later by QCD.calcModularityMetric.
mtotal, modularity = QCD.build_mod(A, threshold, num_edges)
print ("\nModularity matrix: \n", modularity)

# Show the value range of the matrix, then the threshold for comparison.
for label, value in (
    ("min value = ", modularity.min()),
    ("max value = ", modularity.max()),
):
    print (label, value)

print ("threshold = ", threshold)


 Dim =  500

 Computing modularity matrix ...

Modularity matrix: 
 [[-8.355e-03  9.896e-01  9.958e-01 ... -4.178e-03 -2.089e-03 -2.089e-03]
 [ 9.896e-01 -1.305e-02 -5.222e-03 ... -5.222e-03 -2.611e-03 -2.611e-03]
 [ 9.958e-01 -5.222e-03 -2.089e-03 ... -2.089e-03 -1.044e-03 -1.044e-03]
 ...
 [-4.178e-03 -5.222e-03 -2.089e-03 ... -2.089e-03 -1.044e-03 -1.044e-03]
 [-2.089e-03 -2.611e-03 -1.044e-03 ... -1.044e-03  9.995e-01 -5.222e-04]
 [-2.089e-03 -2.611e-03 -1.044e-03 ... -1.044e-03 -5.222e-04  9.995e-01]]
min value =  -0.31018276762402086
max value =  0.9994778067885117
threshold =  0.2


In [13]:
# Assemble the QUBO from the modularity matrix and the constants computed
# above. The reported matrix size (10000 in the saved output) equals
# num_nodes * num_parts, the same value stored as result['size'] below.
Q = QCD.makeQubo(modularity, beta, gamma, GAMMA, num_nodes, num_parts, num_blocks, threshold)



Matrix size: 10000


In [14]:
# Run summary, seeded with the static problem description. The solver and
# post-processing cells below receive this dict and add their own entries.
result = {
    'num_clusters': num_parts,
    'nodes': num_nodes,
    'edges': num_edges,
    'size': num_nodes * num_parts,
    'subqubo_size': qsize,
}
result

{'num_clusters': 20,
 'nodes': 500,
 'edges': 981,
 'size': 10000,
 'subqubo_size': 64}

In [15]:
# Run k-clustering with Hybrid/D-Wave using ocean.
# `ss` is the solver output consumed by QCD.process_solution in the next cell.
# The call also fills `result` in place: the saved output shows new keys such
# as 'wall_clock_time_seconds', 'num_qpu_accesses', 'total_qpu_time', 'energy',
# 'num_occ', 'num_diff_solns' and 'total_solns' after it returns.
# NOTE(review): presumably this contacts the D-Wave service and needs
# credentials/network access — confirm before a Restart & Run All.
ss = QCD.clusterHybrid(Q, num_parts, qsize, run_label, run_profile, result)
result


 Q size =  10000
   0  1  2  3  4  5  6  7  8  9 10 11 12 13 ... 9999         energy num_oc. ...
0  0  0  0  0  0  1  1  1  1  1  0  0  0  0 ...    0 -133835.101828       2 ...
3  1  0  1  1  1  1  1  1  1  1  0  0  0  0 ...    0 -133812.704961       1 ...
2  1  1  1  1  1  1  1  1  1  1  1  1  1  1 ...    0 -133762.313316       1 ...
1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 ...    0 -133748.558747       1 ...
4  1  1  1  1  1  1  1  1  1  1  1  1  1  1 ...    0 -126403.603133       1 ...
['BINARY', 5 rows, 6 samples, 10000 variables]


{'num_clusters': 20,
 'nodes': 500,
 'edges': 981,
 'size': 10000,
 'subqubo_size': 64,
 'wall_clock_time_seconds': 34.783062,
 'num_qpu_accesses': 5,
 'total_qpu_time': 159135.8,
 'energy': -133835.10182767647,
 'num_occ': 2,
 'num_diff_solns': 5,
 'total_solns': 6}

In [16]:
# Process solution: turn the solver output `ss` into per-node community
# assignments. The saved output lists how many nodes landed in each part.
# `part_number` feeds the modularity metric, the cluster plot and the
# community file written below.
# NOTE(review): `result` is passed in and is presumably updated by the call —
# confirm against QCD.process_solution.
part_number = QCD.process_solution(ss, G, num_blocks, num_nodes, num_parts, result)


num non-zeros =  500

last part size 0 -25.0
part 0 has 431 nodes
part 1 has 0 nodes
part 2 has 0 nodes
part 3 has 0 nodes
part 4 has 0 nodes
part 5 has 0 nodes
part 6 has 0 nodes
part 7 has 0 nodes
part 8 has 0 nodes
part 9 has 0 nodes
part 10 has 0 nodes
part 11 has 0 nodes
part 12 has 2 nodes
part 13 has 0 nodes
part 14 has 0 nodes
part 15 has 66 nodes
part 16 has 0 nodes
part 17 has 0 nodes
part 18 has 1 nodes
part 19 has 0 nodes


In [17]:
# Score the partition with the modularity metric and record it in the run
# summary under 'modularity_metric'.
mmetric = QCD.calcModularityMetric(mtotal, modularity, part_number)
result['modularity_metric'] = mmetric


 Dim =  500


In [None]:
# draw graph clusters and save .png
# NOTE(review): this cell has no execution count in the saved notebook, so no
# figure is captured here; the image is presumably written under
# args_dict['output_dir'] — confirm against GFU.showClusters.
GFU.showClusters(part_number, G, args_dict)

In [18]:
# Persist the node -> community assignment as a text file; the next cells read
# it back from <output_dir>/comm{num_parts}.txt.
GFU.write_partFile(part_num=part_number, Dim=num_nodes,
                   nparts=num_parts, args_dict=args_dict)

500


In [19]:
# Read the node -> community assignments back from comm{num_parts}.txt in the
# output directory (the file is expected to have been produced by
# GFU.write_partFile above).
columns = ["node_id", "comm_id"]  # meaning of the two fields in each record

comm_file_path = os.path.join(args_dict['output_dir'], f"comm{num_parts}.txt")
with open(comm_file_path) as comm_file:
    # The first line is not a node record and is skipped (presumably a
    # header/count line — TODO confirm). The default keeps an empty file from
    # raising StopIteration.
    next(comm_file, None)
    # split() with no argument accepts any run of whitespace, so the parse no
    # longer depends on the fields being separated by exactly two spaces, and
    # blank lines are ignored instead of raising IndexError further down.
    communities = [line.split() for line in comm_file if line.strip()]

# Community id of every node, in file order.
pred_arr = [int(fields[1]) for fields in communities]
pred_arr[:10]

[15, 15, 15, 15, 15, 0, 0, 0, 0, 0]

In [23]:
# Convert the whitespace-separated community file into the CSV layout used for
# the submission: a fixed "id,prediction" header followed by one
# comma-joined record per node.
submission_file_path = os.path.join(args_dict['output_dir'], 'submission.csv')
with open(comm_file_path, 'r') as infile, open(submission_file_path, 'w') as submission_file:
    # Skip the first line of the community file; the default avoids a
    # StopIteration if the file is empty.
    next(infile, None)
    submission_file.write("id,prediction" + "\n")
    for line in infile:
        # Split on any run of whitespace.
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line would otherwise become an empty CSV row.
            continue
        submission_file.write(",".join(fields) + "\n")

In [24]:
# Submit the generated CSV to the competition identified by `competition`.
# NOTE(review): every run that reaches this cell makes a real submission,
# which presumably counts against the competition's submission limit — confirm
# before using Restart & Run All.
api.competition_submit(file_name=submission_file_path, competition=competition, message="Submission")

# Printed whenever the call above returns without raising; the API response
# itself is not inspected.
print("Submission complete!")

100%|██████████| 2.90k/2.90k [00:00<00:00, 4.61kB/s]


Submission complete!
