In [1]:
import urllib.request
import zipfile
import pandas as pd
import os
from grakel import Graph
import networkx as nx
import json
import random
import numpy as np
from grakel.datasets import fetch_dataset
from grakel.kernels import WeisfeilerLehman, VertexHistogram
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# Download the dataset zip file
url = "https://snap.stanford.edu/data/github_stargazers.zip"
filepath = "./../data/github_stargazers.zip"
if not os.path.isfile(filepath):
    urllib.request.urlretrieve(url, filepath)

    # Extract the csv file from the zip file
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall('./../data/')

In [3]:
# Step 1: Load the graph data and target classes
with open("./../data/github_stargazers/git_edges.json", "r") as f:
    edges = json.load(f)
    
targets = pd.read_csv("./../data/github_stargazers/git_target.csv", index_col=0)

In [16]:
# Limit the number of graphs to approximately 100 with an equal distribution of all classes
graph_ids = []
for i in range(2):
    class_ids = targets[targets["target"] == i].index.tolist()
    class_ids = random.sample(class_ids, min(len(class_ids), 100))
    graph_ids.extend(class_ids)

In [18]:
# Filter the graph data and target classes based on the selected graph ids
edges = {graph_id: edges[str(graph_id)] for graph_id in graph_ids}
targets = targets.loc[graph_ids]

In [19]:
# Step 2: Convert the graph data into a suitable format
graphs = []
for graph_id in graph_ids:
    graph_edges = edges[graph_id]
    mat_ij = max([x[0] for x in graph_edges] + [x[1] for x in graph_edges]) + 1
    adj_matrix = np.zeros((mat_ij, mat_ij))
    node_labels = {}
    node_id = 0
    for edge in graph_edges:
        if edge[0] not in node_labels:
            node_labels[edge[0]] = str(node_id)
            node_id += 1
        if edge[1] not in node_labels:
            node_labels[edge[1]] = str(node_id)
            node_id += 1
        adj_matrix[node_id-1][node_id-2] = 1
        adj_matrix[node_id-2][node_id-1] = 1
    graphs.append(Graph(adj_matrix, node_labels=node_labels))

In [33]:
# Step 3: Extract features using the Weisfeiler-Lehman graph kernel
wl_kernel = WeisfeilerLehman(n_iter=5)
X_wl = wl_kernel.fit_transform(graphs)

In [34]:
# Step 4: Train a machine learning model on the extracted features
X_train, X_test, y_train, y_test = train_test_split(X_wl, targets["target"], test_size=0.2, random_state=42)

clf = SVC(kernel="rbf", gamma=0.01, C=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [35]:
# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification report: \n{report}")

Accuracy: 0.475
Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.47      1.00      0.64        19

    accuracy                           0.48        40
   macro avg       0.24      0.50      0.32        40
weighted avg       0.23      0.47      0.31        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
