In [1]:
import pandas as pd
import pickle
import networkx as nx
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools
from sklearn.metrics import classification_report

# Load Data
# NOTE: unpickling can execute arbitrary code -- only read pickles from a trusted source.
df = pd.read_pickle('Data/preprocessed_df.pkl')

# Swap each 'categories' value for its length; falsy values (e.g. None or empty) become None.
df['categories'] = df['categories'].apply(
    lambda cats: None if not cats else len(cats)
)
# Working copy without the free-text columns.
data = df.drop(columns=["title", "abstract"])

# 1 when the 'venue' value is truthy, 0 otherwise.
data['venue_exists'] = data['venue'].apply(lambda venue: int(bool(venue)))

# Graph Construction
# Directed graph: one node per row (keyed by the 'index' column) and one
# edge row -> cited entry for every element of that row's 'citations'.
G = nx.DiGraph()
for _, row in df.iterrows():
    paper_id = row['index']
    G.add_node(
        paper_id,
        title=row['title'],
        authors=row['authors'],
        year=row['year'],
        venue=row['venue'],
    )
    G.add_edges_from((paper_id, cited_id) for cited_id in row['citations'])

# Feature Extraction
def extract_graph_features(G, node1, node2):
    """Compute neighbourhood-overlap features for the node pair (node1, node2).

    Args:
        G: A networkx graph, directed or undirected.
        node1: First node of the pair.
        node2: Second node of the pair.

    Returns:
        A two-element list ``[common_neighbors, jaccard_coefficient]``: the
        number of common neighbours of the pair and the pair's Jaccard
        coefficient as computed by ``nx.jaccard_coefficient``.
    """
    # nx.common_neighbors and nx.jaccard_coefficient are not implemented for
    # directed graphs, so edge direction is ignored via a read-only undirected
    # view (no copy of the graph is made).
    undirected = G.to_undirected(as_view=True) if G.is_directed() else G
    common_neighbors = len(list(nx.common_neighbors(undirected, node1, node2)))
    # jaccard_coefficient yields (u, v, score) triples; exactly one pair is requested.
    jaccard_coefficient = list(nx.jaccard_coefficient(undirected, [(node1, node2)]))[0][2]
    return [common_neighbors, jaccard_coefficient]

In [4]:
# Feature Extraction
def extract_graph_features(G, node1, node2):
    """Compute neighbourhood-overlap features for the node pair (node1, node2).

    Edge direction is ignored: both features are evaluated on the undirected
    form of ``G``.

    Args:
        G: A networkx graph, directed or undirected.
        node1: First node of the pair.
        node2: Second node of the pair.

    Returns:
        A two-element list ``[common_neighbors, jaccard_coefficient]``: the
        number of common neighbours of the pair and the pair's Jaccard
        coefficient as computed by ``nx.jaccard_coefficient``.
    """
    # Use a read-only undirected *view* rather than G.to_undirected(): the
    # latter deep-copies the whole graph on every call, which makes
    # featurising many pairs prohibitively slow. The neighbour sets are the same.
    G_undirected = G.to_undirected(as_view=True) if G.is_directed() else G
    common_neighbors = len(list(nx.common_neighbors(G_undirected, node1, node2)))
    # jaccard_coefficient yields (u, v, score) triples; exactly one pair is requested.
    jaccard_coefficient = list(nx.jaccard_coefficient(G_undirected, [(node1, node2)]))[0][2]
    return [common_neighbors, jaccard_coefficient]

# Prepare Training Data
# One seed for negative sampling, the train/test split and the forest, so a
# fresh "Restart & Run All" reproduces the same report.
RANDOM_STATE = 42

edges = list(G.edges())
features = []
labels = []

# Positive examples: node1 cites node2.
for node1, node2 in edges:
    features.append(extract_graph_features(G, node1, node2))
    labels.append(1)

# Negative examples: node pairs with no citation in EITHER direction.
# extract_graph_features ignores edge direction, so (u, v) and (v, u) share
# one feature vector; testing only G.has_edge(u, v) would label the reverse
# of a real citation as a negative and contradict its positive twin.
nodes = list(G.nodes())
n_nodes = len(nodes)
linked_pairs = {frozenset(edge) for edge in edges if edge[0] != edge[1]}
n_candidate_pairs = n_nodes * (n_nodes - 1) // 2 - len(linked_pairs)

# Sample as many negatives as there are positives instead of featurising every
# one of the O(n^2) unlinked pairs (infeasible on a large graph, and extremely
# imbalanced). `non_edges` therefore holds a sample, not all unlinked pairs.
n_negatives = min(len(edges), n_candidate_pairs)
rng = np.random.default_rng(RANDOM_STATE)

if 2 * n_negatives >= n_candidate_pairs:
    # Dense / tiny graph: few unlinked pairs exist, so enumerate them and
    # draw without replacement (rejection sampling would stall here).
    candidates = [
        pair for pair in itertools.combinations(nodes, 2)
        if frozenset(pair) not in linked_pairs
    ]
    chosen = sorted(rng.permutation(len(candidates))[:n_negatives])
    non_edges = [candidates[k] for k in chosen]
else:
    # Sparse graph: rejection-sample distinct unlinked pairs. Terminates
    # because n_negatives never exceeds the number of unlinked pairs.
    non_edges = []
    seen_pairs = set()
    while len(non_edges) < n_negatives:
        i, j = rng.integers(n_nodes, size=2)
        if i == j:
            continue
        key = frozenset((nodes[i], nodes[j]))
        if key in linked_pairs or key in seen_pairs:
            continue
        seen_pairs.add(key)
        non_edges.append((nodes[i], nodes[j]))

for node1, node2 in non_edges:
    features.append(extract_graph_features(G, node1, node2))
    labels.append(0)  # Negative example: no citation between the two nodes

X = np.array(features)
y = np.array(labels)

# Model Training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


KeyboardInterrupt: 