<center><h2>ALTEGRAD Project</h2>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez</span><br>
<hr>
</center>

# Imports

In [1]:
import networkx as nx
import csv
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from gensim.models import Word2Vec

# !pip install pip install karateclub
from karateclub import DeepWalk
from os import path
import pickle


In [2]:
if not path.isdir('data'):
    !mkdir data
    !wget -O altegrad.zip https://www.dropbox.com/sh/fhfjjtk0sr7pmse/AAD4ZEtHv9OI5HfVO22tdMX0a?dl=1
    !unzip altegrad.zip
else:
    print('Data already downloaded')

Data already downloaded


# Preprocessing

In [2]:
# Create a graph
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


In [3]:
stop_words = stopwords.words('english')
porter = PorterStemmer()

# Read the abstract of each paper
try:
    a_file = open("data/abstract_preprocessed.pkl", "rb")
    abstracts = pickle.load(a_file)
    a_file.close()
    print('Abstract already preprocessed')
except:
    print('Preprocessing abstracts')
    abstracts = dict()
    with open('data/abstracts.txt', 'r') as f:
        for line in tqdm(f):
            node, abstract = line.split('|--|')
            abstract = abstract.lower()
            abstract = "".join([char for char in abstract if char not in string.punctuation])
            abstract = word_tokenize(abstract)
            abstract = [word for word in abstract if word not in stop_words]
            abstract = [porter.stem(word) for word in abstract]
            abstracts[int(node)] = set(abstract)
    a_file = open("data/abstract_preprocessed.pkl", "wb")
    pickle.dump(abstracts, a_file)
    a_file.close()
    print('Preprocessing Done')

27it [00:00, 264.35it/s]

Preprocessing abstracts


138499it [05:49, 395.77it/s]


Preprocessing Done


# Embeddings

In [10]:
def return_embeddings(G, model, parameters):
    """Creates embeddings for a given model

    Args:
        G: nx Graph.
        model: Graph embedding model from karateclub library.
        parameters (dict): Dictionary containing model parameters.

    Returns:
        np.ndarray: embeddings.
    """
    
    emb = model(**parameters)
    emb.fit(G)
    
    return emb.get_embedding()

def validation_score(model, X_train, y_train, cv, scoring = 'neg_log_loss', n_jobs = None):
    """Computes scores using cross validation for a given model.

    Args:
        model: classifier
        X_train (array like): training set.
        y_train (array like): training lavels.
        cv (int): number of splits.
        scoring (str, optional): Metric. Defaults to 'neg_log_loss'.
        n_jobs (int, optional): Number of cores. Defaults to None.

    Returns:
        list: list containing scores from cross validation
    """
    
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'neg_log_loss', n_jobs = n_jobs)
    print('Score: {:.2f} ± {:.2f}'.format(-scores.mean(), scores.std()/np.sqrt(cv)))
    
    return scores


## Baseline

In [6]:
# Create the training matrix. Each row corresponds to a pair of nodes and
# its class label is 1 if it corresponds to an edge and 0, otherwise.
# Use the following 3 features for each pair of nodes:
# (1) sum of number of unique terms of the two nodes' abstracts
# (2) absolute value of difference of number of unique terms of the two nodes' abstracts
# (3) number of common terms between the abstracts of the two nodes

# (1) sum of degrees of two nodes
# (2) absolute value of difference of degrees of two nodes

# computes structural features for each node
G.remove_edges_from(nx.selfloop_edges(G))
core_number = nx.core_number(G)
# onion_number = nx.onion_layers(G)
# avg_neighbor_degree = nx.average_neighbor_degree(G)
# degree_centrality = nx.degree_centrality(G)
# clustering = nx.clustering(G)
print("Features computed")

X_train = np.zeros((2*m, 7))
y_train = np.zeros(2*m)
for i,edge in tqdm(enumerate(G.edges())):
    # an edge
    X_train[2*i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X_train[2*i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X_train[2*i,2] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    X_train[2*i,3] = G.degree(edge[0]) + G.degree(edge[1])
    X_train[2*i,4] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[2*i,5] = core_number[edge[0]] + core_number[edge[1]]
    X_train[2*i,6] = abs(core_number[edge[0]] - core_number[edge[1]])
    y_train[2*i] = 1

    # a randomly generated pair of nodes
    n1 = randint(0, n-1)
    n2 = randint(0, n-1)
    X_train[2*i+1,0] = len(abstracts[n1]) + len(abstracts[n2])
    X_train[2*i+1,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    X_train[2*i+1,2] = len(abstracts[n1].intersection(abstracts[n2]))
    X_train[2*i+1,3] = G.degree(n1) + G.degree(n2)
    X_train[2*i+1,4] = abs(G.degree(n1) - G.degree(n2))
    X_train[2*i,5] = core_number[n1] + core_number[n2]
    X_train[2*i,6] = abs(core_number[n1] - core_number[n2])
    y_train[2*i+1] = 0

print('Size of training matrix:', X_train.shape)

2957it [00:00, 29489.24it/s]

Features computed


1091955it [00:32, 33215.66it/s]

Size of training matrix: (2183910, 7)





In [11]:
clf = LogisticRegression()
scores = validation_score(clf, X_train, y_train, cv = 4, n_jobs = 1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Score: 0.00 ± 0.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## DeepWalk

In [6]:
parameters = {'walk_number': 20, 'walk_length': 80, 'dimensions': 128, 'workers': 3}
dw_embeddings = return_embeddings(G, DeepWalk, parameters)
print(type(dw_embeddings))

## Node2Vec

In [6]:
from karateclub import Node2Vec
parameters = {'walk_number': 10, 'walk_length': 80, 'dimensions': 128, 'workers': 3, 'window_size': 5}
n2v_embeddings = return_embeddings(G, Node2Vec, parameters)
np.save('data/embedding_n2v_wn10_wl80_d128_ws5', n2v_embeddings)

# Training

In [None]:
X_train = np.zeros((2*m, 7))
y_train = np.zeros(2*m)
for i,edge in tqdm(enumerate(G.edges())):
    # an edge
    X_train[2*i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X_train[2*i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X_train[2*i,2] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    X_train[2*i,3] = G.degree(edge[0]) + G.degree(edge[1])
    X_train[2*i,4] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[2*i,5] = dw_embeddings[edge[0],:]
    X_train[2*i,6] = dw_embeddings[edge[1],:]
    y_train[2*i] = 1

    # a randomly generated pair of nodes
    n1 = randint(0, n-1)
    n2 = randint(0, n-1)
    X_train[2*i+1,0] = len(abstracts[n1]) + len(abstracts[n2])
    X_train[2*i+1,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    X_train[2*i+1,2] = len(abstracts[n1].intersection(abstracts[n2]))
    X_train[2*i+1,3] = G.degree(n1) + G.degree(n2)
    X_train[2*i+1,4] = abs(G.degree(n1) - G.degree(n2))
    X_train[2*i,5] = dw_embeddings[edge[n1],:]
    X_train[2*i,6] = dw_embeddings[edge[n2],:]
    y_train[2*i+1] = 0

In [None]:
clf = LogisticRegression()
scores = validation_score(clf, X_train, y_train, cv = 4, n_jobs = 4)