<center><h2>ALTEGRAD Project</h2>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez</span><br>
<hr>
</center>

# Imports

In [5]:
import networkx as nx
import csv
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from gensim.models import Word2Vec

# %pip install karateclub
from karateclub import DeepWalk


In [6]:
!mkdir data

# Preprocessing

In [7]:
# Build an undirected graph from the comma-separated edge list; node ids are parsed as ints
G = nx.read_edgelist(
    'data/edgelist.txt',
    delimiter=',',
    nodetype=int,
    create_using=nx.Graph(),
)
nodes = list(G.nodes())

# Graph size, reused by the feature-building cells below
n, m = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


In [10]:
stop_words = stopwords.words('english')
porter = PorterStemmer()

# Cache of the preprocessed abstracts: one row per paper, with the unique
# stemmed terms stored as a single space-separated string.
ABSTRACTS_CACHE = 'data/abstract_preprocessed.csv'
stop_word_set = set(stop_words)  # set lookup instead of scanning the list for every token
punctuation_table = str.maketrans('', '', string.punctuation)

# Read the abstract of each paper.
# `abstracts` maps node id -> set of unique stemmed terms and is defined on
# BOTH paths (cache hit and rebuild), since the feature cells depend on it.
try:
    # dtype=str / keep_default_na=False: keep empty abstracts as '' and stop
    # tokens such as "nan" or "null" from being parsed as missing values.
    df_abstracts = pd.read_csv(ABSTRACTS_CACHE, dtype=str, keep_default_na=False)
    if not {'node', 'abstract'}.issubset(df_abstracts.columns):
        # e.g. a cache file written with a different layout: rebuild it
        raise ValueError('cache has an unexpected layout')
except (FileNotFoundError, ValueError):
    abstracts = dict()
    with open('data/abstracts.txt', 'r') as f:
        for line in tqdm(f):
            # maxsplit=1: tolerate the separator appearing inside the abstract text
            node, abstract = line.split('|--|', 1)
            abstract = abstract.lower().translate(punctuation_table)
            tokens = word_tokenize(abstract)
            abstracts[int(node)] = {
                porter.stem(word) for word in tokens if word not in stop_word_set
            }
    df_abstracts = pd.DataFrame({
        'node': list(abstracts.keys()),
        'abstract': [' '.join(sorted(terms)) for terms in abstracts.values()],
    })
    df_abstracts.to_csv(ABSTRACTS_CACHE, index = False)
else:
    # Cache hit: rebuild the node -> term-set mapping from the stored strings
    abstracts = {
        int(node): set(terms.split())
        for node, terms in zip(df_abstracts['node'], df_abstracts['abstract'])
    }

# Embeddings

## Baseline

In [None]:
# Create the training matrix. Each row corresponds to a pair of nodes and
# its class label is 1 if it corresponds to an edge and 0, otherwise.
# Use the following 5 features for each pair of nodes:
# (1) sum of number of unique terms of the two nodes' abstracts
# (2) absolute value of difference of number of unique terms of the two nodes' abstracts
# (3) number of common terms between the abstracts of the two nodes
# (4) sum of degrees of two nodes
# (5) absolute value of difference of degrees of two nodes

X_train = np.zeros((2*m, 5))
y_train = np.zeros(2*m)
degree = dict(G.degree())  # one lookup table instead of repeated G.degree() calls

# Wrapping the edge iterator (with total=m) lets tqdm show a percentage and ETA
for i, edge in enumerate(tqdm(G.edges(), total=m)):
    # a randomly generated pair of nodes, used as the negative example
    # NOTE(review): assumes node ids are exactly 0..n-1 — confirm against the edge list
    negative = (randint(0, n-1), randint(0, n-1))

    # even row: the edge (label 1); odd row: the random pair (label 0)
    for row, (u, v), label in ((2*i, edge, 1), (2*i+1, negative, 0)):
        terms_u, terms_v = abstracts[u], abstracts[v]
        X_train[row, 0] = len(terms_u) + len(terms_v)
        X_train[row, 1] = abs(len(terms_u) - len(terms_v))
        X_train[row, 2] = len(terms_u.intersection(terms_v))
        X_train[row, 3] = degree[u] + degree[v]
        X_train[row, 4] = abs(degree[u] - degree[v])
        y_train[row] = label

print('Size of training matrix:', X_train.shape)

In [None]:
def validation_score(model, X_train, y_train, cv, scoring = 'neg_log_loss', n_jobs = None):
    """Cross-validate `model` and print the mean score with its standard error.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_train, y_train : training features and labels.
    cv : cross-validation strategy, forwarded to `cross_val_score`.
    scoring : scorer forwarded to `cross_val_score` (default 'neg_log_loss').
    n_jobs : number of parallel jobs forwarded to `cross_val_score`.

    Returns
    -------
    The array of per-fold scores exactly as returned by `cross_val_score`
    (i.e. still negated for 'neg_*' scorers).
    """
    # Forward the caller's choices instead of hard-coded values
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = scoring, n_jobs = n_jobs)

    # 'neg_*' scorers return negated losses: flip the sign for display only
    negated = isinstance(scoring, str) and scoring.startswith('neg_')
    mean_score = -scores.mean() if negated else scores.mean()

    # len(scores) is the number of folds, which also works when `cv` is not an int
    print('Score: {:.2f} ± {:.2f}'.format(mean_score, scores.std()/np.sqrt(len(scores))))

    return scores

# Baseline: logistic regression on the hand-crafted pair features, 4-fold cross-validation
clf = LogisticRegression()
scores = validation_score(
    model=clf,
    X_train=X_train,
    y_train=y_train,
    cv=4,
    n_jobs=4,
)

1091955it [00:52, 20615.23it/s]


Size of training matrix: (2183910, 5)
Score: 0.34 ± 0.03


## Deepwalk

In [None]:
"""
Deep Learning on Graphs - ALTEGRAD - Dec 2020
"""

import numpy as np
import networkx as nx
from random import randint
from gensim.models import Word2Vec


# Task 1
def random_walk(G, node, walk_length):
    """Simulate a random walk of at most `walk_length` nodes starting from `node`.

    At each step the walk moves to a uniformly chosen neighbour of the current
    node. If the current node has no neighbours the walk stops early, so the
    result may be shorter than `walk_length`.

    Returns the visited nodes as a list of strings.
    """
    walk = [node]
    for _ in range(walk_length-1):
        neighbors = list(G.neighbors(walk[-1]))
        if not neighbors:
            # Dead end (isolated node): randint(0, -1) would raise ValueError
            break
        walk.append(neighbors[randint(0, len(neighbors)-1)])

    return [str(visited) for visited in walk]


# Task 2
def generate_walks(G, num_walks, walk_length):
    """Run `num_walks` random walks of length `walk_length` from each node of `G`.

    The nodes are visited in a fresh random order on every pass.
    Returns a list of walks, each one being whatever `random_walk` returns.
    """
    walks = []

    nodes = list(G.nodes())
    for _ in range(num_walks):
        # Permute positions rather than the node list itself: converting the
        # nodes to a numpy array would coerce mixed or non-scalar labels
        # (e.g. strings next to ints), so they would no longer match G's nodes.
        for idx in np.random.permutation(len(nodes)):
            walks.append(random_walk(G, nodes[idx], walk_length))

    return walks

def deepwalk(G, num_walks, walk_length, n_dim):
    """Simulate walks and use the Skipgram model to learn node representations.

    Generates `num_walks` walks of length `walk_length` per node with
    `generate_walks`, then trains a Word2Vec model (sg=1) with `n_dim`
    dimensions on them.

    Returns the trained Word2Vec model.
    """
    print("Generating walks")
    walks = generate_walks(G, num_walks, walk_length)

    print("Training word2vec")
    params = dict(window=8, min_count=0, sg=1, workers=8)
    try:
        # gensim >= 4.0 names the embedding dimension `vector_size`
        model = Word2Vec(vector_size=n_dim, **params)
    except TypeError:
        # older gensim releases only know the keyword `size`
        model = Word2Vec(size=n_dim, **params)
    model.build_vocab(walks)
    model.train(walks, total_examples=model.corpus_count, epochs=5)

    return model

In [None]:
# DeepWalk hyper-parameters: embedding size, walks per node, nodes per walk
n_dim, n_walks, walk_length = 128, 20, 80

# Fit karateclub's DeepWalk on the graph and retrieve the learned embedding matrix
# NOTE(review): rows of `embeddings` are presumably indexed by node id — confirm with the karateclub docs
model = DeepWalk(
    walk_number=n_walks,
    walk_length=walk_length,
    dimensions=n_dim,
    workers=2,
)
model.fit(G)
embeddings = model.get_embedding()

In [None]:
# Training matrix for the DeepWalk model. Each row is a pair of nodes (label 1
# for an edge, 0 for a random pair) described by the 5 hand-crafted features
# followed by the embedding of each of the two nodes:
#   columns 0-4              : abstract / degree features
#   columns 5 .. 5+d-1       : embedding of the first node
#   columns 5+d .. 5+2d-1    : embedding of the second node
# where d is the embedding dimension.
n_emb = embeddings.shape[1]

# An embedding is a vector, so it needs n_emb columns rather than one.
# float32 halves the memory of this wide matrix; the count features are small
# integers and remain exact.
X_train = np.zeros((2*m, 5 + 2*n_emb), dtype=np.float32)
y_train = np.zeros(2*m)
degree = dict(G.degree())  # one lookup table instead of repeated G.degree() calls

for i, edge in enumerate(tqdm(G.edges(), total=m)):
    # a randomly generated pair of nodes, used as the negative example
    # NOTE(review): assumes node ids are exactly 0..n-1 and index rows of `embeddings` — confirm
    negative = (randint(0, n-1), randint(0, n-1))

    # even row: the edge (label 1); odd row: the random pair (label 0)
    for row, (u, v), label in ((2*i, edge, 1), (2*i+1, negative, 0)):
        terms_u, terms_v = abstracts[u], abstracts[v]
        X_train[row, 0] = len(terms_u) + len(terms_v)
        X_train[row, 1] = abs(len(terms_u) - len(terms_v))
        X_train[row, 2] = len(terms_u.intersection(terms_v))
        X_train[row, 3] = degree[u] + degree[v]
        X_train[row, 4] = abs(degree[u] - degree[v])
        X_train[row, 5:5+n_emb] = embeddings[u]
        X_train[row, 5+n_emb:] = embeddings[v]
        y_train[row] = label

In [None]:
# Evaluate logistic regression on the feature + embedding matrix built in the previous cell
clf = LogisticRegression()
scores = validation_score(clf, X_train, y_train, cv = 4, n_jobs = 4)