## Citation Prediction Challenge
***
### Course : Data Science Challenge

Members: Thomas Saltos, Nikos Kafritsas


In [1]:
import csv
import networkx as nx
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from tqdm.auto import tqdm

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
import re

import os
import gensim
import smart_open
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from nltk.stem import WordNetLemmatizer
from deepwalk import deepwalk

from nodevectors import Node2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nkaf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nkaf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# import stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# set up Logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Data Preparation

In [4]:
# Create the graph
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
edges = list(G.edges())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Total Number of nodes:', n)
print('Total Number of edges:', m)


# Split into training/validation set
n_val= int(m*0.05)
idx=np.random.permutation(m)
val_edges= [edges[idx[i]] for i in range(n_val)]

for edge in val_edges:
    G.remove_edge( edge[0], edge[1] )

Total Number of nodes: 138499
Total Number of edges: 1091955


# Feature Engineering

### Abstracts Tokenization

In [5]:
import re

abstracts = dict()
with open('abstracts.txt', 'r', errors='ignore') as f:
    for line in f:
        node, abstract = line.split('|--|')
        abstracts[int(node)] = abstract

# Map text to set of terms
for node in abstracts:
   #list_of_words=abstracts[node].split()
   list_of_words=re.split(' |/',abstracts[node])
   
   #convert to lowercase
   list_of_words = map(lambda x: x.lower(), list_of_words)
   
   #replace special characters
   list_of_words = map(lambda x: x.replace('.', '').replace(',', '').replace('(', '').replace(')', ''), list_of_words) 
   
   #replace new line
   list_of_words = map(lambda x: x.replace('\n', ''), list_of_words) 
   
   #remove http addresses
   list_of_words = filter(lambda x: not x.startswith('http'), list_of_words)
    
   #remove digits
   list_of_words = filter(lambda x: not x.isdigit(), list_of_words)  
    
   #remove stopwords
   list_of_words = filter(lambda x: x not in stop_words, list_of_words)   
    
   #lematization
   list_of_words = map(lambda x: lemmatizer.lemmatize(x), list_of_words)  
    
    
   abstracts[node] = set(list_of_words) #list(list_of_words)

### Create doc2vec model

In [6]:
def read_corpus(fname, sentences_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if sentences_only:
                line=line.replace('\n', '')
                index, text = line.split('|--|')
                yield text
            else:
                tokens = gensim.utils.simple_preprocess(line)
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus('abstracts.txt'))
train_corpus_as_sentences = list(read_corpus('abstracts.txt', sentences_only=True))

In [24]:
#Don't run again

# Build vocabulary
#dc2vec = gensim.models.doc2vec.Doc2Vec(vector_size=32, dm=1, hs=1, min_count=2)
#dc2vec.build_vocab(train_corpus)

2021-06-11 02:23:32,469 : INFO : collecting all words and their counts
2021-06-11 02:23:32,471 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-06-11 02:23:32,662 : INFO : PROGRESS: at example #10000, processed 1472954 words (7694876/s), 23526 word types, 10000 tags
2021-06-11 02:23:32,868 : INFO : PROGRESS: at example #20000, processed 3037302 words (7635312/s), 33069 word types, 20000 tags
2021-06-11 02:23:33,053 : INFO : PROGRESS: at example #30000, processed 4483342 words (7807501/s), 41248 word types, 30000 tags
2021-06-11 02:23:33,239 : INFO : PROGRESS: at example #40000, processed 5899643 words (7657959/s), 48536 word types, 40000 tags
2021-06-11 02:23:33,435 : INFO : PROGRESS: at example #50000, processed 7425140 words (7802744/s), 54927 word types, 50000 tags
2021-06-11 02:23:33,630 : INFO : PROGRESS: at example #60000, processed 8825698 words (7225173/s), 61001 word types, 60000 tags
2021-06-11 02:23:33,819 : INFO : PROGRESS: at example #70

In [25]:
# Train
#dc2vec.train(train_corpus, total_examples=dc2vec.corpus_count, epochs=dc2vec.epochs)

2021-06-11 02:24:00,422 : INFO : training model with 8 workers on 15728 vocabulary and 32 features, using sg=1 hs=1 sample=0.001 negative=5 window=10
2021-06-11 02:24:01,431 : INFO : EPOCH 1 - PROGRESS: at 7.29% examples, 1148440 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:02,440 : INFO : EPOCH 1 - PROGRESS: at 14.70% examples, 1193365 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:03,453 : INFO : EPOCH 1 - PROGRESS: at 22.18% examples, 1171658 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:24:04,457 : INFO : EPOCH 1 - PROGRESS: at 28.37% examples, 1115736 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:05,462 : INFO : EPOCH 1 - PROGRESS: at 34.05% examples, 1076799 words/s, in_qsize 16, out_qsize 1
2021-06-11 02:24:06,462 : INFO : EPOCH 1 - PROGRESS: at 39.94% examples, 1051136 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:07,463 : INFO : EPOCH 1 - PROGRESS: at 45.93% examples, 1030828 words/s, in_qsize 16, out_qsize 1
2021-06-11 02:24:08,467 : INFO : EPOCH 1 - PRO

2021-06-11 02:24:52,050 : INFO : EPOCH 4 - PROGRESS: at 19.56% examples, 1039277 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:53,059 : INFO : EPOCH 4 - PROGRESS: at 26.53% examples, 1045473 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:24:54,060 : INFO : EPOCH 4 - PROGRESS: at 32.99% examples, 1042244 words/s, in_qsize 15, out_qsize 0
2021-06-11 02:24:55,067 : INFO : EPOCH 4 - PROGRESS: at 39.82% examples, 1046648 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:24:56,072 : INFO : EPOCH 4 - PROGRESS: at 46.32% examples, 1038491 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:24:57,076 : INFO : EPOCH 4 - PROGRESS: at 53.06% examples, 1035269 words/s, in_qsize 16, out_qsize 1
2021-06-11 02:24:58,093 : INFO : EPOCH 4 - PROGRESS: at 59.98% examples, 1032231 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:24:59,105 : INFO : EPOCH 4 - PROGRESS: at 66.79% examples, 1029912 words/s, in_qsize 16, out_qsize 0
2021-06-11 02:25:00,106 : INFO : EPOCH 4 - PROGRESS: at 73.66% examples, 1029473

In [8]:
#### compute all doc2vec vectors for each node once, to avoid computing them multiple times.


#Don't run again
# abstract2vec= dict()

# pbar = tqdm(total=len(nodes))
# for i in nodes:
#     abstract2vec[i]=dc2vec.infer_vector(train_corpus[i].words).reshape(-1, 1).T
#     pbar.update(1)

# pbar.close()

# Save
#np.save('my_file.npy', abstract2vec) 

# Load
abstract2vec = np.load('my_file.npy',allow_pickle='TRUE').item()

### Create authors dict

In [9]:
authors = dict()
with open('authors.txt', 'r', errors='ignore') as f:
    for line in f:
        line=line.replace('\n', '')
        node, author = line.split('|--|')
        authors[int(node)] = author

# Map text to set of terms
for node in authors:
   authors[node] = set(authors[node].split(','))

### Bert

In [30]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('allenai-specter')

2021-06-11 13:55:08,346 : INFO : Load pretrained SentenceTransformer: allenai-specter
2021-06-11 13:55:08,346 : INFO : Did not find folder allenai-specter
2021-06-11 13:55:08,346 : INFO : Search model on server: http://sbert.net/models/allenai-specter.zip
2021-06-11 13:55:08,351 : INFO : Downloading sentence transformer model from http://sbert.net/models/allenai-specter.zip and saving it at C:\Users\nkaf/.cache\torch\sentence_transformers\sbert.net_models_allenai-specter


  0%|          | 0.00/408M [00:00<?, ?B/s]

2021-06-11 14:54:41,790 : INFO : Load SentenceTransformer from folder: C:\Users\nkaf/.cache\torch\sentence_transformers\sbert.net_models_allenai-specter
2021-06-11 14:54:43,202 : INFO : Use pytorch device: cpu


In [10]:
#Implement some data transformation (remove special tokens, convert to lowercase, remove stopwords)


#train_corpus_as_sentences_df=pd.DataFrame(train_corpus_as_sentences,columns=['sentences'])
#train_corpus_as_sentences_df['sentences_cleaned']=train_corpus_as_sentences_df.sentences.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words) )
#train_corpus_as_sentences_df['sentences_cleaned']

#document_embeddings = sbert_model.encode(train_corpus_as_sentences_df.sentences_cleaned)
#np.save('bert_paper_embeddings', document_embeddings)
document_embeddings = np.load('bert_paper_embeddings.npy')

###  Variational Graph Autoencoder

In [11]:
z=np.load('vGAE_training.npy')

# Model validation

## Create the training graph

In [13]:
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes in the Training graph:', n)
print('Number of edges in the Training graph:', m)

Number of nodes in the Training graph: 138499
Number of edges in the Training graph: 1037358


In [14]:
%%time

X_train = np.zeros((2*m, 11))
y_train = np.zeros(2*m)

pbar = tqdm(total=m)
for i,edge in enumerate(G.edges()):
    
    # centrality and neighbours
    X_train[2*i,0] = G.degree(edge[0]) + G.degree(edge[1])
    X_train[2*i,1] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[2*i,2] = len(list(nx.common_neighbors(G, edge[0], edge[1])))
    
    # abstracts as set
    X_train[2*i,3] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X_train[2*i,4] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X_train[2*i,5] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    
    #common authors between papers
    X_train[2*i,6] = len(authors[edge[0]].intersection(authors[edge[1]]))
    
    #embeddings from Bert
    X_train[2*i,7] = np.squeeze(cosine_similarity(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    X_train[2*i,8] = np.squeeze(euclidean_distances(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_train[2*i,9] = np.dot(z[edge[0],:], z[edge[1],:])/(np.linalg.norm(z[edge[0],:])*np.linalg.norm(z[edge[1],:]))
    X_train[2*i,10] = np.linalg.norm(z[edge[0],:]-z[edge[1],:])
   
    y_train[2*i] = 1

    # a randomly generated pair of nodes
    
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]

    while G.has_edge(n1,n2) or n1==n2:
        n1 = nodes[randint(0, n-1)]
        n2 = nodes[randint(0, n-1)]
    
    #centrality and neighbours
    X_train[2*i+1,0] = G.degree(n1) + G.degree(n2)
    X_train[2*i+1,1] = abs(G.degree(n1) - G.degree(n2))
    X_train[2*i+1,2] = len(list(nx.common_neighbors(G, n1, n2)))
    
    # abstracts as set
    X_train[2*i+1,3] = len(abstracts[n1]) + len(abstracts[n2])
    X_train[2*i+1,4] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    X_train[2*i+1,5] = len(abstracts[n1].intersection(abstracts[n2]))
    
    #common authors between papers
    X_train[2*i+1,6] = len(authors[n1].intersection(authors[n2]))
    
    #embeddings from Bert
    X_train[2*i+1,7] = np.squeeze(cosine_similarity(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    X_train[2*i+1,8] = np.squeeze(euclidean_distances(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_train[2*i+1,9] = np.dot(z[n1,:], z[n2,:])/(np.linalg.norm(z[n1,:])*np.linalg.norm(z[n2,:]))
    X_train[2*i+1,10] = np.linalg.norm(z[n1,:]-z[n2,:])
    
    y_train[2*i+1] = 0
    
    pbar.update(1)

pbar.close()

print('Size of training matrix:', X_train.shape)

  0%|          | 0/1037358 [00:00<?, ?it/s]

Size of training matrix: (2074716, 11)
Wall time: 23min 40s


### Create the validation graph

In [15]:
X_val = np.zeros((2*n_val,11))
y_val = np.zeros(2*n_val)

pbar = tqdm(total=n_val)
for i,edge in enumerate(val_edges):

    
    # centrality and neighbours
    X_val[2*i,0] = G.degree(edge[0]) + G.degree(edge[1])
    X_val[2*i,1] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_val[2*i,2] = len(list(nx.common_neighbors(G, edge[0], edge[1])))
    
    # abstracts as set
    X_val[2*i,3] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X_val[2*i,4] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X_val[2*i,5] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    
    #common authors between papers
    X_val[2*i,6] = len(authors[edge[0]].intersection(authors[edge[1]]))
    
    # Bert
    X_val[2*i,7] = np.squeeze(cosine_similarity(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    X_val[2*i,8] = np.squeeze(euclidean_distances(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_val[2*i,9] = np.dot(z[edge[0],:], z[edge[1],:])/(np.linalg.norm(z[edge[0],:])*np.linalg.norm(z[edge[1],:]))
    X_val[2*i,10] = np.linalg.norm(z[edge[0],:]-z[edge[1],:])

    y_val[2*i] = 1

    # a randomly generated pair of nodes
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]

    while G.has_edge(n1,n2) or n1==n2:
        print('rejected')
        n1 = nodes[randint(0, n-1)]
        n2 = nodes[randint(0, n-1)]
    
    # centrality and neighbours
    X_val[2*i+1,0] = G.degree(n1) + G.degree(n2)
    X_val[2*i+1,1] = abs(G.degree(n1) - G.degree(n2))
    X_val[2*i+1,2] = len(list(nx.common_neighbors(G, n1, n2)))
    
    # abstracts as set
    X_val[2*i+1,3] = len(abstracts[n1]) + len(abstracts[n2])
    X_val[2*i+1,4] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    X_val[2*i+1,5] = len(abstracts[n1].intersection(abstracts[n2]))
    
    #common authors between papers
    X_val[2*i+1,6] = len(authors[n1].intersection(authors[n2]))
    
    # Bert
    X_val[2*i+1,7] = np.squeeze(cosine_similarity(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    X_val[2*i+1,8] = np.squeeze(euclidean_distances(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_val[2*i+1,9] = np.dot(z[n1,:], z[n2,:])/(np.linalg.norm(z[n1,:])*np.linalg.norm(z[n2,:]))
    X_val[2*i+1,10] = np.linalg.norm(z[n1,:]-z[n2,:])
   
    y_val[2*i+1] = 0
    pbar.update(1)

pbar.close()

print('Size of validation matrix:', X_val.shape)

  0%|          | 0/54597 [00:00<?, ?it/s]

rejected
rejected
rejected
rejected
rejected
rejected
rejected
rejected
rejected
Size of validation matrix: (109194, 11)


In [110]:
# train the algorithm 
clf = LogisticRegression(max_iter=2000, C=0.0001)
clf.fit(X_train, y_train)

# train loss
y_pred_train = clf.predict_proba(X_train)
y_pred_train = y_pred_train[:,1]
print(log_loss(y_train, y_pred_train))


# validation loss
y_pred_val = clf.predict_proba(X_val)
y_pred_val = y_pred_val[:,1]
print(log_loss(y_val, y_pred_val))


0.07887584426291587
0.12901919140611703


In [133]:
# Test in validation set

parameters = {'max_depth':[8], 'min_samples_leaf':[10,11, 12, 13, 14, 15, 16, 17, 18, 19, 20] }
best_parameters = dict()
best_loss = np.inf
for s in parameters['max_depth']:
    for c in parameters['min_samples_leaf']:
        print('Looking for max_depth= '+str(s)+' and '+ 'min_samples_leaf=' + str(c))
        clf = RandomForestClassifier(max_depth=s, min_samples_leaf=c,random_state=0, n_jobs=-1,n_estimators=200)
        clf.fit(X_train2, y_train)
        y_pred = clf.predict_proba(X_val2)
        y_pred = y_pred[:,1] 
        loss = log_loss(y_val, y_pred)
        print(loss)
        if loss < best_loss:      
            best_loss = loss
            print(best_loss)
            best_parameters['max_depth'] = s
            best_parameters['min_samples_leaf'] = c

print("Best params:", best_parameters)

Looking for max_depth= 8 and min_samples_leaf=10
0.11392678644197245
0.11392678644197245
Looking for max_depth= 8 and min_samples_leaf=11
0.11405382902204954
Looking for max_depth= 8 and min_samples_leaf=12
0.11411705747044959
Looking for max_depth= 8 and min_samples_leaf=13
0.11404264951832285
Looking for max_depth= 8 and min_samples_leaf=14
0.11404772663470586
Looking for max_depth= 8 and min_samples_leaf=15
0.11437139670717601
Looking for max_depth= 8 and min_samples_leaf=16
0.11399907274315534
Looking for max_depth= 8 and min_samples_leaf=17
0.11376953001095534
0.11376953001095534
Looking for max_depth= 8 and min_samples_leaf=18
0.11340071661648615
0.11340071661648615
Looking for max_depth= 8 and min_samples_leaf=19
0.11357629946367787
Looking for max_depth= 8 and min_samples_leaf=20
0.11393120277395143
Best params: {'max_depth': 8, 'min_samples_leaf': 18}


In [136]:
clf = RandomForestClassifier(max_depth=8, random_state=0, n_jobs=-1,n_estimators=250, min_samples_leaf=18)
clf.fit(X_train, y_train)

y_pred_train = clf.predict_proba(X_train)
y_pred_train = y_pred_train[:,1]
print(log_loss(y_train, y_pred_train))


# validation loss
y_pred_val = clf.predict_proba(X_val)
y_pred_val = y_pred_val[:,1]
print(log_loss(y_val, y_pred_val))


0.0650153618898971
0.11351903987344884


0.1134

## Training on the whole dataset
***
### Create training graph

In [3]:
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
edges = list(G.edges())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


### Load Vgae embeddings for the whole dataset 

In [None]:
z=np.load('vGAE.npy')

In [24]:
%%time

X_train = np.zeros((2*m, 11))
y_train = np.zeros(2*m)

pbar = tqdm(total=m)
for i,edge in enumerate(G.edges()):
    
    # centrality and neighbours
    X_train[2*i,0] = G.degree(edge[0]) + G.degree(edge[1])
    X_train[2*i,1] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[2*i,2] = len(list(nx.common_neighbors(G, edge[0], edge[1])))
    
    # abstracts as set
    X_train[2*i,3] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X_train[2*i,4] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X_train[2*i,5] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    
    #common authors between papers
    X_train[2*i,6] = len(authors[edge[0]].intersection(authors[edge[1]]))
    
    #embeddings from Bert
    X_train[2*i,7] = np.squeeze(cosine_similarity(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    X_train[2*i,8] = np.squeeze(euclidean_distances(document_embeddings[edge[0]].reshape(1, -1),document_embeddings[edge[1]].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_train[2*i,9] = np.dot(z[edge[0],:], z[edge[1],:])/(np.linalg.norm(z[edge[0],:])*np.linalg.norm(z[edge[1],:]))
    X_train[2*i,10] = np.linalg.norm(z[edge[0],:]-z[edge[1],:])
   
    y_train[2*i] = 1

    # a randomly generated pair of nodes
    
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]

    while G.has_edge(n1,n2) or n1==n2:
        n1 = nodes[randint(0, n-1)]
        n2 = nodes[randint(0, n-1)]
    
    #centrality and neighbours
    X_train[2*i+1,0] = G.degree(n1) + G.degree(n2)
    X_train[2*i+1,1] = abs(G.degree(n1) - G.degree(n2))
    X_train[2*i+1,2] = len(list(nx.common_neighbors(G, n1, n2)))
    
    # abstracts as set
    X_train[2*i+1,3] = len(abstracts[n1]) + len(abstracts[n2])
    X_train[2*i+1,4] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    X_train[2*i+1,5] = len(abstracts[n1].intersection(abstracts[n2]))
    
    #common authors between papers
    X_train[2*i+1,6] = len(authors[n1].intersection(authors[n2]))
    
    #embeddings from Bert
    X_train[2*i+1,7] = np.squeeze(cosine_similarity(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    X_train[2*i+1,8] = np.squeeze(euclidean_distances(document_embeddings[n1].reshape(1, -1),document_embeddings[n2].reshape(1, -1)))
    
    # Graph variational autoencoder
    X_train[2*i+1,9] = np.dot(z[n1,:], z[n2,:])/(np.linalg.norm(z[n1,:])*np.linalg.norm(z[n2,:]))
    X_train[2*i+1,10] = np.linalg.norm(z[n1,:]-z[n2,:])
    
    y_train[2*i+1] = 0
    
    pbar.update(1)

pbar.close()

print('Size of training matrix:', X_train.shape)

100%|███████████████████████████████████████████████████████████████████████| 1091955/1091955 [32:09<00:00, 565.97it/s]

Size of training matrix: (2183910, 11)
Wall time: 32min 9s
Compiler : 120 ms





### Create testing graph

In [40]:
node_pairs = list()
with open('test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))

X_test = np.zeros((len(node_pairs), 11))

pbar = tqdm(total=len(node_pairs))
for i,node_pair in enumerate(node_pairs):
    
    X_test[i,0] = G.degree(node_pair[0]) + G.degree(node_pair[1])
    X_test[i,1] = abs(G.degree(node_pair[0]) - G.degree(node_pair[1]))
    X_test[i,2] = len(list(nx.common_neighbors(G, node_pair[0], node_pair[1])))
    
    X_test[i,3] = len(abstracts[node_pair[0]]) + len(abstracts[node_pair[1]])
    X_test[i,4] = abs(len(abstracts[node_pair[0]]) - len(abstracts[node_pair[1]]))
    X_test[i,5] = len(abstracts[node_pair[0]].intersection(abstracts[node_pair[1]]))
    
    X_test[i,6] = len(authors[node_pair[0]].intersection(authors[node_pair[1]]))
    
    X_test[i,7] = np.squeeze(cosine_similarity(document_embeddings[node_pair[0]].reshape(1, -1),document_embeddings[node_pair[1]].reshape(1, -1)))
    X_test[i,8] = np.squeeze(euclidean_distances(document_embeddings[node_pair[0]].reshape(1, -1),document_embeddings[node_pair[1]].reshape(1, -1)))

    X_test[i,9] = np.dot(z[node_pair[0],:], z[node_pair[1],:])/(np.linalg.norm(z[node_pair[0],:])*np.linalg.norm(z[node_pair[1],:]))
    X_test[i,10] = np.linalg.norm(z[node_pair[0],:]-z[node_pair[1],:])
   

    pbar.update(1)

pbar.close()
    

print('Size of test matrix:', X_test.shape)


  0%|                                                                                       | 0/106692 [00:00<?, ?it/s][A
  0%|                                                                             | 4/106692 [00:00<1:03:19, 28.08it/s][A
  0%|                                                                             | 9/106692 [00:00<1:12:24, 24.56it/s][A
  0%|                                                                              | 20/106692 [00:00<36:44, 48.38it/s][A
  0%|                                                                              | 29/106692 [00:00<32:14, 55.14it/s][A
  0%|                                                                              | 47/106692 [00:00<21:08, 84.10it/s][A
  0%|                                                                             | 82/106692 [00:00<12:23, 143.42it/s][A
  0%|                                                                            | 104/106692 [00:00<10:58, 161.77it/s][A
  0%|          

 13%|█████████▎                                                               | 13552/106692 [00:14<00:59, 1554.74it/s][A
 13%|█████████▍                                                               | 13708/106692 [00:14<00:59, 1555.80it/s][A
 13%|█████████▍                                                               | 13866/106692 [00:15<00:59, 1562.78it/s][A
 13%|█████████▌                                                               | 14023/106692 [00:15<00:59, 1546.06it/s][A
 13%|█████████▋                                                               | 14178/106692 [00:15<01:02, 1470.75it/s][A
 13%|█████████▊                                                               | 14326/106692 [00:15<01:03, 1445.21it/s][A
 14%|█████████▉                                                               | 14484/106692 [00:15<01:02, 1482.74it/s][A
 14%|██████████                                                               | 14650/106692 [00:15<01:00, 1529.06it/s][A
 14%|██████████▏

 33%|████████████████████████▎                                                | 35555/106692 [00:28<00:42, 1661.24it/s][A
 33%|████████████████████████▍                                                | 35722/106692 [00:28<00:43, 1618.12it/s][A
 34%|████████████████████████▌                                                | 35885/106692 [00:28<00:45, 1554.68it/s][A
 34%|████████████████████████▋                                                | 36042/106692 [00:29<00:46, 1531.97it/s][A
 34%|████████████████████████▊                                                | 36196/106692 [00:29<00:46, 1520.48it/s][A
 34%|████████████████████████▊                                                | 36349/106692 [00:29<00:47, 1484.12it/s][A
 34%|████████████████████████▉                                                | 36507/106692 [00:29<00:46, 1510.73it/s][A
 34%|█████████████████████████                                                | 36659/106692 [00:29<00:47, 1474.54it/s][A
 34%|███████████

 52%|█████████████████████████████████████▉                                   | 55506/106692 [00:42<00:34, 1465.43it/s][A
 52%|██████████████████████████████████████                                   | 55667/106692 [00:42<00:33, 1506.57it/s][A
 52%|██████████████████████████████████████▏                                  | 55837/106692 [00:42<00:32, 1550.61it/s][A
 52%|██████████████████████████████████████▎                                  | 55996/106692 [00:42<00:32, 1559.46it/s][A
 53%|██████████████████████████████████████▍                                  | 56158/106692 [00:43<00:32, 1576.45it/s][A
 53%|██████████████████████████████████████▌                                  | 56328/106692 [00:43<00:31, 1600.40it/s][A
 53%|██████████████████████████████████████▋                                  | 56489/106692 [00:43<00:31, 1572.28it/s][A
 53%|██████████████████████████████████████▊                                  | 56647/106692 [00:43<00:32, 1546.62it/s][A
 53%|███████████

 71%|███████████████████████████████████████████████████▌                     | 75376/106692 [00:56<00:21, 1457.89it/s][A
 71%|███████████████████████████████████████████████████▋                     | 75527/106692 [00:56<00:21, 1471.90it/s][A
 71%|███████████████████████████████████████████████████▊                     | 75675/106692 [00:56<00:22, 1406.99it/s][A
 71%|███████████████████████████████████████████████████▉                     | 75825/106692 [00:56<00:21, 1429.98it/s][A
 71%|███████████████████████████████████████████████████▉                     | 75990/106692 [00:56<00:20, 1492.24it/s][A
 71%|████████████████████████████████████████████████████                     | 76140/106692 [00:56<00:20, 1482.34it/s][A
 72%|████████████████████████████████████████████████████▏                    | 76289/106692 [00:56<00:20, 1459.58it/s][A
 72%|████████████████████████████████████████████████████▎                    | 76436/106692 [00:57<00:20, 1461.22it/s][A
 72%|███████████

 90%|█████████████████████████████████████████████████████████████████▎       | 95515/106692 [01:10<00:07, 1517.33it/s][A
 90%|█████████████████████████████████████████████████████████████████▍       | 95667/106692 [01:10<00:07, 1512.58it/s][A
 90%|█████████████████████████████████████████████████████████████████▌       | 95819/106692 [01:10<00:07, 1502.10it/s][A
 90%|█████████████████████████████████████████████████████████████████▋       | 95980/106692 [01:10<00:06, 1533.20it/s][A
 90%|█████████████████████████████████████████████████████████████████▊       | 96134/106692 [01:10<00:06, 1533.69it/s][A
 90%|█████████████████████████████████████████████████████████████████▉       | 96288/106692 [01:10<00:06, 1507.36it/s][A
 90%|█████████████████████████████████████████████████████████████████▉       | 96449/106692 [01:10<00:06, 1536.11it/s][A
 91%|██████████████████████████████████████████████████████████████████       | 96603/106692 [01:10<00:06, 1524.89it/s][A
 91%|███████████

Size of test matrix: (106692, 10)





#### Train using Logistic Regression

In [None]:
clf = LogisticRegression(max_iter=2000, C=0.001)
clf.fit(X_train, y_train)

# output training loss
y_pred_train = clf.predict_proba(X_train)
y_pred_train = y_pred_train[:,1]
print(log_loss(y_train, y_pred_train))
print(accuracy_score(y_train==1, y_pred_train>0.5))


#predictions on test set
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]

### Train using Random Forest

In [None]:
clf = RandomForestClassifier(max_depth=10, random_state=0, n_jobs=-1,n_estimators=250, min_samples_leaf=19)
clf.fit(X_train, y_train)

#predictions on train set
y_pred_train = clf.predict_proba(X_train)
y_pred_train = y_pred_train[:,1]
print(log_loss(y_train, y_pred_train))

#predictions on test set
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]

In [63]:
# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("no_n2v_sub_scibert_0.0008.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row) 

## Appendix
***
#### Here are some techniques that were impemented, but not used in the final model:

#### 1. Deep Walk

In [None]:
n_dim = 64
n_walks = 5
walk_length = 10
deepwalk_model = deepwalk(G, n_walks, walk_length, n_dim) 

embeddings_deepwalk = np.zeros((n, n_dim))

for node in G.nodes():
    embeddings_deepwalk[node,:] = deepwalk_model.wv[str(node)]

#### 2. Node2vec

In [None]:
# pip install nodevectors
from nodevectors import Node2Vec

In [None]:
g2v = Node2Vec(
    n_components=32,
    walklen=40
)

g2v.fit(G)