In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as plt
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec
import pickle
import spacy
from collections import Counter
from tqdm import tqdm


from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score 
from sklearn.preprocessing import LabelBinarizer 
from sklearn.model_selection import train_test_split


from namematcher import NameMatcher

  from scipy.stats.stats import pearsonr


In [2]:
spacy_nlp = spacy.load("en_core_web_sm")

## Usefull stuff

### Paths

In [3]:
information_path = "Data/raw_data/node_information.csv"
test_set_path = "Data/raw_data/testing_set.txt"
train_set_path = "Data/raw_data/training_set.txt"
random_preds_path = "Data/raw_data/random_predictions.csv"

### Useful functions

In [4]:
def generate_samples(graph, train_set_ratio):
    """
    Graph pre-processing step required to perform supervised link prediction
    Create training and test sets
    """    
    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    test_pos_samples = []
      
    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    np.random.shuffle(edges)
    
    # Define number of positive test samples desired
    test_set_size = int((1.0 - train_set_ratio) * graph.number_of_edges())
    train_set_size = graph.number_of_edges() - test_set_size
    
    # Remove random edges from the graph, leaving it connected
    # Fill in the blanks
    for i,edge in enumerate(edges[:test_set_size]):
        if i%1000==0:
            print(i)
        
        # Remove the edge
        residual_g.remove_edge(edge[0], edge[1])
        
        # Add the removed edge to the positive sample list 
        test_pos_samples.append(edge)
        
        
    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())
        
        
    # --- Step 3: Generate the negative samples for testing and training sets ---
    # Fill in the blanks

    print("compute negative samples")
    train_neg_samples = []
    test_neg_samples = []

    print('train neg samples')
    i = 0
    while i < train_set_size:
        a = np.random.choice(nx.nodes(G),1)[0]
        b = np.random.choice(nx.nodes(G),1)[0]
        if (a,b) not in edges and (a,b) not in train_neg_samples:
            i+=1
            train_neg_samples.append((a,b))

    print('test neg samples')
    j = 0
    while j < test_set_size:
        a = np.random.choice(nx.nodes(G),1)[0]
        b = np.random.choice(nx.nodes(G),1)[0]
        if (a,b) not in edges and (a,b) not in test_neg_samples and (a,b) not in train_neg_samples:
            j+=1
            test_neg_samples.append((a,b))


    print("done")
    
    # --- Step 4: Combine sample lists and create corresponding labels ---
    # For training set
    print("final step")
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1 for _ in train_pos_samples] + [0 for _ in train_neg_samples]
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1 for _ in test_pos_samples] + [0 for _ in test_neg_samples]
    
    return train_samples, train_labels, test_samples, test_labels

## Import data

In [5]:
information_df = pd.read_csv(information_path, header=None)
information_df.columns = ["ID",'pub_year','title','authors','journal_name','abstract']
### !!!! We have to use new index starting from 0 because of the implementation of karate-club library
information_df = information_df.assign(new_ID = [i for i in range(information_df.shape[0])])
information_df.sample(3)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID
26953,9909192,1999,physical auxiliary field in supersymmetric qcd...,Noriaki Kitazawa,,supersymmetry breaking it is shown that the au...,26953
20372,9703176,1997,a note on the picard-fuchs equations for n 2 s...,"J. M. Isidro, A. Mukherjee, J. P. Nunes, H. J....",Int.J.Mod.Phys.,a concise presentation of the pf equations for...,20372
19520,9611172,1996,on orbifolds of 0 2 models,"Ralph Blumenhagen, Savdeep Sethi",Nucl.Phys.,we study orbifolds of 0 2 models including som...,19520


In [6]:
pre_train_set = pd.read_csv(train_set_path, sep =" ", header = None)
pre_train_set.columns = ['node1','node2','label']
### !!! we will use the new indices!!! (see information_df for correspondances)
pre_train_set = (pre_train_set
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node1'], right_on = ['ID'])
    .drop(columns = ['node1','ID'])
    .rename(columns = {'new_ID':'node1'})
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node2'], right_on = ['ID'])
    .drop(columns = ['node2','ID'])
    .rename(columns = {'new_ID':'node2'})
)
pre_train_set.sample(5)

Unnamed: 0,label,node1,node2
54501,0,18854,1877
36609,0,26761,4153
455266,1,24105,23780
105393,0,912,27189
337693,1,19786,18309


In [7]:
test_set = pd.read_csv(test_set_path, sep =" ", header = None)
test_set.columns = ['node1','node2']
test_set.sample(5)

Unnamed: 0,node1,node2
9178,212171,9910069
32472,204112,9902116
25188,9810123,9805097
25451,12265,209125
30444,303218,9601175


## Create train and validation set

In [8]:
X = pre_train_set.drop(columns = ['label'])
y = pre_train_set[['label']]
train_samples, validation_samples, train_labels, validation_labels = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [58]:
train_set = pd.concat([train_samples, train_labels], axis = 1).reset_index(drop = True)
validation_set = pd.concat([validation_samples, validation_labels], axis = 1).reset_index(drop = True)

## Information pre_processing

### Missing values

In [10]:
information_df.isna().sum()

ID                 0
pub_year           0
title              0
authors         4033
journal_name    7472
abstract           0
new_ID             0
dtype: int64

In [11]:
information_df = information_df.fillna({'authors':'', 'journal_name':''})


### Authors

In [12]:
information_df.authors = information_df.authors.apply(lambda x:x.split(","))

### Titles

In [93]:
if os.path.isfile("Data/processed_data/information.csv"):
    information_df = pickle.load(open("Data/processed_data/information.csv",'rb'))
else:
    information_df['title_lemma'] = information_df.title.apply(lambda x: [token.lemma_ for token in spacy_nlp(x) if not token.is_punct if not token.is_digit if not token.is_stop])
    pickle.dump(information_df, open("Data/processed_data/information.csv",'wb'))

## Create graphs

### Articles based graph

In [14]:
nodes = set(np.concatenate((train_set.node1,train_set.node2), axis = 0))
edges = set(train_set.query("label == 1").apply(lambda x: (x.node1,x.node2), axis = 1))

In [15]:
G_articles = nx.Graph()
G_articles.add_nodes_from(nodes)
G_articles.add_edges_from(edges)

print("The number of nodes: {}".format(G_articles.number_of_nodes()))
print("The number of edges: {}".format(G_articles.number_of_edges()))

The number of nodes: 27770
The number of edges: 267828


### Authors co-authorship based graph

In [16]:
import string
# convert to lower case, remove punctuation, strip the names
authors_raw_set = set([auth.strip().lower().translate(str.maketrans('', '', string.punctuation)) for list_auth in information_df.authors for auth in list_auth if len(auth)>1])

Name matching: to make identify people name by different ways

In [17]:
from namematcher import NameMatcher
name_matcher = NameMatcher()

def compute_unique_names(authors_raw_set):
    """
    one author can be named differently on different papers
    this function aims at finding a 'representant' (longest name that describe an author) for each 
    author
    inputs:
        - authors_raw_set: set of previously extracted author names
    outputs:
        - dict: keys are the name in authors_raw_set and the values are the representant
    """
    representant_dict = {}
    attributed_nodes = [] # names that already have a representant
    for name in tqdm(authors_raw_set, position = 0):
        sim_list = [] # similar names 
        if name not in attributed_nodes:
            for name2 in authors_raw_set:
                try:
                    if name != name2 and name[0]==name2[0] and name2 not in attributed_nodes:
                        # two names need to start by the same letter to be consider as potential equivalents
                        score = name_matcher.match_names(name, name2)
                        if score > 0.9: # if names are close enough
                            sim_list.append(name2)
                except:
                    continue
            sim_list.append(name) # the representant is in this list
            attributed_nodes.extend(sim_list) # we have fund a representant for those names
            representant = max(sim_list, key=len) # the representant is the longest name
            for name in sim_list: # all those names have the same representant
                representant_dict[name] = representant
    return(representant_dict)

if os.path.isfile('Data/processed_data/representant_dict.pkl'):
    representant_dict = pickle.load(open('Data/processed_data/representant_dict.pkl','rb'))
else:
    representant_dict = compute_unique_names(authors_raw_set)
    pickle.dump(representant_dict, open('Data/processed_data/representant_dict.pkl','wb'))

In [94]:
# set each name to its representant value
information_df.authors = information_df.authors.apply(lambda x: [representant_dict[auth.strip().lower().translate(str.maketrans('', '', string.punctuation))] for auth in x])

In [95]:
# create a unique index for each author
representants_list = list(set(representant_dict.values()))
authors2idx = {k: v for v, k in enumerate(representants_list)}

information_df["authors_id"] = information_df.authors.apply(lambda x: [authors2idx[auth] for auth in x])

In [96]:
information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID,title_lemma,authors_id
10397,9202058,1992,hamiltonian reduction and classical extended s...,"[katsushi ito, jens ole madsen]",Phys.,we present a systematic construction of classi...,10397,"[hamiltonian, reduction, classical, extend, su...","[12299, 14012]"
26823,9909060,1999,branched polymer revisited,"[hajime aoki, satoshi iso, hikaru kawai, yoshi...",Prog.Theor.Phys.,we show that correlation functions for branche...,26823,"[branch, polymer, revisit]","[2550, 4810, 12325, 13907]"


In [97]:
# compute nodes and edges
pre_edges = list(information_df.authors_id.apply(lambda x : [(x[i],x[j]) for i in range(len(x)) for j in range(len(x)) if i>j]))
authors_edges = [edge for list_edge in pre_edges for edge in list_edge]
authors_edges_dict = Counter(authors_edges)

In [99]:
G_authors = nx.Graph()
G_authors.add_nodes_from(authors2idx.values())
G_authors.add_weighted_edges_from([(a,b,weight) for (a,b),weight in authors_edges_dict.items()])

print("The number of nodes: {}".format(G_authors.number_of_nodes()))
print("The number of edges: {}".format(G_authors.number_of_edges()))

The number of nodes: 14447
The number of edges: 29111


## Features construction

### Graph Based Features

In [22]:
### Walklets
from karateclub import Walklets
if os.path.isfile('Data/processed_data/articles_walklets_embeddings.pkl'):
    walklets_articles_embeddings = pickle.load(open('Data/processed_data/articles_walklets_embeddings.pkl','rb'))
else:
    walklets = Walklets(walk_length=80) # we leave the defaults parameters for the other values
    walklets.fit(G_articles)
    walklets_articles_embeddings = walklets.get_embedding()
    pickle.dump(walklets_articles_embeddings, open('Data/processed_data/articles_walklets_embeddings.pkl','wb'))

In [23]:
### Node2Vec
from karateclub import Node2Vec
if os.path.isfile('Data/processed_data/articles_node2vec_embeddings.pkl'):
    articles_node2vec_embeddings = pickle.load(open('Data/processed_data/articles_node2vec_embeddings.pkl','rb'))
else:
    node2vec = Node2Vec(walk_length=15) # we leave the defaults parameters for the other values
    node2vec.fit(G_articles)
    articles_node2vec_embeddings = node2vec.get_embedding()
    pickle.dump(articles_node2vec_embeddings, open('Data/processed_data/articles_node2vec_embeddings.pkl','wb'))

### Node based features

In [None]:
# compute abstracts embeddings using specter network

from transformers import AutoTokenizer, AutoModel


if os.path.isfile('Data/processed_data/abstracts_embeddings.pkl'):
    abstracts_embeddings = pickle.load(open('Data/processed_data/abstracts_embeddings.pkl','rb'))
else:
    
    # load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    model = AutoModel.from_pretrained('allenai/specter')
    abstracts_embeddings = []
    for i in tqdm(range(information_df.shape[0]), position = 0):
        article = information_df.loc[i]
        title = article.title
        abstract = article.abstract
        paper = [{'title':title, 'abstract':abstract}]

        # concatenate title and abstract
        title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in paper]
        # preprocess the input
        inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
        result = model(**inputs)
        # take the first token in the batch as the embedding
        embedding = result.last_hidden_state[:, 0, :]
        abstracts_embeddings.append(embedding)
    pickle.dump(abstracts_embeddings, open('Data/processed_data/abstracts_embeddings.pkl','wb'))

In [59]:
### Common authors, delta publication year, common words in titles
useful_information_df = information_df[['new_ID','author_representant','pub_year', 'title_lemma']]

# prepare data frame for common authors computation
common_authors_df = (train_set
.merge(useful_information_df, how ='left', left_on = ['node1'], right_on = ['new_ID'])
.rename(columns = {'author_representant':'authors_node_1', 'pub_year':'pub_year1', 'title_lemma':'title_lemma1'})
.merge(useful_information_df, how ='left', left_on = ['node2'], right_on = ['new_ID'])
.rename(columns = {'author_representant':'authors_node_2', 'pub_year':'pub_year2', 'title_lemma':'title_lemma2'})
)

#  compute common authors
train_set['common_authors'] = common_authors_df.apply(lambda x:set(x.authors_node_1)&set(x.authors_node_2),axis = 1)

#  compute common words in titles
train_set['common_title_words'] = common_authors_df.apply(lambda x:set(x.title_lemma1)&set(x.title_lemma2),axis = 1)

# compute delta publication year
train_set['delta_publication'] = common_authors_df.apply(lambda x:np.abs(x.pub_year2 - x.pub_year1),axis = 1)

In [60]:
train_set

Unnamed: 0,node1,node2,label,common_authors,common_title_words,delta_publication
0,11515,21190,0,{},{},4
1,7314,16753,1,{},{},7
2,8567,2038,1,{},{brane},2
3,3241,27543,1,{},"{domain, wall}",2
4,11593,2386,0,{},{},8
...,...,...,...,...,...,...
492404,25569,24704,1,{},{},0
492405,10240,15360,0,{},{},8
492406,2274,1294,1,"{y m cho, d k park}",{qed},0
492407,5310,22541,0,{},{},3


In [36]:
common_authors_df

Unnamed: 0,node1,node2,label,new_ID_x,authors_node_1,pub_year1,title_lemma1,new_ID_y,authors_node_2,pub_year2,title_lemma2
0,11515,21190,0,11515,"[hssharatch, ra]",1993,"[bosonisation, dimension]",21190,[nikita a nekrasov],1997,"[duality, calogero, moser, sutherland, system]"
1,7314,16753,1,7314,"[r m ellem, v v bazhanov anu]",2002,"[excited, state, tba, phi, perturb, m, model]",16753,[],1995,"[generalize, kdv, quantum, inverse, scatter, d..."
2,8567,2038,1,8567,"[a delgado, g v gersdorff, m quiros]",2002,"[brane, assist, scherk, schwarz, supersymmetry...",2038,[max zucker],2000,"[supersymmetric, brane, world, scenario, shell..."
3,3241,27543,1,3241,[klaus behrndt],2001,"[domain, wall, flow, equation, supergravity]",27543,[martin gremm],1999,"[dimensional, gravity, thick, domain, wall]"
4,11593,2386,0,11593,"[kenji hamada, asato tsuchiya]",1993,"[quantum, gravity, black, hole]",2386,"[machiko hatsuda, makoto sakaguchi]",2001,"[open, superstring, theory, superalgebra, bran..."
...,...,...,...,...,...,...,...,...,...,...,...
492404,25569,24704,1,25569,[ian i kogan],1999,"[singleton, logarithmic, cft, ad, cft, corresp...",24704,[hong liu imperial college],1999,"[scatter, anti, de, sitter, space, operator, p..."
492405,10240,15360,0,10240,"[mahbub majumdar, annechristine davis]",2003,"[inflation, tachyon, condensation, large, n, e...",15360,"[andrea cappelli, carlo a trugenberger, guille...",1995,"[stable, hierarchical, quantum, hall, fluid, w..."
492406,2274,1294,1,2274,"[y m cho, d k park]",2000,"[non, perturbative, estimate, vacuum, polariza...",1294,"[y m cho, d k park]",2000,"[effective, action, convergent, series, qed]"
492407,5310,22541,0,5310,[s deger],2001,"[century, gravity]",22541,"[marco billo, frederik denef, pietro fre, igor...",1998,"[special, geometry, calabi, yau, compactificat..."


In [None]:
### Node2Vec
from karateclub import Node2Vec
if os.path.isfile('Data/processed_data/articles_node2vec_embeddings.pkl'):
    articles_node2vec_embeddings = pickle.load(open('Data/processed_data/articles_node2vec_embeddings.pkl','rb'))
else:
    node2vec = Node2Vec(walk_length=15) # we leave the defaults parameters for the other values
    node2vec.fit(G_articles)
    articles_node2vec_embeddings = node2vec.get_embedding()
    pickle.dump(articles_node2vec_embeddings, open('Data/processed_data/articles_node2vec_embeddings.pkl','wb'))

In [65]:
### authors embedding leveraging author graphs (random walk for instance)

### Node2Vec
if os.path.isfile('Data/processed_data/authors_node2vec_embeddings.pkl'):
    authors_node2vec_embeddings = pickle.load(open('Data/processed_data/authors_node2vec_embeddings.pkl', 'rb'))
else:
    node2vec_authors = Node2Vec(walk_length=15)
    node2vec_authors.fit(G_authors)
    authors_node2vec_embeddings = node2vec_authors.get_embedding()
    pickle.dump(authors_node2vec_embeddings, open('Data/processed_data/authors_node2vec_embeddings.pkl', 'wb'))


AssertionError: The node indexing is wrong.