In [19]:
%matplotlib inline

# Standard library
import os
import pickle
from collections import Counter

# Third-party
# `plt` must be the pyplot interface; `import matplotlib as plt` bound the
# top-level package, which has no plotting functions such as `plt.plot`.
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from namematcher import NameMatcher
from node2vec import Node2Vec
# `scipy.stats.stats` is a deprecated private namespace; `pearsonr` is
# exposed publicly in `scipy.stats`.
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

  from scipy.stats.stats import pearsonr


In [20]:
# Load the spaCy pipeline "en_core_web_sm" once; it is reused below to lemmatize titles.
spacy_nlp = spacy.load("en_core_web_sm")

## Useful stuff

### Paths

In [21]:
# Locations of the raw input files, relative to the notebook's working directory.
# Per-paper metadata, read with pd.read_csv in the "Import data" section.
information_path = "Data/raw_data/node_information.csv"
# Space-separated node pairs without labels.
test_set_path = "Data/raw_data/testing_set.txt"
# Space-separated node pairs with a 0/1 label.
train_set_path = "Data/raw_data/training_set.txt"
# NOTE(review): not read anywhere in the visible part of the notebook.
random_preds_path = "Data/raw_data/random_predictions.csv"

### Useful functions

In [22]:
def generate_samples(graph, train_set_ratio):
    """
    Graph pre-processing step required to perform supervised link prediction.

    Randomly holds out a fraction of the edges as positive test samples, keeps
    the remaining edges as positive training samples, and draws as many
    negative samples (pairs of distinct nodes that are not linked in ``graph``)
    as there are positive samples in each set.

    inputs:
        - graph: networkx-style graph; must provide copy, nodes, edges,
          number_of_edges, remove_edge and is_directed
        - train_set_ratio: fraction of the edges kept for training
    outputs:
        - train_samples, train_labels, test_samples, test_labels: lists of
          node pairs and their 0/1 labels (positives first, then negatives)
    raises:
        - ValueError when the graph does not contain enough unlinked node
          pairs to draw the requested number of negative samples
    """
    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()

    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    np.random.shuffle(edges)

    # Define number of positive test samples desired
    n_edges = graph.number_of_edges()
    test_set_size = int((1.0 - train_set_ratio) * n_edges)
    train_set_size = n_edges - test_set_size

    # Remove random edges from the residual graph.
    # NOTE: connectivity of the residual graph is NOT enforced here.
    test_pos_samples = edges[:test_set_size]
    for edge in test_pos_samples:
        residual_g.remove_edge(edge[0], edge[1])

    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())

    # --- Step 3: Generate the negative samples for testing and training sets ---
    print("compute negative samples")
    # Sample from the graph passed in (the previous version read a global `G`).
    nodes = list(graph.nodes())
    n_nodes = len(nodes)
    directed = graph.is_directed()

    def pair_key(a, b):
        # In an undirected graph (a, b) and (b, a) denote the same pair.
        return (a, b) if directed else frozenset((a, b))

    # Hash-based lookup of the existing links (self-loops can never be drawn).
    linked = {pair_key(e[0], e[1]) for e in edges if e[0] != e[1]}

    # Rejection sampling would never terminate if there are too few non-edges.
    n_candidates = n_nodes * (n_nodes - 1)
    if not directed:
        n_candidates //= 2
    n_needed = train_set_size + test_set_size
    if n_needed > n_candidates - len(linked):
        raise ValueError(
            "not enough unlinked node pairs ({}) to draw {} negative samples".format(
                n_candidates - len(linked), n_needed
            )
        )

    already_drawn = set()  # shared so train and test negatives never overlap

    def draw_negatives(count):
        drawn = []
        while len(drawn) < count:
            a = nodes[np.random.randint(n_nodes)]
            b = nodes[np.random.randint(n_nodes)]
            if a == b:
                continue  # a node paired with itself is not a meaningful negative
            key = pair_key(a, b)
            if key in linked or key in already_drawn:
                continue
            already_drawn.add(key)
            drawn.append((a, b))
        return drawn

    print('train neg samples')
    train_neg_samples = draw_negatives(train_set_size)

    print('test neg samples')
    test_neg_samples = draw_negatives(test_set_size)

    print("done")

    # --- Step 4: Combine sample lists and create corresponding labels ---
    print("final step")
    # For training set
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1] * len(train_pos_samples) + [0] * len(train_neg_samples)
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1] * len(test_pos_samples) + [0] * len(test_neg_samples)

    return train_samples, train_labels, test_samples, test_labels

## Import data

In [23]:
# The raw file has no header row, so the column names are supplied at read time.
information_df = pd.read_csv(
    information_path,
    header=None,
    names=["ID", 'pub_year', 'title', 'authors', 'journal_name', 'abstract'],
)
information_df.sample(3)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract
5501,110225,2001,brane gases on k3 and calabi-yau manifolds,Damien A. Easson,,presentation improved we initiate the study of...
21696,9709206,1997,d-brane decay and hawking radiation,,Nucl.Phys.Proc.Suppl.,at amsterdam june 1997 references added tree l...
8738,211044,2002,thick domain walls and charged dilaton black h...,"R. Moderski, M. Rogatko",Phys.Rev.,we study a black hole domain wall system in di...


In [24]:
# Space-separated, header-less file: two node ids followed by the 0/1 label.
pre_train_set = pd.read_csv(
    train_set_path,
    sep=" ",
    header=None,
    names=['node1', 'node2', 'label'],
)
pre_train_set.sample(5)

Unnamed: 0,node1,node2,label
329743,9703080,9209063,0
137352,2156,207119,0
457904,9411199,9803192,0
136809,204058,110108,1
416567,304222,9907027,1


In [25]:
# Space-separated, header-less file: two node ids per row, no label.
test_set = pd.read_csv(
    test_set_path,
    sep=" ",
    header=None,
    names=['node1', 'node2'],
)
test_set.sample(5)

Unnamed: 0,node1,node2
14955,9606193,9603090
21729,9610084,9503232
5551,9904095,9710230
4419,205273,110140
28753,9911220,9306096


## Create train and validation set

In [26]:
# Features are the two node ids; the target is the 0/1 label.
X = pre_train_set.drop(columns = ['label'])
y = pre_train_set[['label']]
# Hold out 20% of the labelled pairs for validation, stratified on the label;
# random_state=0 makes the split reproducible.
train_samples, validation_samples, train_labels, validation_labels = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [27]:
# Put node pairs and labels back side by side (column-wise concat aligns on the row index).
train_set = pd.concat([train_samples, train_labels], axis = 1)
validation_set = pd.concat([validation_samples, validation_labels], axis = 1)

## Information preprocessing

### Missing values

In [28]:
# Count missing values per column.
information_df.isna().sum()

ID                 0
pub_year           0
title              0
authors         4033
journal_name    7472
abstract           0
dtype: int64

In [29]:
# Replace missing authors / journal names with empty strings so the string
# operations applied to these columns below do not hit NaN.
information_df = information_df.fillna({'authors':'', 'journal_name':''})


### Authors

In [30]:
def _split_authors(raw_authors):
    """Split a comma-separated author string into a list of raw names.

    An empty string (paper without author information) yields an empty list
    rather than `['']`, so no phantom empty author is created. Values that are
    already lists are returned unchanged, which makes re-running this cell safe.
    """
    if not isinstance(raw_authors, str):
        return raw_authors
    return raw_authors.split(",") if raw_authors else []

information_df.authors = information_df.authors.apply(_split_authors)

### Titles

In [None]:
def _lemmatize_title(title):
    """Return the lemmas of `title`, skipping punctuation, digit and stop-word tokens."""
    lemmas = []
    for token in spacy_nlp(title):
        if token.is_punct or token.is_digit or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

information_df['title_lemma'] = information_df.title.apply(_lemmatize_title)

### Save information df

In [None]:
# NOTE: despite the ".csv" extension this file is a pickle, not a CSV.
# The context manager guarantees the file is flushed and closed after writing.
with open("Data/processed_data/information.csv", 'wb') as information_file:
    pickle.dump(information_df, information_file)

In [32]:
# NOTE: despite the ".csv" extension this file is read as a pickle.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open("Data/processed_data/information.csv", 'rb') as information_file:
    information_df = pickle.load(information_file)

## Create graphs

### Articles based graph

In [33]:
# Every id that appears in either column of the training pairs becomes a node.
nodes = set(np.concatenate((train_set.node1,train_set.node2), axis = 0))
# Only the pairs labelled 1 become edges. Zipping the two columns builds the
# same (node1, node2) pairs as a row-wise `apply`, without one Python call per row.
positive_pairs = train_set.query("label == 1")
edges = set(zip(positive_pairs.node1, positive_pairs.node2))

In [34]:
# Undirected graph over the articles: nodes are added first so that ids
# without any positive edge are still present in the graph.
G_articles = nx.Graph()
G_articles.add_nodes_from(nodes)
G_articles.add_edges_from(edges)

# Quick size check of the resulting graph.
print("The number of nodes: {}".format(G_articles.number_of_nodes()))
print("The number of edges: {}".format(G_articles.number_of_edges()))

The number of nodes: 27770
The number of edges: 267828


### Authors co-authorship based graph

In [4]:
import string

# convert to lower case, remove punctuation, strip the names
# (the translation table is built once instead of once per name)
_punctuation_table = str.maketrans('', '', string.punctuation)
authors_raw_set = {
    auth.strip().lower().translate(_punctuation_table)
    for list_auth in information_df.authors
    for auth in list_auth
    if len(auth) > 1  # raw fragments of at most one character are ignored
}

Name matching: identify authors whose names are written in different ways across papers

In [9]:
# NameMatcher is also imported in the first cell; the import is repeated here.
from namematcher import NameMatcher
# Shared matcher instance used by compute_unique_names below.
name_matcher = NameMatcher()

def compute_unique_names(authors_raw_set, matcher=None, show_progress=True, threshold=0.9):
    """
    one author can be named differently on different papers
    this function aims at finding a 'representant' (longest name that describe an author) for each
    author
    inputs:
        - authors_raw_set: collection of previously extracted author names
        - matcher: object exposing match_names(name_a, name_b) -> similarity score;
          defaults to the module-level `name_matcher`
        - show_progress: wrap the main loop in a tqdm progress bar
        - threshold: two names are considered the same author when their score is above it
    outputs:
        - dict: keys are the name in authors_raw_set and the values are the representant
    """
    if matcher is None:
        matcher = name_matcher

    names = list(authors_raw_set)

    # Two names must start with the same letter to be potential equivalents, so
    # group them by first character once instead of scanning every name each time.
    # Iteration order inside each group follows the input order.
    names_by_initial = {}
    for candidate in names:
        if candidate:
            names_by_initial.setdefault(candidate[0], []).append(candidate)

    representant_dict = {}
    attributed_names = set()  # names that already have a representant (set: O(1) lookups)

    iterator = tqdm(names, position=0) if show_progress else names
    for name in iterator:
        if name in attributed_names:
            continue

        sim_list = []  # similar names
        # An empty name has no first letter and therefore no candidates.
        for other in names_by_initial.get(name[:1], ()):
            if other == name or other in attributed_names:
                continue
            try:
                is_same_author = matcher.match_names(name, other) > threshold
            except Exception:
                # Best effort: a pair the matcher cannot score is treated as a non-match.
                # `Exception` (not a bare except) keeps Ctrl-C able to stop this long loop.
                continue
            if is_same_author:
                sim_list.append(other)

        sim_list.append(name)  # the representant is in this list
        attributed_names.update(sim_list)  # we have found a representant for those names
        representant = max(sim_list, key=len)  # the representant is the longest name
        for alias in sim_list:  # all those names have the same representant
            representant_dict[alias] = representant
    return representant_dict

# The name matching above is expensive, so its result is cached on disk.
representant_dict_path = 'Data/processed_data/representant_dict.pkl'
if os.path.isfile(representant_dict_path):
    # Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
    with open(representant_dict_path, 'rb') as cache_file:
        representant_dict = pickle.load(cache_file)
else:
    representant_dict = compute_unique_names(authors_raw_set)
    with open(representant_dict_path, 'wb') as cache_file:
        pickle.dump(representant_dict, cache_file)

100%|██████████| 16049/16049 [50:28<00:00,  5.30it/s] 


In [12]:
# set each name to its representant value
information_df['author_representant'] = information_df.authors.apply(lambda x: [representant_dict[auth.strip().lower().translate(str.maketrans('', '', string.punctuation))] for auth in x])

In [13]:
# Display the frame to inspect the newly added author_representant column.
information_df

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,title_lemma,author_representant
0,1001,2000,compactification geometry and duality,[Paul S. Aspinwall],,these are notes based on lectures given at tas...,"[compactification, geometry, duality]",[paul s aspinwall]
1,1002,2000,domain walls and massive gauged supergravity p...,"[M. Cvetic, H. Lu, C.N. Pope]",Class.Quant.Grav.,we point out that massive gauged supergravity ...,"[domain, wall, massive, gauge, supergravity, p...","[m cvetivc, h lu, cn pope]"
2,1003,2000,comment on metric fluctuations in brane worlds,"[Y.S. Myung, Gungwon Kang]",,recently ivanov and volovich hep-th 9912242 cl...,"[comment, metric, fluctuation, brane, world]","[ys myung, gungwon kang]"
3,1004,2000,moving mirrors and thermodynamic paradoxes,[Adam D. Helfer],Phys.Rev.,quantum fields responding to moving mirrors ha...,"[move, mirror, thermodynamic, paradox]",[adam d helfer]
4,1005,2000,bundles of chiral blocks and boundary conditio...,"[J. Fuchs, C. Schweigert]",,proceedings of lie iii clausthal july 1999 var...,"[bundle, chiral, block, boundary, condition, cft]","[j fuchs, c schweigert]"
...,...,...,...,...,...,...,...,...
27765,9912289,2002,gauge fixing in the chain by chain method,"[A Shirzad, F Loran]",,in a recent work we showed that for a hamilton...,"[gauge, fixing, chain, chain, method]","[a shirzad, f loran]"
27766,9912290,2000,shuffling quantum field theory,[Dirk Kreimer],Lett.Math.Phys.,we discuss shuffle identities between feynman ...,"[shuffle, quantum, field, theory]",[dirk kreimer]
27767,9912291,1999,small object limit of casimir effect and the s...,"[O. Kenneth, S. Nussinov]",Phys.Rev.,we show a simple way of deriving the casimir p...,"[small, object, limit, casimir, effect, sign, ...","[o kenneth, s nussinov]"
27768,9912292,1999,1 4 pbgs and superparticle actions,"[F.Delduc, E. Ivanov, S. Krivonos]",,karpacz poland september 21-25 1999 we constru...,"[pbgs, superparticle, action]","[fdelduc, e a ivanov, s krivonos]"


In [14]:
# compute nodes and edges
pre_edges = list(information_df.author_representant.apply(lambda x : [(x[i],x[j]) for i in range(len(x)) for j in range(len(x)) if i>j]))
authors_edges = [edge for list_edge in pre_edges for edge in list_edge]
authors_edges_dict = Counter(authors_edges)

In [15]:
# Undirected co-authorship graph: one node per representant name, and one
# weighted edge per author pair, the weight being its count in authors_edges_dict.
G_authors = nx.Graph()
G_authors.add_nodes_from(set(representant_dict.values()))
G_authors.add_weighted_edges_from([(a,b,weight) for (a,b),weight in authors_edges_dict.items()])

# Quick size check of the resulting graph.
print("The number of nodes: {}".format(G_authors.number_of_nodes()))
print("The number of edges: {}".format(G_authors.number_of_edges()))

The number of nodes: 14447
The number of edges: 29111


## Features construction

### Graph Based Features

In [None]:
### Walklets
from karateclub import Walklets

walklets_embeddings_path = 'Data/processed_data/articles_walklets_embeddings.pkl'

# Row i of the embedding matrix corresponds to walklets_node_ids[i].
walklets_node_ids = sorted(G_articles.nodes())

# os.path.isfile takes a single path argument (the former extra 'wb' raised TypeError).
if os.path.isfile(walklets_embeddings_path):
    # Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
    with open(walklets_embeddings_path, 'rb') as embeddings_file:
        walklets_embeddings = pickle.load(embeddings_file)
else:
    # NOTE(review): karateclub is expected to require nodes labelled 0..n-1 - confirm
    # against its documentation. Relabelling a copy is harmless either way.
    node_index = {node_id: index for index, node_id in enumerate(walklets_node_ids)}
    G_articles_indexed = nx.relabel_nodes(G_articles, node_index)

    walklets = Walklets(walk_length=80) # we leave the defaults parameters for the other values
    walklets.fit(G_articles_indexed)
    walklets_embeddings = walklets.get_embedding()
    with open(walklets_embeddings_path, 'wb') as embeddings_file:
        pickle.dump(walklets_embeddings, embeddings_file)

In [35]:
### Node2Vec
# Pre-computes transition probabilities and generates the random walks on the
# article graph (the recorded run below took roughly 13 minutes with workers=1).
node2vec = Node2Vec(G_articles, dimensions=32, walk_length=8, num_walks=200, workers=1, p=1, q=1)
# Embed nodes
# NOTE(review): `model` is a very generic name for the fitted article embedding model.
model = node2vec.fit(window=5, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 27770/27770 [03:12<00:00, 143.96it/s]
Generating walks (CPU: 1): 100%|██████████| 200/200 [09:47<00:00,  2.94s/it]


In [None]:
# Persist the article embeddings; the context manager guarantees the file is
# flushed and closed after writing.
with open('Data/processed_data/articles_node2vec_embeddings.pkl', 'wb') as embeddings_file:
    pickle.dump(model.wv, embeddings_file)

In [None]:
# Retrieve node embeddings and corresponding subjects
keyed_vectors = model.wv
# gensim 4 renamed `index2word` to `index_to_key`; support both versions.
if hasattr(keyed_vectors, "index_to_key"):
    node_ids = keyed_vectors.index_to_key  # list of node IDs
else:
    node_ids = keyed_vectors.index2word  # list of node IDs (gensim < 4)
# numpy.ndarray of size number of nodes times embeddings dimensionality
node_embeddings = keyed_vectors.vectors
# NOTE(review): `node_subjects` is not defined anywhere in the visible notebook, so
# this line raises NameError on a fresh kernel - define it or remove this line.
node_targets = node_subjects[[int(node_id) for node_id in node_ids]]

### Node based features

In [None]:
# compute abstracts embeddings using specter network

import torch
from transformers import AutoTokenizer, AutoModel

abstracts_embeddings_path = 'Data/processed_data/abstracts_embeddings.pkl'
specter_batch_size = 32  # papers encoded per forward pass; lower it if memory is short

if os.path.isfile(abstracts_embeddings_path):
    # Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
    with open(abstracts_embeddings_path, 'rb') as embeddings_file:
        abstracts_embeddings = pickle.load(embeddings_file)
else:
    # load model and tokenizer
    # (named `specter_model` so the node2vec `model` defined earlier is not overwritten)
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    specter_model = AutoModel.from_pretrained('allenai/specter')
    specter_model.eval()

    # concatenate title and abstract
    title_abs = [
        title + tokenizer.sep_token + (abstract or '')
        for title, abstract in zip(information_df.title, information_df.abstract)
    ]

    # Encode in small batches and without gradient tracking: one forward pass
    # over the whole corpus would need memory proportional to every paper at once.
    batch_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(title_abs), specter_batch_size), position=0):
            # preprocess the input
            inputs = tokenizer(
                title_abs[start:start + specter_batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            result = specter_model(**inputs)
            # take the first token of each sequence as that paper's embedding
            batch_embeddings.append(result.last_hidden_state[:, 0, :])

    # One row per paper, in the row order of information_df.
    abstracts_embeddings = torch.cat(batch_embeddings, dim=0)
    with open(abstracts_embeddings_path, 'wb') as embeddings_file:
        pickle.dump(abstracts_embeddings, embeddings_file)

In [None]:
### Common authors
authors_information_df = information_df[['ID','author_representant']]

# prepare data frame for common authors computation
common_authors_df = (train_set
.merge(authors_information_df, how ='left', left_on = ['node1'], right_on = ['ID'])
.rename(columns = {'author_representant':'authors_node_1'})
.merge(authors_information_df, how ='left', left_on = ['node2'], right_on = ['ID'])
.rename(columns = {'author_representant':'authors_node_2'})
)

# The left merges keep one row per training pair, in train_set's row order, but
# with a fresh 0..n-1 index. train_set still carries the shuffled index from
# train_test_split, so the result must be attached by position, not by label.
assert len(common_authors_df) == len(train_set), "merge changed the number of rows (duplicate IDs?)"

#  compute common authors
train_set['common_authors'] = pd.Series(
    [
        set(authors_1) & set(authors_2)
        for authors_1, authors_2 in zip(common_authors_df.authors_node_1, common_authors_df.authors_node_2)
    ],
    index=train_set.index,
    dtype=object,
)

In [None]:
### authors embedding leveraging author graphs (random walk for instance)

### Node2Vec
# The author embeddings get their own cache file and variable. This cell used to
# reuse the *articles* path, so it loaded the article embeddings whenever that
# file existed and never computed the author ones.
authors_node2vec_path = 'Data/processed_data/authors_node2vec_embeddings.pkl'
if os.path.isfile(authors_node2vec_path):
    # Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
    with open(authors_node2vec_path, 'rb') as embeddings_file:
        authors_node2vec_embeddings = pickle.load(embeddings_file)
else:
    node2vec_authors = Node2Vec(G_authors, dimensions=32, walk_length=8, num_walks=200, workers=1, p=1, q=1)
    model_authors = node2vec_authors.fit(window=5, min_count=1, batch_words=4)
    authors_node2vec_embeddings = model_authors.wv
    with open(authors_node2vec_path, 'wb') as embeddings_file:
        pickle.dump(authors_node2vec_embeddings, embeddings_file)


In [None]:
### Delta publication year

In [None]:
### titles common words

In [None]:
### 