In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as plt
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec
import pickle
import spacy
from collections import Counter
from tqdm import tqdm
import torch


from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score 
from sklearn.preprocessing import LabelBinarizer 
from sklearn.model_selection import train_test_split


from namematcher import NameMatcher

  from scipy.stats.stats import pearsonr


In [2]:
spacy_nlp = spacy.load("en_core_web_sm")

## Usefull stuff

### Paths

In [3]:
information_path = "Data/raw_data/node_information.csv"
test_set_path = "Data/raw_data/testing_set.txt"
train_set_path = "Data/raw_data/training_set.txt"
random_preds_path = "Data/raw_data/random_predictions.csv"

### Useful functions

In [5]:
def cosine(a,b):
    return(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))

## Import data

In [6]:
information_df = pd.read_csv(information_path, header=None)
information_df.columns = ["ID",'pub_year','title','authors','journal_name','abstract']
### !!!! We have to use new index starting from 0 because of the implementation of karate-club library
information_df = information_df.assign(new_ID = [i for i in range(information_df.shape[0])])
information_df.sample(3)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID
16959,9511024,1996,classical geometry and target space duality,,,sentence comment on a reference was amended th...,16959
23871,9807177,1998,nonlocal electrodynamics in 2 1 dimensions fro...,Qiong-gui Lin,Commun.Theor.Phys.,the theory of a spinor field interacting with ...,23871
11523,9302008,1993,parameter restrictions in a non-commutative ge...,"E. Alvarez, J.M. Gracia-Bondia, C.P. Martin",Phys.Lett.,survive standard quantum corrections we have i...,11523


In [7]:
train_set = pd.read_csv(train_set_path, sep =" ", header = None)
train_set.columns = ['node1','node2','label']
### !!! we will use the new indices!!! (see information_df for correspondances)
train_set = (train_set
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node1'], right_on = ['ID'])
    .drop(columns = ['node1','ID'])
    .rename(columns = {'new_ID':'node1'})
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node2'], right_on = ['ID'])
    .drop(columns = ['node2','ID'])
    .rename(columns = {'new_ID':'node2'})
)
train_set.sample(5)

Unnamed: 0,label,node1,node2
22161,1,16802,16378
292422,1,6760,450
222854,1,9913,8026
567115,1,25601,24561
196647,1,5507,4053


In [8]:
test_set = pd.read_csv(test_set_path, sep =" ", header = None)
test_set.columns = ['node1','node2']
test_set.sample(5)

Unnamed: 0,node1,node2
9771,9711205,107082
20185,9701082,9511006
24511,9703094,9201027
24628,9901150,9207025
19063,11041,201070


## Information pre_processing

### Missing values

In [11]:
information_df.isna().sum()

ID                 0
pub_year           0
title              0
authors         4033
journal_name    7472
abstract           0
new_ID             0
dtype: int64

In [12]:
information_df = information_df.fillna({'authors':'', 'journal_name':''})


### Authors

In [13]:
information_df.authors = information_df.authors.apply(lambda x:x.split(","))

### Titles

In [14]:
if os.path.isfile("Data/processed_data/information.csv"):
    information_df = pickle.load(open("Data/processed_data/information.csv",'rb'))
else:
    information_df['title_lemma'] = information_df.title.apply(lambda x: [token.lemma_ for token in spacy_nlp(x) if not token.is_punct if not token.is_digit if not token.is_stop])
    pickle.dump(information_df, open("Data/processed_data/information.csv",'wb'))

## Create graphs

### Articles based graph

In [15]:
nodes = set(np.concatenate((train_set.node1,train_set.node2), axis = 0))
edges = set(train_set.query("label == 1").apply(lambda x: (x.node1,x.node2), axis = 1))

In [16]:
G_articles = nx.Graph()
G_articles.add_nodes_from(nodes)
G_articles.add_edges_from(edges)

print("The number of nodes: {}".format(G_articles.number_of_nodes()))
print("The number of edges: {}".format(G_articles.number_of_edges()))

The number of nodes: 27770
The number of edges: 267828


### Authors co-authorship based graph

In [17]:
import string
# convert to lower case, remove punctuation, strip the names
authors_raw_set = set([auth.strip().lower().translate(str.maketrans('', '', string.punctuation)) for list_auth in information_df.authors for auth in list_auth if len(auth)>1])

Name matching: to make identify people name by different ways

In [18]:
from namematcher import NameMatcher
name_matcher = NameMatcher()

def compute_unique_names(authors_raw_set):
    """
    one author can be named differently on different papers
    this function aims at finding a 'representant' (longest name that describe an author) for each 
    author
    inputs:
        - authors_raw_set: set of previously extracted author names
    outputs:
        - dict: keys are the name in authors_raw_set and the values are the representant
    """
    representant_dict = {}
    attributed_nodes = [] # names that already have a representant
    for name in tqdm(authors_raw_set, position = 0):
        sim_list = [] # similar names 
        if name not in attributed_nodes:
            for name2 in authors_raw_set:
                try:
                    if name != name2 and name[0]==name2[0] and name2 not in attributed_nodes:
                        # two names need to start by the same letter to be consider as potential equivalents
                        score = name_matcher.match_names(name, name2)
                        if score > 0.9: # if names are close enough
                            sim_list.append(name2)
                except:
                    continue
            sim_list.append(name) # the representant is in this list
            attributed_nodes.extend(sim_list) # we have fund a representant for those names
            representant = max(sim_list, key=len) # the representant is the longest name
            for name in sim_list: # all those names have the same representant
                representant_dict[name] = representant
    return(representant_dict)

if os.path.isfile('Data/processed_data/representant_dict.pkl'):
    representant_dict = pickle.load(open('Data/processed_data/representant_dict.pkl','rb'))
else:
    representant_dict = compute_unique_names(authors_raw_set)
    pickle.dump(representant_dict, open('Data/processed_data/representant_dict.pkl','wb'))

In [19]:
# set each name to its representant value
information_df.authors = information_df.authors.apply(lambda x: [representant_dict[auth.strip().lower().translate(str.maketrans('', '', string.punctuation))] for auth in x])

In [20]:
# create a unique index for each author
representants_list = list(set(representant_dict.values()))
authors2idx = {k: v for v, k in enumerate(representants_list)}

information_df["authors_id"] = information_df.authors.apply(lambda x: [authors2idx[auth] for auth in x])

In [21]:
information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID,title_lemma,authors_id
12845,9311053,1993,anyonic construction of the sl q s 2 algebra,"[jl matheusvalle, mrmonteiro]",Mod.Phys.Lett.,after the end document on the directory you ar...,12845,"[anyonic, construction, sl, q, s, algebra]","[4477, 9493]"
10020,303266,2003,time-dependent backgrounds from supergravity w...,"[klaus behrndt, mirjam cvetivc]",,r-symmetry we obtain a general class of time-d...,10020,"[time, dependent, background, supergravity, ga...","[8749, 4752]"


In [22]:
# compute nodes and edges
pre_edges = list(information_df.authors_id.apply(lambda x : [(x[i],x[j]) for i in range(len(x)) for j in range(len(x)) if i>j]))
authors_edges = [edge for list_edge in pre_edges for edge in list_edge]
authors_edges_dict = Counter(authors_edges)

In [23]:
G_authors = nx.Graph()
G_authors.add_nodes_from(authors2idx.values())
G_authors.add_weighted_edges_from([(a,b,weight) for (a,b),weight in authors_edges_dict.items()])

print("The number of nodes: {}".format(G_authors.number_of_nodes()))
print("The number of edges: {}".format(G_authors.number_of_edges()))

The number of nodes: 14447
The number of edges: 29111


## Various embeddings computation

### Graph Based embeddings

Using articles graph

In [24]:
### Walklets
from karateclub import Walklets
if os.path.isfile('Data/processed_data/articles_walklets_embeddings.pkl'):
    walklets_articles_embeddings = pickle.load(open('Data/processed_data/articles_walklets_embeddings.pkl','rb'))
else:
    walklets = Walklets(walk_length=80) # we leave the defaults parameters for the other values
    walklets.fit(G_articles)
    walklets_articles_embeddings = walklets.get_embedding()
    pickle.dump(walklets_articles_embeddings, open('Data/processed_data/articles_walklets_embeddings.pkl','wb'))

In [25]:
### Node2Vec
from karateclub import Node2Vec
if os.path.isfile('Data/processed_data/articles_node2vec_embeddings.pkl'):
    articles_node2vec_embeddings = pickle.load(open('Data/processed_data/articles_node2vec_embeddings.pkl','rb'))
else:
    node2vec = Node2Vec(walk_length=15) # we leave the defaults parameters for the other values
    node2vec.fit(G_articles)
    articles_node2vec_embeddings = node2vec.get_embedding()
    pickle.dump(articles_node2vec_embeddings, open('Data/processed_data/articles_node2vec_embeddings.pkl','wb'))

In [26]:
### compute shortes path between nodes
shortest_path_lengths = []
for i in tqdm(range(train_set.shape[0]), position = 0):
    value = train_set.loc[i]
    try:
        path_len = nx.shortest_path_length(G_articles, source=value.node1, target=value.node2)
    except:
        path_len = np.nan
    shortest_path_lengths.append(path_len)
train_set['shortest_path'] = shortest_path_lengths
train_set = train_set.fillna({"shortest_path":train_set.shortest_path+10})

100%|██████████| 492409/492409 [02:34<00:00, 3192.48it/s]


Using author graph

In [27]:
### authors embedding leveraging author graphs (random walk for instance)

### Node2Vec
if os.path.isfile('Data/processed_data/authors_node2vec_embeddings.pkl'):
    authors_node2vec_embeddings = pickle.load(open('Data/processed_data/authors_node2vec_embeddings.pkl', 'rb'))
else:
    node2vec_authors = Node2Vec(walk_length=15)
    node2vec_authors.fit(G_authors)
    authors_node2vec_embeddings = node2vec_authors.get_embedding()
    pickle.dump(authors_node2vec_embeddings, open('Data/processed_data/authors_node2vec_embeddings.pkl', 'wb'))

In [38]:
information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID,title_lemma,authors_id
8421,210029,2003,cosmology with radion and bulk scalar field in...,"[shinpei kobayashi, kazuya koyama]",JHEP,we investigate cosmological evolutions of the ...,8421,"[cosmology, radion, bulk, scalar, field, brane...","[11142, 4747]"
19964,9701146,1997,expanding and contracting universes in third q...,"[a buonanno, m gasperini, m maggiore, c ungare...",Class.Quant.Grav.,collection of papers on the pre-big bang scena...,19964,"[expand, contracting, universe, quantize, stri...","[4822, 7060, 7149, 5218]"


In [29]:
# for each article take a mean of the authors embedding as global autors embedding
articles_authors_embedding = []
for i in range(information_df.shape[0]):
    value = information_df[information_df.new_ID == i]
    authors_id = value.authors_id
    embeddings = np.array([0 for i in range(128)]).astype('float64')
    for author in authors_id:
        embeddings+=authors_node2vec_embeddings[author][0]
    articles_authors_embedding.append(embeddings/len(authors_id))



### Node based embeddings

In [31]:
# compute abstracts embeddings using specter network

from transformers import AutoTokenizer, AutoModel

if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'gpu'

if os.path.isfile('Data/processed_data/abstracts_embeddings.pkl'):
    abstracts_embeddings = pickle.load(open('Data/processed_data/abstracts_embeddings.pkl','rb'))
else:
    
    # load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    model = AutoModel.from_pretrained('allenai/specter').to(device)
    model.eval()
    abstracts_embeddings = []
    for i in tqdm(range(information_df.shape[0]), position = 0):
        article = information_df.loc[i]
        title = article.title
        abstract = article.abstract
        paper = [{'title':title, 'abstract':abstract}]

        # concatenate title and abstract
        title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in paper]
        # preprocess the input
        inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = inputs.to(device)
        result = model(**inputs)
        # take the first token in the batch as the embedding
        embedding = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
        abstracts_embeddings.append(embedding)
    pickle.dump(abstracts_embeddings, open('Data/processed_data/abstracts_embeddings.pkl','wb'))

## Compute edges features

In [45]:
def Jaccard(graph, edge):

    inter_size = len(list(nx.common_neighbors(graph, edge[0], edge[1])))
    union_size = len(set(graph[edge[0]]) | set(graph[edge[1]]))
    jacard = inter_size / union_size

    return jacard

In [46]:
def AdamicAdar(graph, edge):

    inter_list = nx.common_neighbors(graph, edge[0], edge[1])
    adamic_adar = AdamicAdar[edge] = sum( [1/np.log(graph.degree(node)) for node in inter_list])
    
    return adamic_adar

In [47]:
def preferential_attachement(graph, edge):


    pa = graph.degree(edge[0]) * graph.degree(edge[1])
        
    return pa

## Compute features

In [None]:
def compute_non_embeddings_features(df, information_df, G_articles):
    useful_information_df = information_df[['new_ID','authors','pub_year', 'title_lemma']]

    # prepare data frame for common authors computation
    df = (df
    .merge(useful_information_df, how ='left', left_on = ['node1'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_1', 'pub_year':'pub_year1', 'title_lemma':'title_lemma1'})
    .merge(useful_information_df, how ='left', left_on = ['node2'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_2', 'pub_year':'pub_year2', 'title_lemma':'title_lemma2'})
    )

    print('computing common authors')
    #  compute common authors
    df['common_authors'] = df.apply(lambda x:len(set(x.authors_node_1)&set(x.authors_node_2)),axis = 1)

    print('computing common words')
    #  compute common words in titles
    df['common_title_words'] = df.apply(lambda x:len(set(x.title_lemma1)&set(x.title_lemma2)),axis = 1)

    print('computing delta publication year')
    # compute delta publication year
    df['delta_publication'] = df.apply(lambda x:np.abs(x.pub_year2 - x.pub_year1),axis = 1)

    # compute edges features
    print('computing jacard index')
    df['jacard'] = df.apply(lambda x: Jaccard(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing preferential attachement')
    df['pa'] = df.apply(lambda x: preferential_attachement(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing adamic_adar')
    df['adamic_adar'] = df.apply(lambda x: AdamicAdar(G_articles, (x.node1, x.node2)),axis = 1)

    return(df)


In [None]:
def compute_embedding_cosines_features(df,
                                         articles_node2vec_embeddings,
                                         walklets_articles_embeddings,
                                         articles_authors_embedding, 
                                         abstracts_embeddings):
    # compute some cosine based distances
    df['articles_node2vec_cosine'] = df.apply(lambda x:cosine(articles_node2vec_embeddings[x.node1],articles_node2vec_embeddings[x.node2]), axis = 1)
    df['articles_walklets_cosine'] = df.apply(lambda x:cosine(walklets_articles_embeddings[x.node1],walklets_articles_embeddings[x.node2]), axis = 1)
    df['authors_embeddings_cosine'] = df.apply(lambda x:cosine(articles_authors_embedding[x.node1],articles_authors_embedding[x.node2]), axis = 1)
    df['abstracts_embeddings_cosine'] = df.apply(lambda x:cosine(abstracts_embeddings[x.node1],abstracts_embeddings[x.node2]), axis = 1)

    return(df)

## Split train/validation/test sets


In [None]:
X = train_set.drop(columns = ['label'])
y = train_set[['label']]
train_samples, validation_samples, train_labels, validation_labels = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [None]:
train_set = pd.concat([train_samples, train_labels], axis = 1).reset_index(drop = True)
validation_set = pd.concat([validation_samples, validation_labels], axis = 1).reset_index(drop = True)

## Run models