In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as plt
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec
import pickle
import spacy
from collections import Counter
from tqdm import tqdm
import torch
import random

from sklearn.preprocessing import StandardScaler

from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe


from sklearn.metrics import roc_curve
from sklearn.decomposition import PCA

from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score 
from sklearn.preprocessing import LabelBinarizer 
from sklearn.model_selection import train_test_split
from scipy.spatial import distance

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


  from scipy.stats.stats import pearsonr


In [2]:
from link_pred.utils import cosine, euclidian, retrieve_and_pre_processed_informations, compute_unique_names
from link_pred.folds_creation import create_and_save_folds
from link_pred.create_graphs import create_articles_graph, create_co_authorship_graph, create_authors_co_citation_graph
from link_pred.node_embeddings import compute_abstracts_embeddings, compute_titles_embeddings, compute_walklets, compute_node2vec, compute_deep_walks
from link_pred.edges_features import Jaccard, AdamicAdar, preferential_attachement, are_connected, common_journal
from link_pred.models_training import get_best_xgb, get_xgb, get_best_MLP

### Paths

In [3]:
information_path = "Data/raw_data/node_information.csv"
test_set_path = "Data/raw_data/testing_set.txt"
train_set_path = "Data/raw_data/training_set.txt"
random_preds_path = "Data/raw_data/random_predictions.csv"

number_of_folds = 5

## Import and pre_process data

In [4]:
information_df = retrieve_and_pre_processed_informations(information_path)

In [5]:
# assign new id to each node (needs to start from 0)
id_old2new = {k: v for v, k in enumerate(list(information_df.ID))}
id_new2old = {v: k for v, k in id_old2new.items()}

information_df['new_ID'] = information_df.ID.apply(lambda x: id_old2new[x])

In [6]:
initial_train_set = pd.read_csv(train_set_path, sep =" ", header = None)
initial_train_set.columns = ['node1','node2','label']

## update nodes values to new indices
initial_train_set.node1 = initial_train_set.apply(lambda x:id_old2new[x.node1], axis = 1)
initial_train_set.node2 = initial_train_set.apply(lambda x:id_old2new[x.node2], axis = 1)

In [7]:
## load test set

test_set = pd.read_csv(test_set_path, sep =" ", header = None)
test_set.columns = ['node1','node2']

## update nodes values to new indices
test_set.node1 = test_set.apply(lambda x:id_old2new[x.node1], axis = 1)
test_set.node2 = test_set.apply(lambda x:id_old2new[x.node2], axis = 1)

## Create folds

In [8]:
create_and_save_folds(initial_train_set, number_of_folds = number_of_folds, validation_size = 0.05)

fold_1 created and saved !
fold_2 created and saved !
fold_3 created and saved !


### Deal with authors various names

In [None]:
import string
# convert to lower case, remove punctuation, strip the names
authors_raw_set = set([auth.strip().lower().translate(str.maketrans('', '', string.punctuation)) for list_auth in information_df.authors for auth in list_auth if len(auth)>1])

In [None]:
# several authors can be named differently (eg. Jean DUPONT, J.Dupont, etc.)
# we create a name matcher function to try to indentify each author and assign each denomination a "representant"

if os.path.isfile('Data/processed_data/representant_dict.pkl'):
    representant_dict = pickle.load(open('Data/processed_data/representant_dict.pkl','rb'))
else:
    representant_dict = compute_unique_names(authors_raw_set)
    pickle.dump(representant_dict, open('Data/processed_data/representant_dict.pkl','wb'))

In [None]:
# set each name to its representant value
information_df.authors = information_df.authors.apply(lambda x: [representant_dict[auth.strip().lower().translate(str.maketrans('', '', string.punctuation))] for auth in x])

In [None]:
# create a unique index for each author
representants_list = list(set(representant_dict.values()))
authors2idx = {k: v for v, k in enumerate(representants_list)}
information_df["authors_id"] = information_df.authors.apply(lambda x: [authors2idx[auth] for auth in x])

information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,title_lemma,new_ID,authors_id
11267,9211116,1992,cpt strings and the k bar k system,"[alan kostelecky, robertus potting]",,this talk contains a summary of our work on dy...,"[cpt, string, k, bar, k, system]",11267,"[2579, 7544]"
4198,105216,2001,d-branes and vector bundles on calabi-yau mani...,"[suresh govindarajan iitm, t jayaraman imsc]",,helix we review some recent results on d-brane...,"[d, brane, vector, bundle, calabi, yau, manifold]",4198,"[9722, 14201]"


In [None]:
G_authors_co_auth = create_co_authorship_graph(information_df, authors2idx)

## Compute node embeddings

In [None]:
# those embeddings do not depend on the fold
abstracts_embeddings = compute_abstracts_embeddings(information_df)
abstracts_embeddings = compute_titles_embeddings(information_df)


if os.path.isfile(f'Data/embeddings/walklets_co_auth_embeddings.pkl'):
    walklets_co_auth_embeddings = pickle.load(open('Data/embeddings/walklets_co_auth_embeddings.pkl','rb'))
else:
    walklets_co_auth_embeddings = compute_walklets(G_authors_co_auth)
    pickle.dump(walklets_co_auth_embeddings,open('Data/embeddings/walklets_co_auth_embeddings.pkl','wb'))

In [None]:
# those embeddings depend on the fold
for i in range(number_of_folds):
    print(f"fold: {i+1}")

    train_set = pd.read_csv(f"Data/folds/train_set_{i+1}")
    articles_graph = create_articles_graph(train_set,information_df)
    authors_citation_graph = create_authors_co_citation_graph(train_set, information_df, authors2idx)

    if os.path.isfile(f'Data/embeddings/articles_walklets_{i+1}.pkl') == False:
        walklets_articles_embeddings = compute_walklets(articles_graph)
        pickle.dump(walklets_articles_embeddings, open(f'Data/embeddings/articles_walklets_{i+1}.pkl','wb'))
    if os.path.isfile(f'Data/embeddings/articles_node2vec_{i+1}.pkl') == False:
        node2vec_articles_embeddings = compute_node2vec(articles_graph)
        pickle.dump(node2vec_articles_embeddings, open(f'Data/embeddings/articles_node2vec_{i+1}.pkl','wb'))
    if os.path.isfile(f'Data/embeddings/articles_walklets_{i+1}.pkl') == False:
        walklets_co_citation_embeddings = compute_walklets(authors_citation_graph)
        pickle.dump(walklets_co_citation_embeddings, open(f'Data/embeddings/co_citation_walklets_{i+1}.pkl','wb'))  

fold: 1
fold: 2
fold: 3
fold: 4
fold: 5


## Compute features

In [None]:
def compute_non_embeddings_features(df, information_df, G_articles):
    information_df['new_ID'] = information_df.ID.apply(lambda x:id_old2new[x])
    useful_information_df = information_df[['new_ID','authors','pub_year', 'title_lemma']]

    # prepare data frame for common authors computation
    df = (df
    .merge(useful_information_df, how ='left', left_on = ['node1'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_1', 'pub_year':'pub_year1', 'title_lemma':'title_lemma1'})
    .merge(useful_information_df, how ='left', left_on = ['node2'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_2', 'pub_year':'pub_year2', 'title_lemma':'title_lemma2'})
    )

    ### Compute page rank
    page_rank_dict = nx.pagerank(G_articles)

    ### compute degree centrality
    centrality_dict = nx.degree_centrality(G_articles)

    print("compute ressource allocation index")
    df['ressource_allocation_index'] = df.apply(lambda x: nx.resource_allocation_index(G_articles, [(x.node1, x.node2)])[0],axis = 1)

    print("common_journal")
    df['common_journals'] = df.apply(lambda x: common_journal(information_df, x.node1, x.node2),axis = 1)
    
    print('computing common authors')
    #  compute common authors
    df['common_authors'] = df.apply(lambda x:len(set(x.authors_node_1)&set(x.authors_node_2)),axis = 1)

    print('computing common words')
    #  compute common words in titles
    df['common_title_words'] = df.apply(lambda x:len(set(x.title_lemma1)&set(x.title_lemma2)),axis = 1)

    print('computing delta publication year')
    # compute delta publication year
    df['delta_publication'] = df.apply(lambda x:np.abs(x.pub_year2 - x.pub_year1),axis = 1)

    # compute edges features
    print('computing jacard index')
    df['jacard'] = df.apply(lambda x: Jaccard(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing preferential attachement')
    df['pa'] = df.apply(lambda x: preferential_attachement(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing adamic_adar')
    df['adamic_adar'] = df.apply(lambda x: AdamicAdar(G_articles, (x.node1, x.node2)),axis = 1)

    print('are connected')
    df['connection'] = df.apply(lambda x: are_connected(G_articles, (x.node1, x.node2)),axis = 1)

    print('page ranks')
    df['page_rank1'] = df.apply(lambda x: page_rank_dict[x.node1],axis = 1)
    df['page_rank2'] = df.apply(lambda x: page_rank_dict[x.node2],axis = 1)
    
    print('compute degree')

    df['degree1'] = df.apply(lambda x: centrality_dict[x.node1],axis = 1)
    df['degree2'] = df.apply(lambda x: centrality_dict[x.node2],axis = 1)

    
    df = df.fillna({ 'jacard':df.jacard.mean(),
                     'adamic_adar':df.adamic_adar.mean()
                     })

    return(df)


In [None]:
def compute_embedding_features(df,
                                information_df,
                                abstracts_embeddings,
                                walklets_articles_embeddings,
                                walklets_co_auth_embeddings,
                                walklets_co_citation_embeddings,
                                node2vec_articles_embeddings,
                                pca = False):

    # for each article take a mean of the authors embedding as global autors embedding (idem for citation)
    articles_authors_embedding = []
    articles_authors_embedding_citation = []
    for i in range(information_df.shape[0]):
        value = information_df[information_df.new_ID == i]
        authors_id = value.authors_id
        embeddings = np.array([0 for i in range(128)]).astype('float64')
        embeddings_citation = np.array([0 for i in range(128)]).astype('float64')
        for author in authors_id:
            embeddings+=walklets_co_auth_embeddings[author][0]
            embeddings_citation+=walklets_co_citation_embeddings[author][0]
        articles_authors_embedding.append(embeddings/len(authors_id))
        articles_authors_embedding_citation.append(embeddings_citation/len(authors_id))


    # compute some cosine and euclidian based distances
    df['articles_walklets_cosine'] = df.apply(lambda x:cosine(walklets_articles_embeddings[x.node1],walklets_articles_embeddings[x.node2]), axis = 1)
    df['articles_node2vec_cosine'] = df.apply(lambda x:cosine(node2vec_articles_embeddings[x.node1],node2vec_articles_embeddings[x.node2]), axis = 1)
    df['abstracts_embeddings_cosine'] = df.apply(lambda x:cosine(abstracts_embeddings[x.node1][0],abstracts_embeddings[x.node2][0]), axis = 1)
    
    df['articles_walklets_euclidian'] = df.apply(lambda x:euclidian(walklets_articles_embeddings[x.node1],walklets_articles_embeddings[x.node2]), axis = 1)
    df['articles_node2vec_euclidian'] = df.apply(lambda x:euclidian(node2vec_articles_embeddings[x.node1],node2vec_articles_embeddings[x.node2]), axis = 1)
    df['abstracts_embeddings_euclidian'] = df.apply(lambda x:euclidian(abstracts_embeddings[x.node1][0],abstracts_embeddings[x.node2][0]), axis = 1)
    
    # compute some cosine and euclidian based distances for authors
    df['co_authorship_embeddings_cosine'] = df.apply(lambda x:cosine(articles_authors_embedding[x.node1],articles_authors_embedding[x.node2]), axis = 1)
    df['authors_embeddings_cosine_citation'] = df.apply(lambda x:cosine(articles_authors_embedding_citation[x.node1],articles_authors_embedding_citation[x.node2]), axis = 1)    
    df['co_authorship_embeddings_euclidian'] = df.apply(lambda x:euclidian(articles_authors_embedding[x.node1],articles_authors_embedding[x.node2]), axis = 1)
    df['authors_embeddings_euclidian_citation'] = df.apply(lambda x:euclidian(articles_authors_embedding_citation[x.node1],articles_authors_embedding_citation[x.node2]), axis = 1)

    # node1 and node2 article embedding
    print("add articles walklets embeddings")
    # only append vectors of size 10 that represent articles embeddings (quicker to compute)
    if pca:
        pca_walklets= PCA(n_components = 5)
        walklets_articles_embeddings = pca_walklets.fit_transform(walklets_articles_embeddings)
        pca_node2vec =  PCA(n_components = 5)
        node2vec_articles_embeddings = pca_node2vec.fit_transform(node2vec_articles_embeddings)


    walklets_node_embeddings_df = pd.DataFrame(walklets_articles_embeddings, columns = [f'emb_walklets_{i}' for i in range(len(walklets_articles_embeddings[0]))])
    walklets_node_embeddings_df = walklets_node_embeddings_df.reset_index().rename(columns = {'index':'node'})
    df = (df
        .merge(walklets_node_embeddings_df, how ='left', left_on = ['node1'], right_on = ['node'])
        .drop(columns = ['node'])
        .merge(walklets_node_embeddings_df, how ='left', left_on = ['node2'], right_on = ['node'])
        .drop(columns = ['node'])
    )

    print("add articles node2vecs embeddings")
    # only append vectors of size 10 that represent articles embeddings (quicker to compute)

    node_node2vec_embeddings_df = pd.DataFrame(node2vec_articles_embeddings, columns = [f'emb_node2vec{i}' for i in range(len(node2vec_articles_embeddings[0]))])
    node_node2vec_embeddings_df = node_node2vec_embeddings_df.reset_index().rename(columns = {'index':'node'})
    df = (df
        .merge(node_node2vec_embeddings_df, how ='left', left_on = ['node1'], right_on = ['node'])
        .drop(columns = ['node'])
        .merge(node_node2vec_embeddings_df, how ='left', left_on = ['node2'], right_on = ['node'])
        .drop(columns = ['node'])
    )
    return(df)

##  Compute features for all folds

In [None]:
for i in range(number_of_folds):
    print(f"fold: {i+1}")
    if os.path.isfile(f"Data/processed_data/train_set_features{i+1}.csv") and os.path.isfile(f"Data/processed_data/val_set_features{i+1}.csv") and os.path.isfile(f"Data/processed_data/test_set_features{i+1}.csv"):
        continue
    else:
        # load sets
        train_set = pd.read_csv(f"Data/folds/train_set_{i+1}")
        validation_set = pd.read_csv(f"Data/folds/validation_set_{i+1}")

        # compute graphs
        G_articles = create_articles_graph(train_set,information_df)

        # load embeddings
        walklets_articles_embeddings = pickle.load(open(f'Data/embeddings/articles_walklets_{i+1}.pkl','rb'))
        walklets_co_citation_embeddings = pickle.load(open(f'Data/embeddings/co_citation_walklets_{i+1}.pkl','rb'))
        node2vec_articles_embeddings = pickle.load(open(f'Data/embeddings/articles_node2vec_{i+1}.pkl','rb'))

        # compute features for train
        print("compute train features")
        train_set_with_features = compute_non_embeddings_features(train_set, information_df, G_articles)
        train_set_with_features = compute_embedding_features(train_set_with_features, information_df, abstracts_embeddings,
                                                                walklets_articles_embeddings,
                                                                walklets_co_auth_embeddings,
                                                                walklets_co_citation_embeddings,
                                                                node2vec_articles_embeddings)
        # compute features for val
        print("compute validation features")
        val_set_with_features = compute_non_embeddings_features(validation_set, information_df, G_articles)
        val_set_with_features = compute_embedding_features(val_set_with_features, information_df, abstracts_embeddings,
                                                                walklets_articles_embeddings,
                                                                walklets_co_auth_embeddings,
                                                                walklets_co_citation_embeddings,
                                                                node2vec_articles_embeddings)

        # compute features for test
        print("compute test features")
        test_set_with_features = compute_non_embeddings_features(test_set, information_df, G_articles)
        test_set_with_features = compute_embedding_features(test_set_with_features, information_df, abstracts_embeddings,
                                                                walklets_articles_embeddings,
                                                                walklets_co_auth_embeddings,
                                                                walklets_co_citation_embeddings,
                                                                node2vec_articles_embeddings)

        train_set_with_features.to_csv(f"Data/processed_data/train_set_features{i+1}.csv", index = False)
        val_set_with_features.to_csv(f"Data/processed_data/val_set_features{i+1}.csv", index = False)
        test_set_with_features.to_csv(f"Data/processed_data/test_set_features{i+1}.csv", index = False)
    

fold: 1
fold: 2
fold: 3
fold: 4
fold: 5


In [None]:
## compute columns of interest
all_columns = set(pd.read_csv("Data/processed_data/train_set_features1.csv").columns)

to_remove = set(['node1', 'node2', 'label', 'new_ID_x', 'authors_node_1',
       'title_lemma1', 'new_ID_y', 'authors_node_2', 'title_lemma2'])
columns_to_keep= list(all_columns-to_remove)

## Train classifiers

In [None]:
for i in range(number_of_folds):
    print(f'fold: {i+1}')
    if os.path.isfile(f"Data/models/scaler_{i+1}.pkl"):
        continue
    else:
        train_set_with_features = pd.read_csv(f"Data/processed_data/train_set_features{i+1}.csv")
        validation_set_with_features = pd.read_csv(f"Data/processed_data/val_set_features{i+1}.csv")

        # only keep columns of interest
        train_set_with_features = train_set_with_features[columns_to_keep+['label']]
        validation_set_with_features = validation_set_with_features[columns_to_keep+['label']]

        train_samples, train_labels = train_set_with_features.drop(columns = ['label']), train_set_with_features[['label']]
        validation_samples, validation_labels = validation_set_with_features.drop(columns = ['label']), validation_set_with_features[['label']]
        

        # scale data
        scaler = StandardScaler()
        train_samples_scaled = scaler.fit_transform(np.float32(train_samples))
        validation_samples_scaled = scaler.fit_transform(np.float32(validation_samples))


        # train classifier (grid search best params)
        clf_xgb, thresh_xgb = get_best_xgb(train_samples_scaled, list(train_labels.label), validation_samples_scaled, list(validation_labels.label),gpu = True, verbose =1)
        clf_mlp, thresh_mlp = get_best_MLP(train_samples_scaled, list(train_labels.label), validation_samples_scaled, list(validation_labels.label))
     
    
        pickle.dump((clf_xgb,thresh_xgb), open(f"Data/models/clf_xgb_{i+1}.pkl", 'wb'))
        pickle.dump((clf_mlp,thresh_mlp), open(f"Data/models/clf_mlp_{i+1}.pkl", 'wb'))
        pickle.dump(scaler, open(f"Data/models/scaler_{i+1}.pkl", 'wb'))

fold: 1
fold: 2
fold: 3




[0]	validation_0-aucpr:0.99004
[1]	validation_0-aucpr:0.99064
[2]	validation_0-aucpr:0.99125
[3]	validation_0-aucpr:0.99198
[4]	validation_0-aucpr:0.99202
[5]	validation_0-aucpr:0.99207
[6]	validation_0-aucpr:0.99216
[7]	validation_0-aucpr:0.99218
[8]	validation_0-aucpr:0.99233
[9]	validation_0-aucpr:0.99242
[10]	validation_0-aucpr:0.99246
[11]	validation_0-aucpr:0.99277
[12]	validation_0-aucpr:0.99299
[13]	validation_0-aucpr:0.99288
[14]	validation_0-aucpr:0.99289
xgb
accuracy: 0.963931762794476
f1: 0.9667166416791604
xgb simple
accuracy: 0.9603899268887084
f1: 0.9637018729714439
accuracy: 0.9648090982940699
f1: 0.9676010410745803


TypeError: get_neural_net() got an unexpected keyword argument 'gpu'

## Predict on test set

In [None]:
preds = np.array([])
for i in range(number_of_folds):
    
    test_set_with_features = pd.read_csv(f"Data/processed_data/test_set_features{i+1}.csv")

    # scale data
    scaler = pickle.load(open(f"Data/models/scaler_{i+1}.pkl", 'rb'))

    # only keep columns of interest
    test_set_with_features = test_set_with_features[columns_to_keep]

    test_set_with_features = scaler.transform(np.float32(test_set_with_features))


    # train classifier (grid search best params)
    clf, thresh = pickle.load(open(f"Data/models/clf_{i+1}.pkl", 'rb'))


    if preds.shape[0]==0:
        preds = np.array(np.int32(clf.predict_proba(test_set_with_features)[:,1] >= thresh))
    else:
        preds += np.array(np.int32(clf.predict_proba(test_set_with_features)[:,1] >= thresh))

In [None]:
# voting 
test_set['category'] = preds

test_set = (test_set
.reset_index()
.rename(columns = {'index':'id'})
.drop(columns = ['node1','node2'])
)

test_set.to_csv('final_predictions_no_emb.csv', index=False)