In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as plt
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec
import pickle
import spacy
from collections import Counter
from tqdm import tqdm
import torch
import random

from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe



from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score 
from sklearn.preprocessing import LabelBinarizer 
from sklearn.model_selection import train_test_split


from namematcher import NameMatcher

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


  from scipy.stats.stats import pearsonr


In [2]:
spacy_nlp = spacy.load("en_core_web_sm")

## Usefull stuff

### Paths

In [3]:
information_path = "Data/raw_data/node_information.csv"
test_set_path = "Data/raw_data/testing_set.txt"
train_set_path = "Data/raw_data/training_set.txt"
random_preds_path = "Data/raw_data/random_predictions.csv"

In [4]:
def split_embedding_subgraph(graph,embedding_ratio = 0.3):
    
    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    embedding_samples = []
      
    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    np.random.shuffle(edges)
    
    # Define number of positive test samples desired
    embedding_set_size = int(embedding_ratio * graph.number_of_edges())
    num_of_embedding_samples = 0
    
    # Remove random edges from the graph, leaving it connected
    # Fill in the blanks
    for edge in edges:
        if graph.degree[edge[0]]<=1 or graph.degree[edge[1]]<=1:
            continue
        else:
            # Remove the edge
            residual_g.remove_edge(edge[0], edge[1])
            num_of_embedding_samples += 1
            embedding_samples.append(edge)
        
        # If we have collected enough number of edges for testing set, we can terminate the loop
        if num_of_embedding_samples == embedding_set_size:
            break

    train_samples = list(residual_g.edges())
            
    
    return embedding_samples,train_samples


### Useful functions

In [5]:
def cosine(a,b):
    return(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))

## Import data

In [6]:
information_df = pd.read_csv(information_path, header=None)
information_df.columns = ["ID",'pub_year','title','authors','journal_name','abstract']
### !!!! We have to use new index starting from 0 because of the implementation of karate-club library
information_df = information_df.assign(new_ID = [i for i in range(information_df.shape[0])])
information_df.sample(3)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID
1044,5090,2000,e8 gauge theory and a derivation of k-theory f...,"Duiliu-Emanuel Diaconescu, Gregory Moore, , Ed...",,the partition function of ramond-ramond p-form...,1044
23751,9807052,1998,spinning particles on spacelike hypersurfaces ...,"F.Bigazzi (Milano Univ.), L.Lusanna (Firenze U...",Int.J.Mod.Phys.,description a new spinning particle with a def...,23751
14208,9407138,1994,drinfel'd algebra deformations homotopy comodu...,,,the aim of this work is to construct a cohomol...,14208


In [7]:
initial_train_set = pd.read_csv(train_set_path, sep =" ", header = None)
initial_train_set.columns = ['node1','node2','label']
### !!! we will use the new indices!!! (see information_df for correspondances)
initial_train_set = (initial_train_set
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node1'], right_on = ['ID'])
    .drop(columns = ['node1','ID'])
    .rename(columns = {'new_ID':'node1'})
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node2'], right_on = ['ID'])
    .drop(columns = ['node2','ID'])
    .rename(columns = {'new_ID':'node2'})
)
initial_train_set.sample(5)

Unnamed: 0,label,node1,node2
212643,0,9124,12759
430319,1,15292,13272
4208,0,17562,2299
11933,1,13067,11295
47650,0,5450,23844


## Create test and validation set

In [8]:
embeddings_ratio = 0.7
train_validation_ratio = 0.05

if os.path.isfile("Data/processed_data/train_set.csv") and  os.path.isfile("Data/processed_data/validation_set.csv") and  os.path.isfile("Data/processed_data/embedding_set.csv"):
    # load sets
    train_set = pd.read_csv("Data/processed_data/train_set.csv",)
    validation_set = pd.read_csv("Data/processed_data/validation_set.csv")
    embedding_set = pd.read_csv("Data/processed_data/embedding_set.csv")
else:
    all_nodes = list(information_df.new_ID.unique())
    all_edges = set(initial_train_set.query('label==1').apply(lambda x: (x.node1,x.node2), axis = 1))

    # create subset for embeddings
    nodes = list(information_df.new_ID.unique())
    edges = set(initial_train_set.query('label==1').apply(lambda x: (x.node1,x.node2), axis = 1))
    negative_edges = set(initial_train_set.query('label==0').apply(lambda x: (x.node1,x.node2), axis = 1))
    G_total = nx.Graph()
    G_total.add_nodes_from(nodes)
    G_total.add_edges_from(edges)

    embeddings_samples, train_samples = split_embedding_subgraph(G_total,embedding_ratio = embeddings_ratio)

    embedding_set = pd.DataFrame([[a,b,1] for (a,b) in embeddings_samples], columns = ['node1','node2','label'])
    
    train_set = pd.DataFrame([[a,b,1] for (a,b) in train_samples], columns = ['node1','node2','label'])
    train_set = pd.concat([train_set, initial_train_set.query('label==0').sample(train_set.shape[0])],axis = 0)

    # split train set into train and validation set
    X = train_set.drop(columns = ['label'])
    y = train_set[['label']]

    train_samples, validation_samples, train_labels, validation_labels = train_test_split(X,y, test_size=train_validation_ratio, random_state=0, shuffle=True, stratify=y)
    
    train_set = pd.concat([train_samples, train_labels], axis = 1)
    validation_set = pd.concat([validation_samples, validation_labels], axis = 1)

    # save sets
    train_set.to_csv("Data/processed_data/train_set.csv", index = False)
    validation_set.to_csv("Data/processed_data/validation_set.csv", index = False)
    embedding_set.to_csv("Data/processed_data/embedding_set.csv", index = False)

 We also need to make an "embedding graph" to compute the various features (otherwise we would train a model to predict edges on a graph in which the edges are present)

## Information pre_processing

### Missing values

In [9]:
information_df.isna().sum()

ID                 0
pub_year           0
title              0
authors         4033
journal_name    7472
abstract           0
new_ID             0
dtype: int64

In [10]:
information_df = information_df.fillna({'authors':'', 'journal_name':''})


### Authors

In [11]:
information_df.authors = information_df.authors.apply(lambda x:x.split(","))

### Titles

In [12]:
if os.path.isfile("Data/processed_data/information.csv"):
    information_df = pickle.load(open("Data/processed_data/information.csv",'rb'))
else:
    information_df['title_lemma'] = information_df.title.apply(lambda x: [token.lemma_ for token in spacy_nlp(x) if not token.is_punct if not token.is_digit if not token.is_stop])
    pickle.dump(information_df, open("Data/processed_data/information.csv",'wb'))

## Create graphs

### Articles based graph (based on embeddings graph)

In [13]:
nodes = list(information_df.new_ID.unique())
edges = set(embedding_set.apply(lambda x: (x.node1,x.node2), axis = 1))


G_articles_embedding = nx.Graph()
G_articles_embedding.add_nodes_from(nodes)
G_articles_embedding.add_edges_from(edges)

print("The number of nodes: {}".format(G_articles_embedding .number_of_nodes()))
print("The number of edges: {}".format(G_articles_embedding .number_of_edges()))

The number of nodes: 27770
The number of edges: 234282


### Authors co-authorship based graph

In [14]:
import string
# convert to lower case, remove punctuation, strip the names
authors_raw_set = set([auth.strip().lower().translate(str.maketrans('', '', string.punctuation)) for list_auth in information_df.authors for auth in list_auth if len(auth)>1])

Name matching: to make identify people name by different ways

In [15]:
from namematcher import NameMatcher
name_matcher = NameMatcher()

def compute_unique_names(authors_raw_set):
    """
    one author can be named differently on different papers
    this function aims at finding a 'representant' (longest name that describe an author) for each 
    author
    inputs:
        - authors_raw_set: set of previously extracted author names
    outputs:
        - dict: keys are the name in authors_raw_set and the values are the representant
    """
    representant_dict = {}
    attributed_nodes = [] # names that already have a representant
    for name in tqdm(authors_raw_set, position = 0):
        sim_list = [] # similar names 
        if name not in attributed_nodes:
            for name2 in authors_raw_set:
                try:
                    if name != name2 and name[0]==name2[0] and name2 not in attributed_nodes:
                        # two names need to start by the same letter to be consider as potential equivalents
                        score = name_matcher.match_names(name, name2)
                        if score > 0.9: # if names are close enough
                            sim_list.append(name2)
                except:
                    continue
            sim_list.append(name) # the representant is in this list
            attributed_nodes.extend(sim_list) # we have fund a representant for those names
            representant = max(sim_list, key=len) # the representant is the longest name
            for name in sim_list: # all those names have the same representant
                representant_dict[name] = representant
    return(representant_dict)

if os.path.isfile('Data/processed_data/representant_dict.pkl'):
    representant_dict = pickle.load(open('Data/processed_data/representant_dict.pkl','rb'))
else:
    representant_dict = compute_unique_names(authors_raw_set)
    pickle.dump(representant_dict, open('Data/processed_data/representant_dict.pkl','wb'))

In [16]:
# set each name to its representant value
information_df.authors = information_df.authors.apply(lambda x: [representant_dict[auth.strip().lower().translate(str.maketrans('', '', string.punctuation))] for auth in x])

In [17]:
# create a unique index for each author
representants_list = list(set(representant_dict.values()))
authors2idx = {k: v for v, k in enumerate(representants_list)}

information_df["authors_id"] = information_df.authors.apply(lambda x: [authors2idx[auth] for auth in x])

In [18]:
information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID,title_lemma,authors_id
17089,9511163,1995,colliding plane waves in einstein-maxwell-dila...,"[nora breton, tonatiuh matos, alberto garcia d...",Phys.Rev.,cinvestav mexico within the metric structure e...,17089,"[collide, plane, wave, einstein, maxwell, dila...","[11849, 9435, 10351, 0]"
18519,9607036,1996,fusion of twisted representations,[],Int.J.Mod.Phys.,the comultiplication formula for fusion produc...,18519,"[fusion, twisted, representation]",[0]


In [19]:
# compute nodes and edges
pre_edges = list(information_df.authors_id.apply(lambda x : [(x[i],x[j]) for i in range(len(x)) for j in range(len(x)) if i>j]))
authors_edges = [edge for list_edge in pre_edges for edge in list_edge]
authors_edges_dict = Counter(authors_edges)

In [20]:
G_authors = nx.Graph()
G_authors.add_nodes_from(authors2idx.values())
G_authors.add_weighted_edges_from([(a,b,weight) for (a,b),weight in authors_edges_dict.items()])

print("The number of nodes: {}".format(G_authors.number_of_nodes()))
print("The number of edges: {}".format(G_authors.number_of_edges()))

The number of nodes: 14447
The number of edges: 29111


## Various embeddings computation

### Graph Based embeddings

Using articles graph

In [21]:
### Walklets
from karateclub import Walklets
if os.path.isfile('Data/processed_data/articles_walklets_embeddings.pkl'):
    walklets_articles_embeddings = pickle.load(open('Data/processed_data/articles_walklets_embeddings.pkl','rb'))
else:
    walklets = Walklets() # we leave the defaults parameters for the other values
    walklets.fit(G_articles_embedding)
    walklets_articles_embeddings = walklets.get_embedding()
    pickle.dump(walklets_articles_embeddings, open('Data/processed_data/articles_walklets_embeddings.pkl','wb'))

In [22]:
### Node2Vec
from karateclub import Node2Vec
if os.path.isfile('Data/processed_data/articles_node2vec_embeddings.pkl'):
    articles_node2vec_embeddings = pickle.load(open('Data/processed_data/articles_node2vec_embeddings.pkl','rb'))
else:
    node2vec = Node2Vec() # we leave the defaults parameters for the other values
    node2vec.fit(G_articles_embedding)
    articles_node2vec_embeddings = node2vec.get_embedding()
    pickle.dump(articles_node2vec_embeddings, open('Data/processed_data/articles_node2vec_embeddings.pkl','wb'))

In [23]:
### deepwalk
from karateclub import DeepWalk
if os.path.isfile('Data/processed_data/articles_DeepWalk_embeddings.pkl'):
    articles_DeepWalk_embeddings = pickle.load(open('Data/processed_data/articles_DeepWalk_embeddings.pkl','rb'))
else:
    DeepWalk = DeepWalk() # we leave the defaults parameters for the other values
    DeepWalk.fit(G_articles_embedding)
    articles_DeepWalk_embeddings = DeepWalk.get_embedding()
    pickle.dump(articles_DeepWalk_embeddings, open('Data/processed_data/articles_DeepWalk_embeddings.pkl','wb'))

Using author graph

In [24]:
### authors embedding leveraging author graphs (random walk for instance)

### Node2Vec
if os.path.isfile('Data/processed_data/authors_node2vec_embeddings.pkl'):
    authors_node2vec_embeddings = pickle.load(open('Data/processed_data/authors_node2vec_embeddings.pkl', 'rb'))
else:
    node2vec_authors = Node2Vec(walk_length=15)
    node2vec_authors.fit(G_authors)
    authors_node2vec_embeddings = node2vec_authors.get_embedding()
    pickle.dump(authors_node2vec_embeddings, open('Data/processed_data/authors_node2vec_embeddings.pkl', 'wb'))

In [25]:
information_df.sample(2)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract,new_ID,title_lemma,authors_id
12131,9306080,1993,topological sigma-models in four dimensions an...,"[damiano anselmi, pietro fre]",Nucl.Phys.,it is well-known that topological sigma-models...,12131,"[topological, sigma, model, dimension, triholo...","[10381, 10529]"
23513,9806054,1998,variational principle and a perturbative solut...,[],Nucl.Phys.,equations in curved space string dynamics in a...,23513,"[variational, principle, perturbative, solutio...",[0]


In [26]:
# for each article take a mean of the authors embedding as global autors embedding
articles_authors_embedding = []
for i in range(information_df.shape[0]):
    value = information_df[information_df.new_ID == i]
    authors_id = value.authors_id
    embeddings = np.array([0 for i in range(128)]).astype('float64')
    for author in authors_id:
        embeddings+=authors_node2vec_embeddings[author][0]
    articles_authors_embedding.append(embeddings/len(authors_id))

### Node based embeddings

In [27]:
# compute abstracts embeddings using specter network

from transformers import AutoTokenizer, AutoModel

if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'gpu'

if os.path.isfile('Data/processed_data/abstracts_embeddings.pkl'):
    abstracts_embeddings = pickle.load(open('Data/processed_data/abstracts_embeddings.pkl','rb'))
else:
    
    # load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    model = AutoModel.from_pretrained('allenai/specter').to(device)
    model.eval()
    abstracts_embeddings = []
    for i in tqdm(range(information_df.shape[0]), position = 0):
        article = information_df.loc[i]
        title = article.title
        abstract = article.abstract
        paper = [{'title':title, 'abstract':abstract}]

        # concatenate title and abstract
        title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in paper]
        # preprocess the input
        inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = inputs.to(device)
        result = model(**inputs)
        # take the first token in the batch as the embedding
        embedding = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
        abstracts_embeddings.append(embedding)
    pickle.dump(abstracts_embeddings, open('Data/processed_data/abstracts_embeddings.pkl','wb'))

In [28]:
str(information_df[information_df.new_ID==1].journal_name.values[0])

'Class.Quant.Grav.'

In [29]:
### Compute page rank
page_rank_dict = nx.pagerank(G_articles_embedding)

In [30]:
### compute degree centrality
centrality_dict = nx.degree_centrality(G_articles_embedding)

## Compute edges features

In [31]:
def Jaccard(graph, edge):

    inter_size = len(list(nx.common_neighbors(graph, edge[0], edge[1])))
    union_size = len(set(graph[edge[0]]) | set(graph[edge[1]]))
    try:
        jacard = inter_size / union_size
    except:
        jacard = np.nan

    return jacard

In [32]:
def AdamicAdar(graph, edge):

    inter_list = nx.common_neighbors(graph, edge[0], edge[1])
    try:
        adamic_adar =  sum( [1/np.log(graph.degree(node)) for node in inter_list])
    except:
        adamic_adar = np.nan
    
    return adamic_adar

In [33]:
def preferential_attachement(graph, edge):
    pa = graph.degree(edge[0]) * graph.degree(edge[1])
    return pa

In [34]:
def are_connected(graph,edge):
    try:
        connect = nx.shortest_path(graph, source=edge[0], target=edge[1])
        connected  = 1
    except:
        connected = 0
    return(connected)

In [35]:
def shortest_path(graph,edge):
    try:
        l = nx.shortest_path_length(graph, source=edge[0], target=edge[1])
    except:
        l = -1
    return(l)

In [36]:
def common_journal(information_df, node1, node2):
    journal1 = str(information_df[information_df.new_ID==node1].journal_name.values[0])
    journal2 = str(information_df[information_df.new_ID==node2].journal_name.values[0])

    if journal1 == '' or journal2 == '':
        return(-1)
    elif journal1 == journal2:
        return(1)
    else:
        return(0)

## Compute features

In [37]:
def compute_non_embeddings_features(df, information_df, G_articles):
    useful_information_df = information_df[['new_ID','authors','pub_year', 'title_lemma']]

    # prepare data frame for common authors computation
    df = (df
    .merge(useful_information_df, how ='left', left_on = ['node1'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_1', 'pub_year':'pub_year1', 'title_lemma':'title_lemma1'})
    .merge(useful_information_df, how ='left', left_on = ['node2'], right_on = ['new_ID'])
    .rename(columns = {'authors':'authors_node_2', 'pub_year':'pub_year2', 'title_lemma':'title_lemma2'})
    )

    print("common_journal")
    df['common_journals'] = df.apply(lambda x: common_journal(information_df, x.node1, x.node2),axis = 1)
    
    print('computing common authors')
    #  compute common authors
    df['common_authors'] = df.apply(lambda x:len(set(x.authors_node_1)&set(x.authors_node_2)),axis = 1)

    print('computing common words')
    #  compute common words in titles
    df['common_title_words'] = df.apply(lambda x:len(set(x.title_lemma1)&set(x.title_lemma2)),axis = 1)

    print('computing delta publication year')
    # compute delta publication year
    df['delta_publication'] = df.apply(lambda x:np.abs(x.pub_year2 - x.pub_year1),axis = 1)

    # compute edges features
    print('computing jacard index')
    df['jacard'] = df.apply(lambda x: Jaccard(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing preferential attachement')
    df['pa'] = df.apply(lambda x: preferential_attachement(G_articles, (x.node1, x.node2)),axis = 1)

    print('computing adamic_adar')
    df['adamic_adar'] = df.apply(lambda x: AdamicAdar(G_articles, (x.node1, x.node2)),axis = 1)

    print('are connected')
    df['connection'] = df.apply(lambda x: are_connected(G_articles, (x.node1, x.node2)),axis = 1)

    print('shortest_path')
    df['shortest_path'] = df.apply(lambda x: shortest_path(G_articles, (x.node1, x.node2)),axis = 1)

    print('page ranks')
    df['page_rank1'] = df.apply(lambda x: page_rank_dict[x.node1],axis = 1)
    df['page_rank2'] = df.apply(lambda x: page_rank_dict[x.node2],axis = 1)
    
    print('compute degree')

    df['degree1'] = df.apply(lambda x: centrality_dict[x.node1],axis = 1)
    df['degree2'] = df.apply(lambda x: centrality_dict[x.node2],axis = 1)

    
    df = df.fillna({ 'jacard':df.jacard.mean(),
                     'adamic_adar':df.adamic_adar.mean()
                     })

    return(df)


In [38]:
def compute_embedding_cosines_features(df,
                                         articles_node2vec_embeddings,
                                         walklets_articles_embeddings,
                                         articles_authors_embedding, 
                                         abstracts_embeddings):
    # compute some cosine based distances
    df['articles_node2vec_cosine'] = df.apply(lambda x:cosine(articles_node2vec_embeddings[x.node1],articles_node2vec_embeddings[x.node2]), axis = 1)
    df['articles_walklets_cosine'] = df.apply(lambda x:cosine(walklets_articles_embeddings[x.node1],walklets_articles_embeddings[x.node2]), axis = 1)
    df['authors_embeddings_cosine'] = df.apply(lambda x:cosine(articles_authors_embedding[x.node1],articles_authors_embedding[x.node2]), axis = 1)
    df['abstracts_embeddings_cosine'] = df.apply(lambda x:cosine(abstracts_embeddings[x.node1][0],abstracts_embeddings[x.node2][0]), axis = 1)
    df['articles_deepwalk_cosine'] = df.apply(lambda x:cosine(articles_DeepWalk_embeddings[x.node1],articles_DeepWalk_embeddings[x.node2]), axis = 1)

    return(df)

## Compute features on train set

In [39]:
if os.path.isfile("Data/processed_data/train_set_with_features.csv"):
    train_set_with_features = pd.read_csv("Data/processed_data/train_set_with_features.csv")
else:
    train_set_with_features = compute_non_embeddings_features(initial_train_set, information_df, G_articles_embedding)
    train_set_with_features = compute_embedding_cosines_features(train_set_with_features,
                                            articles_node2vec_embeddings,
                                            walklets_articles_embeddings,
                                            articles_authors_embedding, 
                                            abstracts_embeddings)
    train_set_with_features.to_csv("Data/processed_data/train_set_with_features.csv", index = False)

In [40]:
columns_to_keep= ['common_authors', 'common_title_words','common_journals',
       'delta_publication', 'jacard', 'pa', 'adamic_adar', 'connection',
       'shortest_path', 'page_rank1', 'page_rank2', 'degree1', 'degree2',
       'articles_node2vec_cosine', 'articles_walklets_cosine',
       'authors_embeddings_cosine', 'abstracts_embeddings_cosine','articles_deepwalk_cosine']

In [41]:
train_set_with_features = train_set_with_features[columns_to_keep+['label']]

## Compute features on validation set


In [42]:
validation_set_with_features = compute_non_embeddings_features(validation_set, information_df, G_articles_embedding)
validation_set_with_features = compute_embedding_cosines_features(validation_set_with_features,
                                         articles_node2vec_embeddings,
                                         walklets_articles_embeddings,
                                         articles_authors_embedding, 
                                         abstracts_embeddings)

common_journal
computing common authors
computing common words
computing delta publication year
computing jacard index
computing preferential attachement
computing adamic_adar
are connected
shortest_path
page ranks
compute degree


In [43]:
validation_set_with_features = validation_set_with_features[columns_to_keep+['label']]

## Run models

In [44]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pickle
from sklearn.preprocessing import StandardScaler

In [45]:
train_samples, train_labels = train_set_with_features.drop(columns = ['label']), train_set_with_features[['label']]
validation_samples, validation_labels = validation_set_with_features.drop(columns = ['label']), validation_set_with_features[['label']]

In [46]:
# scale data
scaler = StandardScaler()
train_samples_scaled = scaler.fit_transform(np.float32(train_samples))
validation_samples_scaled = scaler.transform(np.float32(validation_samples))

In [47]:
### use hyper opt to find the best model
'''
estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                          preprocessing=any_preprocessing('my_pre'),
                          algo=tpe.suggest,
                          max_evals=100,
                          trial_timeout=120)

estim.fit(train_samples_scaled,list(train_labels.label))

# Show the results

print(estim.score(validation_samples_scaled, list(validation_labels.label)))

print(estim.best_model())
'''

"\nestim = HyperoptEstimator(classifier=any_classifier('my_clf'),\n                          preprocessing=any_preprocessing('my_pre'),\n                          algo=tpe.suggest,\n                          max_evals=100,\n                          trial_timeout=120)\n\nestim.fit(train_samples_scaled,list(train_labels.label))\n\n# Show the results\n\nprint(estim.score(validation_samples_scaled, list(validation_labels.label)))\n\nprint(estim.best_model())\n"

best :{'learner': ExtraTreesClassifier(bootstrap=True, max_features=0.7327186439657982,
                     n_estimators=95, n_jobs=1, random_state=3, verbose=False), 'preprocs': (), 'ex_preprocs': ()}

In [48]:
# train a model 
from sklearn.ensemble import ExtraTreesClassifier
clf_xt = ExtraTreesClassifier(bootstrap=True, max_features=0.7327186439657982,
                     n_estimators=95, n_jobs=1, random_state=3, verbose=False)

clf_xt.fit(train_samples_scaled,list(train_labels.label))

# test the model
predicted_labels_xt = clf_xt.predict(validation_samples_scaled)
acc = accuracy_score(validation_labels, list(predicted_labels_xt))
print(f"accuracy: {acc}")

accuracy: 0.9658400557713375


In [49]:
# train a model 

clf_xgb = XGBClassifier()

clf_xgb.fit(train_samples_scaled,list(train_labels.label))

# test the model
predicted_labels_xgb = clf_xgb.predict(validation_samples_scaled)
acc = accuracy_score(validation_labels, list(predicted_labels_xgb))
print(f"accuracy: {acc}")



accuracy: 0.9524947714371078


In [50]:
# train a model 
from sklearn.svm import SVC
clf_svc = SVC()

clf_svc.fit(train_samples_scaled,list(train_labels.label))

# test the model
predicted_labels_svc = clf_svc.predict(validation_samples_scaled)
acc = accuracy_score(validation_labels, list(predicted_labels_svc))
print(f"accuracy: {acc}")

accuracy: 0.9522955880888357


In [51]:
# voting
total_pred = np.array(predicted_labels_svc)+np.array(predicted_labels_xt)+np.array(predicted_labels_xgb)
total_pred = (total_pred > 1.5)
acc = accuracy_score(validation_labels, list(total_pred))
print(f"accuracy: {acc}")

accuracy: 0.9566776217508216


## Create test set

In [52]:
## start by using the new ids

test_set = pd.read_csv(test_set_path, sep =" ", header = None)
test_set.columns = ['node1','node2']

test_set = (test_set
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node1'], right_on = ['ID'])
    .drop(columns = ['node1','ID'])
    .rename(columns = {'new_ID':'node1'})
    .merge(information_df[['ID','new_ID']], how = 'left', left_on = ['node2'], right_on = ['ID'])
    .drop(columns = ['node2','ID'])
    .rename(columns = {'new_ID':'node2'})
)
test_set.sample(5)

Unnamed: 0,node1,node2
395,20963,11436
24155,9897,7632
11904,8270,27533
6960,5981,1789
29862,23888,6702


In [53]:
test_set_with_features = compute_non_embeddings_features(test_set, information_df, G_articles_embedding)
test_set_with_features = compute_embedding_cosines_features(test_set_with_features,
                                         articles_node2vec_embeddings,
                                         walklets_articles_embeddings,
                                         articles_authors_embedding, 
                                         abstracts_embeddings)

common_journal
computing common authors
computing common words
computing delta publication year
computing jacard index
computing preferential attachement
computing adamic_adar
are connected
shortest_path
page ranks
compute degree


In [54]:
test_set_with_features = test_set_with_features[columns_to_keep]

In [59]:
## scale data
test_samples_scaled = scaler.transform(np.float32(test_set_with_features))
# prediction
test_predicted_labels_svc = clf_svc.predict(test_samples_scaled)
test_predicted_labels_xgb = clf_xgb.predict(test_samples_scaled)
test_predicted_labels_xt = clf_xt.predict(test_samples_scaled)

# voting 
total_pred_test = np.array(test_predicted_labels_svc)+np.array(test_predicted_labels_xt)+np.array(test_predicted_labels_xgb)
total_pred_test = (total_pred_test > 1.5)

sum(total_pred_test)

16859

In [60]:
len(total_pred_test)

32648

In [61]:
test_set['category'] = total_pred_test

test_set = (test_set
.reset_index()
.rename(columns = {'index':'id'})
.drop(columns = ['node1','node2'])
)

test_set.to_csv('final_predictions.csv', index=False)