<center><h2>ALTEGRAD Project</h2>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez, Jean Quentin, Sacha Revol</span><br>
<hr>
</center>

# Imports

In [1]:
import networkx as nx
import os
import csv
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from tqdm import tqdm

# !pip install pip install karateclub
from gensim.models.doc2vec import Doc2Vec
from os import path
import pickle
from scipy import spatial
import random
from sklearn import preprocessing


# Load preprocessed files

These files are created in `Preprocessing.ipynb`

In [2]:
# Read the abstract of each paper
try:
    print('Loading Authors preprocessed')
    a_file = open("data/authors_preprocessed.pkl", "rb")
    authors = pickle.load(a_file)
    a_file.close()
except:
    raise SyntaxError("File 'authors_preprocessed.pkl' was not found in 'data/'")

Loading Authors preprocessed


In [3]:
# Read the abstract of each paper
try:
    print('Load abstract preprocessed')
    a_file = open("data/abstract_preprocessed.pkl", "rb")
    abstracts = pickle.load(a_file)
    a_file.close()
except:
    raise SyntaxError("File 'abstract_preprocessed.pkl' was not found in 'data/'")

Load abstract preprocessed


# Functions

## Training and submission

In [4]:
def train(X_train, X_test, y_train, y_test = None, model = LogisticRegression(max_iter = 300)):
    scaler = preprocessing.StandardScaler().fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.fit_transform(X_test)
    
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    y_pred = y_pred[:,1]
    if y_test is not None:
        print('Validation loss = {:.4f}'.format(log_loss(y_test, y_pred)))
    else:
        # Write predictions to a file
        print('Creating submission')
        predictions = zip(range(len(y_pred)), y_pred)
        os.remove("data/submission.csv")
        with open("data/submission.csv","w") as pred:
            csv_out = csv.writer(pred)
            csv_out.writerow(['id','predicted'])
            for row in predictions:
                csv_out.writerow(row)
        print('Submision created')
    return y_pred

## Feature matrix creation

### Common functions for matrix creation

In [5]:
def remove_random_edges_from_G(p = 0.05, seed = 1):
    """Removes p lines from edgelist randomly and saves
    remaining lines as edgelist_missing.txt

    Args:
        p (float, optional): line percentage to be removed. Defaults to 0.05.
        seed (int, optional): seed for random. Defaults to 1.
    """

    H = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
    with open('data/edgelist.txt') as file:
        lines = file.readlines()

    random.seed(seed)
    indices_to_delete = random.sample(range(len(lines)), int(p * len(lines)))

    # sort to delete biggest index first 
    indices_to_delete.sort(reverse=True)

    for i in tqdm(indices_to_delete):
        line = lines[i]
        t = line.split(',')
        H.remove_edge(int(t[0]), int(t[1]))
        
    return H

In [6]:
def load_info(G, p = 0.05, validation = True):
    """This function returns validation/test node pairs list and
    validation/test + train random nodes that will be used to create
    feature matrix. These lists have a fixed seed.

    Args:
        G (Networkx graph): A Graph loaded with NetworkX library.
        p (float, optional): Percentage of edges removed from G to create validation Graph H.
        See 'Preprocessing.ipynb' to see which value was used. Defaults to 0.05.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.

    Returns:
        lists: validation/test node pairs list and validation/test + 
               train random nodes
    """
        
    np.random.seed(41)
    train_random_nodes = np.random.choice(list(G.nodes()), size=2*len(G.edges()), replace=True)
        
    if validation:
        val_node_pairs = list()
        with open('data/edgelist.txt') as file:
            lines = file.readlines()

        random.seed(42)
        indices_to_delete = random.sample(range(len(lines)), int(p * len(lines)))
        indices_to_delete.sort(reverse=True)

        for i in tqdm(indices_to_delete):
            line = lines[i]
            t = line.split(',')
            val_node_pairs.append((int(t[0]), int(t[1])))
        np.random.seed(42)
        val_random_nodes = np.random.choice(list(G.nodes()), size=2*len(val_node_pairs), replace=True)
        
        return val_node_pairs, val_random_nodes, train_random_nodes
        

    else:
        test_node_pairs = list()
        with open('data/test.txt', 'r') as f:
            for line in f:
                t = line.split(',')
                test_node_pairs.append((int(t[0]), int(t[1])))
                
        np.random.seed(43)
        test_random_nodes = np.random.choice(list(G.nodes()), size=len(test_node_pairs), replace=True)
        
        return test_node_pairs, test_random_nodes, train_random_nodes
    
def graph_properties(G):
    """Computes standard Graph properties with NetworkX
    Args:
        G ([nx graph)
    Returns:
        list of arrays, each array contains individual property 
        for each node.
    """
    
    print('Computing graph properties')
    avg_neighbor_degree = nx.average_neighbor_degree(G)
    pagerank = nx.pagerank_scipy(G)
    # eig_centrality = nx.eigenvector_centrality_numpy(G)
    # greedy_color = nx.greedy_color(G)
    # triangles = nx.triangles(G)

    return [avg_neighbor_degree, pagerank]#, eig_centrality, greedy_color, triangles]

### Default matrix creation

In [7]:
def create_default_matrix(G, validation = True):
    """Creates feature matrix with baseline features. 

    Args:
        G (NetworkX Graph): A Graph loaded with NetworkX library.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """

    n_features = 6
    
    if validation:
        val_node_pairs, val_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_val = np.zeros((2*len(val_node_pairs), n_features))
        y_val = np.zeros(2*len(val_node_pairs))
        
    else:
        test_node_pairs, test_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))
                  
    X_train = np.zeros((2*len(G.edges()), n_features))
    y_train = np.zeros(2*len(G.edges()))
    
    
    
    for i, train_edge in tqdm(enumerate(G.edges())):
        X_train, y_train = fill_default_matrix(i, 2, G, X_train, y_train, train_edge, train_random_nodes)
    np.save('data/X_train_default.npy', X_train)
    np.save('data/y_train', y_train)
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_default_matrix(i, 2, G, X_val, y_val, val_edge, val_random_nodes)
        np.save('data/X_val_default.npy', X_val)
        np.save('data/y_val', y_val)
    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_default_matrix(i, 1, G, X_test, None, test_edge, test_random_nodes)
        np.save('data/X_test_default.npy', X_test)
        
        
def fill_default_matrix(i, p, G, X, y, edge, random_nodes):
    """Fills each line of the default feature matrix.

    Args:
        i (int): matrix line.
        p (int): If p=1, creates test matrix. If p=2, creates either train or validation.
        G (nx graph): Graph loaded with NetworkX.
        X (np.ndarray): Shape (2 x n_edges, n_features)
        y (np.ndarray): Shape (2 x n_edges, ). Set to None for test.
        edge (list): list of tupple of nodes.
        random_nodes (list): list of random nodes. Shape (2 x n_edges, ).

    Returns:
        np.ndarray: if p=1, returns X, y filled. Elif p=2, returns X filled.
    """
    X[p*i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X[p*i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X[p*i,2] = len(set(abstracts[edge[0]]).intersection(set(abstracts[edge[1]])))
    try:
        X[p*i,3] = G.degree(edge[0]) + G.degree(edge[1])
        X[p*i,4] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    except:
        X[p*i,3] = -1
        X[p*i,4] = -1
    X[p*i,5] = len(set(authors[edge[0]]).intersection(set(authors[edge[1]])))
    
    
    if p == 2:
        y[2*i] = 1

        n1, n2 = random_nodes[2*i], random_nodes[2*i+1]
            
        X[2*i+1,0] = len(abstracts[n1]) + len(abstracts[n2])
        X[2*i+1,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
        X[2*i+1,2] = len(set(abstracts[n1]).intersection(set(abstracts[n2])))
        X[2*i+1,3] = G.degree(n1) + G.degree(n2)
        X[2*i+1,4] = abs(G.degree(n1) - G.degree(n2))
        X[2*i+1,5] = len(set(authors[n1]).intersection(set(abstracts[n2])))

        y[2*i+1] = 0

        return X, y
    else:
        return X

### Param matrix creation

In [8]:
def create_param_matrix(G, validation = True):
    """Creates Graph parameters feature matrix. 

    Args:
        G (NetworkX Graph): A Graph loaded with NetworkX library.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """

    G_params = graph_properties(G)
    n_features = len(G_params)
    
    if validation:
        val_node_pairs, val_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_val = np.zeros((2*len(val_node_pairs), n_features))
        y_val = np.zeros(2*len(val_node_pairs))
        
    else:
        test_node_pairs, test_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))
                  
    X_train = np.zeros((2*len(G.edges()), n_features))
    y_train = np.zeros(2*len(G.edges()))
    
    
    
    for i, train_edge in tqdm(enumerate(G.edges())):
        X_train, y_train = fill_param_matrix(i, 2, G, X_train, y_train, train_edge, train_random_nodes, G_params)
    np.save('data/X_train_param', X_train)
    np.save('data/y_train', y_train)
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_param_matrix(i, 2, G, X_val, y_val, val_edge, val_random_nodes, G_params)
        np.save('data/X_val_param', X_val)
        np.save('data/y_val', y_val)
    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_param_matrix(i, 1, G, X_test, None, test_edge, test_random_nodes, G_params)
        np.save('data/X_test_param.npy', X_test)
        
        
def fill_param_matrix(i, p, G, X, y, edge, random_nodes, G_params):
    """Fills each line of the Graph parameters feature matrix.

    Args:
        i (int): matrix line.
        p (int): If p=1, creates test matrix. If p=2, creates either train or validation.
        G (nx graph): Graph loaded with NetworkX.
        X (np.ndarray): Shape (2 x n_edges, n_features)
        y (np.ndarray): Shape (2 x n_edges, ). Set to None for test.
        edge (list): list of tupple of nodes.
        random_nodes (list): list of random nodes. Shape (2 x n_edges, ).
        G_params (list): list containing graph parameters.

    Returns:
        np.ndarray: if p=1, returns X, y filled. Elif p=2, returns X filled.
    """
    
    for idx, j in enumerate(range(len(G_params) + 1, 2)):
        param = G_params[idx]
        try:
            X[p*i,j] = param[edge[0]] + param[edge[1]]
            X[p*i,j+1] = abs(param[edge[0]] - param[edge[1]])
        except:
            X[p*i,j] = -1
            X[p*i,j+1] = -1
    
    
    if p == 2:
        y[2*i] = 1
        y[2*i+1] = 0
        n1, n2 = random_nodes[2*i], random_nodes[2*i+1]
            
        for idx, j in enumerate(range(len(G_params) + 1, 2)):
            param = G_params[idx]
            X[2*i+1,j] = param[n1] + param[n2]
            X[2*i+1,j+1] = abs(param[n1] - param[n2])

        return X, y
    else:
        return X

### Embedding Matrix creation

In [34]:
def create_embeddings_matrix(G, distance, validation = True):
    """Creates Graph embeddings feature matrix. 

    Args:
        G (NetworkX Graph): A Graph loaded with NetworkX library.
        distance (scipy spatial distance): Function that takes two 1d arrays
        and outputs a float. Recommended: cosine distance.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """
    
    n2v_parameters = {'walk_number': 10, 'walk_length': 15, 'dimensions': 64, 'window_size': 5}
    d2v_parameters = {'vector_size':128, 'window':5, 'min_count':2, 'epochs':100}
    walkets_parameters = {'walk_number': 10, 'walk_length': 80, 'dimensions': 64, 'window_size': 4}
    
    d2v = Doc2Vec.load("data/abstracts_embedding_doc2vec_vs{:d}_w{:d}_mc{:d}_e{:d}".format(d2v_parameters['vector_size'], d2v_parameters['window'],
                                                                            d2v_parameters['min_count'], d2v_parameters['epochs']))
    
    # authors_emb = np.load('data/embedding_authors_articles_mean.npy')
    embeddings = [d2v]#, authors_emb]
    
    
    if validation:
        n2v = np.load('data/embedding_n2v_val_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(n2v_parameters['walk_number'], n2v_parameters['walk_length'],
                                                                n2v_parameters['dimensions'], n2v_parameters['window_size']))
        embeddings.append(n2v)
        n_features = len(embeddings)
        
        val_node_pairs, val_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_val = np.zeros((2*len(val_node_pairs), n_features))
        y_val = np.zeros(2*len(val_node_pairs))
        
        

    else:
        n2v = np.load('data/embedding_n2v_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(n2v_parameters['walk_number'], n2v_parameters['walk_length'],
                                                                n2v_parameters['dimensions'], n2v_parameters['window_size']))
        walkets = np.load('data/embedding_Walklets_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(walkets_parameters['walk_number'], walkets_parameters['walk_length'],
                                                                walkets_parameters['dimensions'], walkets_parameters['window_size']))
        embeddings.append(n2v)
        embeddings.append(walkets)
        n_features = len(embeddings)
        
        test_node_pairs, test_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))
          
    X_train = np.zeros((2*len(G.edges()), n_features))
    y_train = np.zeros(2*len(G.edges()))
    
    
    
    for i, train_edge in tqdm(enumerate(G.edges())):
        X_train, y_train = fill_embeddings_matrix(i, 2, X_train, y_train, train_edge, train_random_nodes, embeddings, distance)
    np.save('data/X_train_embeddings', X_train)
    np.save('data/y_train', y_train)
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_embeddings_matrix(i, 2, X_val, y_val, val_edge, val_random_nodes, embeddings, distance)
        np.save('data/X_val_embeddings', X_val)
        np.save('data/y_val', y_val)
    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_embeddings_matrix(i, 1, X_test, None, test_edge, test_random_nodes, embeddings, distance)
        np.save('data/X_test_embeddings', X_test)
          
            
def fill_embeddings_matrix(i, p, X, y, edge, random_nodes, embeddings, distance):
    """Fills each line of the Graph embedding feature matrix.

    Args:
        i (int): matrix line.
        p (int): If p=1, creates test matrix. If p=2, creates either train or validation.
        G (nx graph): Graph loaded with NetworkX.
        X (np.ndarray): Shape (2 x n_edges, n_features)
        y (np.ndarray): Shape (2 x n_edges, ). Set to None for test.
        edge (list): list of tupple of nodes.
        random_nodes (list): list of random nodes. Shape (2 x n_edges, ).
        embeddings (list): list of np.ndarrays containing embeddings for each node in the Graph
        distance (scipy spatial distance): Function that takes two 1d arrays
        and outputs a float. Recommended: cosine distance.

    Returns:
        np.ndarray: if p=1, returns X, y filled. Elif p=2, returns X filled.
    """

    for j, emb in enumerate(embeddings):
        if j == 0:
            X[p*i,j] = emb.dv.similarity(edge[0], edge[1])
        else:
            X[p*i,j] = distance(emb[edge[0]], emb[edge[1]])
    
    # X[p*i,8] = distance(gae.iloc[edge[0]].to_numpy(), gae.iloc[edge[1]].to_numpy())
      
    if p == 2:
        y[2*i] = 1
        y[2*i+1] = 0
        n1, n2 = random_nodes[2*i], random_nodes[2*i+1]        
        for j, emb in enumerate(embeddings):
            if j == 0:
                X[2*i+1,j] = emb.dv.similarity(n1, n2)
            else:
                X[2*i+1,j] = distance(emb[n1], emb[n2])
    
        # X[2*i+1,8] = distance(gae.iloc[n1].to_numpy(), gae.iloc[n2].to_numpy())


        return X, y
    else:
        return X

In [10]:
# def nodes_connected(G, u, v):
#     return u in G.neighbors(v)
# neighbor = nx.single_source_shortest_path_length(H, 0, cutoff=2)
# # neighbor = {v: k for k, v in neighbor.items()}
# embe = np.load('data/embedding_authors_articles_mean.npy')

# for i in list(neighbor.keys())[1:10]:
#     # print(embe[i,:10])
#     # print(embe[0,:10])
#     if nodes_connected(H, i, 1):
#         print('Connected {:d}-{:d}: {:.2f}'.format(i, 0, spatial.distance.cosine(embe[i], embe[0])))
#     else:
#         print('Not Connected {:d}-{:d}: {:.2f}'.format(i, 0, spatial.distance.cosine(embe[i], embe[0])))
        



### Other features

In [11]:
def create_new_feature_matrix(G, measures, validation = True):
    n_features = len(measures)
    
    X_train = np.zeros((2*len(G.edges()), n_features))
    y_train = np.zeros(2*len(G.edges()))
    
    if validation:
        val_node_pairs, val_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_val = np.zeros((2*len(val_node_pairs), n_features))
        y_val = np.zeros(2*len(val_node_pairs))
    else:
        test_node_pairs, test_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))
        
    for i, train_edge in tqdm(enumerate(G.edges())):
        X_train, y_train = fill_new_feature_matrix(i, 2, G, X_train, y_train, train_edge, train_random_nodes, measures)
    np.save('data/X_train_new_features', X_train)
    
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_new_feature_matrix(i, 2, G, X_val, y_val, val_edge, val_random_nodes, measures)
        np.save('data/X_val_new_features', X_val)
        return X_train, X_val, y_train, y_val

    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_new_feature_matrix(i, 1, G, X_test, None, test_edge, test_random_nodes, measures)
        np.save('data/X_test_new_features', X_test)
        return X_train, X_test, y_train
        
        
def fill_new_feature_matrix(i, p, G, X, y, edge, random_nodes, measures):
    for j, measure in enumerate(measures):
        X[p*i,j] = measure(edge[0], edge[1], G)
    
    if p == 2:
        y[2*i] = 1
        y[2*i+1] = 0
        n1, n2 = random_nodes[2*i], random_nodes[2*i+1]
        for j, measure in enumerate(measures):
            X[2*i+1,j] = measure(n1, n2, G)

        return X, y
    else:
        return X

### Matrix concatenation & loading

In [12]:
def concatenate_matrix(default = True, param = True, embeddings = True, new_feature = True, validation = True):
    bool_list = [default, param, embeddings, new_feature]
    X_train_default = np.load('data/X_train_default.npy')
    X_train_param = np.load('data/X_train_param.npy')
    X_train_embeddings = np.load('data/X_train_embeddings.npy')
    X_train_new_feature = np.load('data/X_train_new_features.npy')
    print(X_train_default.shape, X_train_param.shape, X_train_embeddings.shape, X_train_new_feature.shape)
    X_train_list = [X for i,X in enumerate([X_train_default, X_train_param, X_train_embeddings, X_train_new_feature]) if bool_list[i]]
    X_train = np.concatenate(X_train_list, axis=1)
    np.save('data/X_train', X_train)
    y_train = np.load('data/y_train.npy')
    if validation:
        X_val_default = np.load('data/X_val_default.npy')
        X_val_param = np.load('data/X_val_param.npy')
        X_val_embeddings = np.load('data/X_val_embeddings.npy')
        X_val_new_feature = np.load('data/X_val_new_features.npy')
        X_val_list = [X for i,X in enumerate([X_val_default, X_val_param, X_val_embeddings, X_val_new_feature]) if bool_list[i]]
        X_val = np.concatenate(X_val_list, axis=1)
        np.save('data/X_val', X_val)
        y_val = np.load('data/y_val.npy')
        return X_train, X_val, y_train, y_val
    else:
        X_test_default = np.load('data/X_test_default.npy')
        X_test_param = np.load('data/X_test_param.npy')
        X_test_embeddings = np.load('data/X_test_embeddings.npy')
        X_test_new_feature = np.load('data/X_test_new_features.npy')
        X_test_list = [X for i,X in enumerate([X_test_default, X_test_param, X_test_embeddings, X_test_new_feature]) if bool_list[i]]
        X_test = np.concatenate(X_test_list, axis=1)
        np.save('data/X_test', X_test)
        return X_train, X_test, y_train

def load_matrix(validation = True):
    X_train = np.load('data/X_train.npy')
    y_train = np.load('data/y_train.npy')
    if validation:
        X_val = np.load('data/X_val.npy')
        y_val = np.load('data/y_val.npy')
        return X_train, X_val, y_train, y_val
    else:
        X_test = np.load('data/X_test.npy')
        return X_train, X_test, y_train

### Functions to add/remove new features

In [13]:
def add_new_feature(G, measure, distance, validation = True):
    n_features = 1
    
    X_train = np.zeros((2*len(G.edges()), n_features))
    y_train = np.zeros(2*len(G.edges()))
    
    if validation:
        val_node_pairs, val_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_val = np.zeros((2*len(val_node_pairs), n_features))
        y_val = np.zeros(2*len(val_node_pairs))
    else:
        test_node_pairs, test_random_nodes, train_random_nodes = load_info(G, validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))
        
    for i, train_edge in tqdm(enumerate(G.edges())):
        X_train, y_train = fill_new_feature(i, 2, G, X_train, y_train, train_edge, train_random_nodes, distance, measure)
    X_train_full = np.load('data/X_train.npy')
    
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_new_feature(i, 2, G, X_val, y_val, val_edge, val_random_nodes, distance, measure)
        X_val_full = np.load('data/X_val.npy')
        return np.concatenate((X_train_full, X_train), axis=1), np.concatenate((X_val_full, X_val), axis=1), y_train, y_val

    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_new_feature(i, 1, G, X_test, None, test_edge, test_random_nodes, distance, measure)
        X_test_full = np.load('data/X_test.npy')
        return np.concatenate((X_train_full, X_train), axis=1), np.concatenate((X_test_full, X_test), axis=1), y_train
        
        
def fill_new_feature(i, p, G, X, y, edge, random_nodes, distance, measure):
    try:
        X[p*i,0] = measure(edge[0], edge[1], G, distance)
    except:
        X[p*i,0] = -1
    
    if p == 2:
        y[2*i] = 1
        y[2*i+1] = 0
        n1, n2 = random_nodes[2*i], random_nodes[2*i+1]
        X[2*i+1,0] = measure(n1, n2, G, distance)

        return X, y
    else:
        return X

def remove_last_feature(X_train, X_test):
    return X_train[:,:-1], X_test[:,:-1]
    

## Measures for new features

In [27]:
def shortest_path_length(n1, n2, G, distance=None):
    """
    Computes the shortest path length between two nodes in a graph.
    """
    try:
        length = nx.shortest_path_length(G, n1, n2)
    except:
        length = -1
    return length

def dijkstra_path_length(n1, n2, G, distance=None):
    """
    Computes the dijkstra path length between two nodes in a graph.
    """
    try:
        length = nx.dijkstra_path_length(G, n1, n2)
    except:
        length = -1
    return length

def jaccard_coefficient(n1, n2, G, distance=None):
    """
    Computes the jaccard coefficient of two nodes in a graph.
    """
    _, _, coeff = list(nx.jaccard_coefficient(G, [(n1, n2)]))[0]
    return coeff


def adamic_adar_index(n1, n2, G, distance=None):
    """
    Computes the adamic adar index of two nodes in a graph.
    """
    _, _, index = list(nx.adamic_adar_index(G, [(n1, n2)]))[0]
    return index


def pref_attachment(n1, n2, G, distance=None):
    """ 
    Computes the preferential attachment of two nodes in a graph.
    """
    _, _, p = list(nx.preferential_attachment(G, [(n1, n2)]))[0]
    return p

def common_neighbor_centrality(n1, n2, G, distance=None):
    """ 
    Computes the common neighbor centrality of two nodes in a graph.
    """
    _, _, p = nx.common_neighbor_centrality(G, [(n1, n2)])
    return p

In [None]:
common_neighbor_centrality(1, 0, G)

# Validation matrix

For validation, all parameters including embeddings should be computed using the validation graph. Results should be computed again on the full graph to submit a test result.

In [15]:
try:
    # Validation Graph computed in Preprocessing.ipynb
    H = nx.read_edgelist('data/edgelist_val.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
except:
    raise SyntaxError("File 'edgelist_val.txt' was not found in 'data/'")

## Matrix creation/loading and training

In [44]:
create_default_matrix(H, validation = True)

100%|██████████| 54597/54597 [00:00<00:00, 471885.13it/s]
1037358it [01:00, 17079.54it/s]
54597it [00:03, 15805.04it/s]


In [47]:
create_param_matrix(H, validation = True)

Computing graph properties


100%|██████████| 54597/54597 [00:00<00:00, 421319.50it/s]
1037358it [00:02, 399113.24it/s]
54597it [00:00, 372396.70it/s]


In [82]:
# BAD: chebyshev, braycurtis, canberra, euclidian, jaccard
# TO TEST: cityblock, correlation, jensenshannon, mahalanobis, minkowski, seuclidean, sqeuclidean, wminkowski, dice, hamming
# BEST: cosine
distance = spatial.distance.cosine #spatial.distance.cosine #euclidean_distance #
create_embeddings_matrix(H, distance = distance, validation = True)

100%|██████████| 54597/54597 [00:00<00:00, 391935.28it/s]
1037358it [04:39, 3715.33it/s]
54597it [00:13, 4077.65it/s]


In [None]:
create_new_feature_matrix(H, measures = [shortest_path_length], validation = True)

In [16]:
X_train, X_val, y_train, y_val = concatenate_matrix(default = True, param = False, embeddings = True, new_feature = False, validation = True)

(2074716, 6) (2074716, 2) (2074716, 2) (2074716, 1)


In [27]:
y_pred = train(X_train, X_val, y_train, y_val, model = LogisticRegression(max_iter = 300))

Validation loss = 0.2573


In [85]:
df_train = pd.DataFrame(data = X_train)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,157.0,15.0,2.0,21.0,17.0,1.0,0.783946,0.39249
1,179.0,71.0,2.0,29.0,23.0,0.0,0.985746,1.093389
2,193.0,21.0,7.0,41.0,37.0,0.0,0.680442,0.404881
3,146.0,42.0,3.0,36.0,26.0,0.0,0.982158,0.964146
4,182.0,40.0,10.0,30.0,8.0,0.0,0.603073,0.518883


In [20]:
X_train, X_val, y_train, y_val = add_new_feature(H, measure = pref_attachment, distance = None, validation = True)

100%|██████████| 54597/54597 [00:00<00:00, 557645.13it/s]
1037358it [00:20, 51024.39it/s]
54597it [00:01, 50387.24it/s]


In [23]:
y_pred = train(X_train, X_val, y_train, y_val, model = LogisticRegression(max_iter = 300))

Validation loss = 0.2573


In [22]:
X_train, X_val = remove_last_feature(X_train, X_val)

# Test matrix

In [24]:
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)

## Matrix creation/loading and submission

In [25]:
create_default_matrix(G, validation = False)

1091955it [01:05, 16608.25it/s]
106692it [00:02, 36626.16it/s]


In [30]:
create_param_matrix(G, validation = False)

Computing graph properties


1091955it [00:02, 388125.64it/s]
106692it [00:00, 1137894.63it/s]


In [35]:
distance = spatial.distance.cosine #spatial.distance.cosine #euclidean_distance #
create_embeddings_matrix(G, distance = distance, validation = False)

  X[p*i,j] = emb.docvecs.similarity(edge[0], edge[1])
  X[2*i+1,j] = emb.docvecs.similarity(n1, n2)
133534it [00:39, 3752.01it/s]

In [29]:
create_new_feature_matrix(G, measures = [pref_attachment], validation = False)

1091955it [00:21, 50492.42it/s]
106692it [00:00, 109798.68it/s]


(array([[ 40.],
        [ 67.],
        [ 78.],
        ...,
        [130.],
        [  2.],
        [301.]]),
 array([[1922.],
        [ 126.],
        [4096.],
        ...,
        [  45.],
        [1221.],
        [1008.]]),
 array([1., 0., 1., ..., 0., 1., 0.]))

In [30]:
X_train, X_test, y_train= concatenate_matrix(default = True, param = False, embeddings = True, new_feature = False, validation = False)

(2183910, 6) (2074716, 2) (2183910, 3) (2183910, 1)


In [34]:
X_train, X_test, y_train = add_new_feature(G, measure = shortest_path_length, distance = None, validation = False)

1091955it [11:05, 1640.69it/s]
106692it [00:28, 3794.94it/s]


In [37]:
y_pred = train(X_train, X_test, y_train, y_test = None, model = LogisticRegression(max_iter = 300))

Creating submission
Submision created


# Tests

In [45]:
df_train = pd.DataFrame(data = X_train)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,157.0,15.0,2.0,22.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.783946,0.0,0.548589,0.248893,1.0
1,170.0,26.0,2.0,14.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.982963,0.0,0.948378,0.793439,4.0
2,193.0,21.0,7.0,41.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680442,0.0,0.460406,0.253649,1.0
3,252.0,72.0,11.0,19.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.007298,0.0,0.969859,0.838834,4.0
4,182.0,40.0,10.0,31.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.603073,0.0,0.342701,0.206544,1.0


In [74]:
walklets_parameters = {'walk_number': 10, 'walk_length': 80, 'dimensions': 64, 'window_size': 4}

walklets = np.load('data/embedding_Walklets_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(walklets_parameters['walk_number'], walklets_parameters['walk_length'],
                                                                walklets_parameters['dimensions'], walklets_parameters['window_size']))
walklets.shape

(138499, 256)

In [48]:
authors_emb = np.load('data/embedding_authors_articles_mean.npy')
authors_emb

array([[2.70029998, 2.70029998, 2.70029998, ..., 2.70029998, 2.70029998,
        2.70029998],
       [1.49200952, 1.49200952, 1.49200952, ..., 1.49200952, 1.49200952,
        1.49200952],
       [2.09158826, 2.09158826, 2.09158826, ..., 2.09158826, 2.09158826,
        2.09158826],
       ...,
       [1.99976814, 1.99976814, 1.99976814, ..., 1.99976814, 1.99976814,
        1.99976814],
       [1.52823949, 1.52823949, 1.52823949, ..., 1.52823949, 1.52823949,
        1.52823949],
       [2.41397214, 2.41397214, 2.41397214, ..., 2.41397214, 2.41397214,
        2.41397214]])

# Old

## Testing embedding properties and other ideas

In [24]:
d2v = Doc2Vec.load("data/abstracts_embedding_doc2vec_vs64_w5_mc2_e100")

neighbor = nx.single_source_shortest_path_length(G, 1, cutoff=3)
# neighbor = {v: k for k, v in neighbor.items()}

for i in list(neighbor.keys())[1:10]:
    if nodes_connected(G, i, 1):
        print('Connected {:d}-{:d}: {:.2f}, {:d}'.format(i, 0, spatial.distance.cosine(d2v[i], d2v[1]), len(authors[i].intersection(authors[0]))))
    else:
        print('Not Connected {:d}-{:d}: {:.2f}, {:d}'.format(i, 0, spatial.distance.cosine(d2v[i], d2v[1]), len(authors[i].intersection(authors[0]))))

Connected 0-0: 0.09, 4
Connected 3-0: 0.59, 0
Connected 5-0: 1.10, 0
Connected 6-0: 0.65, 0
Connected 7-0: 0.61, 0
Connected 9-0: 0.69, 0
Connected 10-0: 0.70, 1
Connected 11-0: 0.54, 0
Connected 12-0: 0.61, 0


In [25]:
for i in range(0,20):
    # random.seed(i)
    k1 = randint(0, n-1)
    k2 = randint(0, n-1)
    if nodes_connected(G, k1, k2):
        print('Connected {:d}-{:d}: {:.2f}'.format(k1, 0, spatial.distance.cosine(d2v[k1], d2v[k2])))
    else:
        print('Not Connected {:d}-{:d}: {:.2f}'.format(k2, 0, spatial.distance.cosine(d2v[k1], d2v[k2])))

Not Connected 57843-0: 1.05
Not Connected 37538-0: 0.93
Not Connected 65147-0: 0.87
Not Connected 106678-0: 0.93
Not Connected 18223-0: 0.95
Not Connected 89973-0: 0.91
Not Connected 125515-0: 0.95
Not Connected 73222-0: 1.09
Not Connected 64208-0: 0.99
Not Connected 4594-0: 0.94
Not Connected 19913-0: 0.95
Not Connected 57703-0: 0.89
Not Connected 23905-0: 0.99
Not Connected 122321-0: 0.92
Not Connected 9201-0: 1.02
Not Connected 101385-0: 0.98
Not Connected 61178-0: 0.75
Not Connected 88102-0: 0.87
Not Connected 111660-0: 0.78
Not Connected 90555-0: 1.03


In [26]:
for i, edge in enumerate(G.edges()):
    print('Connected {:d}-{:d}: {:.2f}'.format(edge[0], edge[1], spatial.distance.cosine(d2v[edge[0]], d2v[edge[1]])))
    if i==30:
        break

Connected 0-1: 0.09
Connected 0-2: 0.86
Connected 1-3: 0.59
Connected 1-5: 1.10
Connected 1-6: 0.65
Connected 1-7: 0.61
Connected 1-9: 0.69
Connected 1-10: 0.70
Connected 1-11: 0.54
Connected 1-12: 0.61
Connected 1-13: 0.58
Connected 1-14: 0.59
Connected 1-15: 0.67
Connected 1-16: 0.66
Connected 1-17: 0.84
Connected 1-19: 1.00
Connected 1-20: 0.66
Connected 1-21: 0.74
Connected 1-22: 0.60
Connected 1-23: 0.61
Connected 1-24: 0.72
Connected 2-25: 0.82
Connected 2-26: 0.91
Connected 2-27: 0.99
Connected 2-28: 1.00
Connected 2-29: 0.99
Connected 2-30: 1.05
Connected 2-31: 1.04
Connected 2-32: 0.87
Connected 2-33: 0.92
Connected 2-34: 0.93


In [4]:
for i,edge in tqdm(enumerate(G.edges())):
    print(len(set(authors[edge[0]]).intersection(set(authors[edge[1]]))))
    # cos_distance = spatial.distance.cosine(d2v[edge[0]], d2v[edge[1]])
    # print(cos_distance)
   
    if i == 20:
        break

20it [00:00, 4169.29it/s]

1
0
0
0
1
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0



