<center><h2>ALTEGRAD Project</h2>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez, Jean Quentin, Sacha Revol</span><br>
<hr>
</center>

# Imports

In [1]:
import networkx as nx
import os
import csv
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from tqdm import tqdm

# !pip install pip install karateclub
from gensim.models.doc2vec import Doc2Vec
from os import path
import pickle
from scipy import spatial
import random
from sklearn import preprocessing
from gensim.models import KeyedVectors
from sklearn.metrics import pairwise

from sklearn.model_selection import train_test_split


# Load preprocessed files

These files are created in `Preprocessing.ipynb`

In [2]:
def load_authors():
    """Read the abstract of each paper"""
    
    try:
        print('Loading unique authors preprocessed')
        a_file = open("data/unique_authors_dict.pkl", "rb")
        authors = pickle.load(a_file)
        a_file.close()
    except:
        raise SyntaxError("File 'unique_authors_dict.pkl' was not found in 'data/'")
    
    return authors

In [3]:
def load_abstracts():
    """Read the abstract of each paper"""
    
    try:
        print('Loading abstract preprocessed')
        a_file = open("data/abstract_preprocessed.pkl", "rb")
        abstracts = pickle.load(a_file)
        a_file.close()
    except:
        raise SyntaxError("File 'abstract_preprocessed.pkl' was not found in 'data/'")
    
    return abstracts

# Functions

## Training and submission

In [4]:
def train(X_train, X_test, y_train, y_test = None, normalize = True, model = LogisticRegression(max_iter = 300)):
    if normalize:
        scaler = preprocessing.StandardScaler().fit(X_train)

        X_train = scaler.transform(X_train)
        X_test = scaler.fit_transform(X_test)
    
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    y_pred = y_pred[:,1]
    if y_test is not None:
        print('Validation loss = {:.4f}'.format(log_loss(y_test, y_pred)))
    else:
        # Write predictions to a file
        print('Creating submission')
        predictions = zip(range(len(y_pred)), y_pred)
        os.remove("data/submission.csv")
        with open("data/submission.csv","w") as pred:
            csv_out = csv.writer(pred)
            csv_out.writerow(['id','predicted'])
            for row in predictions:
                csv_out.writerow(row)
        print('Submision created')
    return y_pred

## Feature matrix creation

### Common functions for matrix creation

In [5]:
def load_info(validation = True):
    """This function returns validation/test node pairs list and
    validation/test + train random nodes that will be used to create
    feature matrix. These lists have a fixed seed.

    Args:
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.

    Returns:
        lists: validation/test node pairs list and labels
    """
      
        
    if validation:
        node_pairs = pd.read_csv('data/train_val_node_pairs.csv')
        labels = pd.read_csv('data/train_val_labels.csv')
        
        train_node_pairs, val_node_pairs, train_labels, val_labels = train_test_split(node_pairs, 
                                                                                      labels, 
                                                                                      train_size=0.8, 
                                                                                      test_size=0.2, 
                                                                                      random_state=42)
        np.save('data/y_train_val', train_labels.to_numpy().ravel())
        np.save('data/y_val', val_labels.to_numpy().ravel())
        
        return train_node_pairs.to_numpy(), val_node_pairs.to_numpy()
    else:
        test_node_pairs = list()
        with open('data/test.txt', 'r') as f:
            for line in f:
                t = line.split(',')
                test_node_pairs.append((int(t[0]), int(t[1])))
                
        train_node_pairs = pd.read_csv('data/train_test_node_pairs.csv')
        train_labels = pd.read_csv('data/train_test_labels.csv')
        
        np.save('data/y_train', train_labels.to_numpy().ravel())
                
        return train_node_pairs.to_numpy(), test_node_pairs

### Default matrix creation

In [6]:
def create_default_matrix(G_train, G_test, validation = True):
    """Creates feature matrix with baseline features. 

    Args:
        G (NetworkX Graph): A Graph loaded with NetworkX library.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """

    n_features = 6
    
    
    if validation:
        train_node_pairs, val_node_pairs = load_info(validation = validation)
        X_val = np.zeros((len(val_node_pairs), n_features))
    else:
        train_node_pairs, test_node_pairs = load_info(validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))           
    X_train = np.zeros((len(train_node_pairs), n_features))
    
    authors = load_authors()
    abstracts = load_abstracts()
    
    for i, train_edge in tqdm(enumerate(train_node_pairs)):
        X_train = fill_default_matrix(i, G_train, X_train, train_edge, authors, abstracts)
    if validation:
        np.save('data/X_train_val_default.npy', X_train)
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val = fill_default_matrix(i, G_test, X_val, val_edge, authors, abstracts)
        np.save('data/X_val_default.npy', X_val)
    else:
        np.save('data/X_train_default.npy', X_train)
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_default_matrix(i, G_test, X_test, test_edge, authors, abstracts)
        np.save('data/X_test_default.npy', X_test)
        
        
def fill_default_matrix(i, G, X, edge, authors, abstracts):
    """Fills each line of the default feature matrix.

    Args:
        i (int): matrix line.
        G (nx graph): Graph loaded with NetworkX.
        X (np.ndarray): Shape (2 x n_edges, n_features)
        edge (list): list of tupple of nodes.

    Returns:
        np.ndarray: if p=1, returns X, y filled. Elif p=2, returns X filled.
    """
    X[i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    X[i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    X[i,2] = len(set(abstracts[edge[0]]).intersection(set(abstracts[edge[1]])))
    # try:
    X[i,3] = G.degree(edge[0]) + G.degree(edge[1])
    X[i,4] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    # except:
    #     X[i,3] = -1
    #     X[i,4] = -1
    X[i,5] = len(set(authors[edge[0]]).intersection(set(authors[edge[1]])))
    # X[i,5] = authors_intersection(authors[edge[0]], authors[edge[1]])
    # X[i,6] = first_author_in_common(authors[edge[0]], authors[edge[1]])

    return X

### Param matrix creation

In [7]:
def graph_properties(G):
    """Computes standard Graph properties with NetworkX
    Args:
        G ([nx graph)
    Returns:
        list of arrays, each array contains individual property 
        for each node.
    """
    
    print('Computing graph properties')
    avg_neighbor_degree = nx.average_neighbor_degree(G)
    pagerank = nx.pagerank_scipy(G)
    # eig_centrality = nx.eigenvector_centrality_numpy(G)
    # greedy_color = nx.greedy_color(G)
    # triangles = nx.triangles(G)

    return [avg_neighbor_degree, pagerank]#, eig_centrality, greedy_color, triangles]

def create_param_matrix(G_train, G_test, validation = True):
    """Creates Graph parameters feature matrix. 

    Args:
        G (NetworkX Graph): A Graph loaded with NetworkX library.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """

    
    n_features = 2*5
    
    if validation:
        train_node_pairs, val_node_pairs = load_info(validation = validation)
        X_val = np.zeros((len(val_node_pairs), n_features))
    else:
        train_node_pairs, test_node_pairs = load_info(validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))           
    X_train = np.zeros((len(train_node_pairs), n_features))
    
    
    avg_neighbor_degree = nx.average_neighbor_degree(G_train)
    pagerank = nx.pagerank(G_train)
    eig_centrality = nx.eigenvector_centrality_numpy(G_train)
    greedy_color = nx.greedy_color(G_train)
    triangles = nx.triangles(G_train)
    parameters = {'avg_neighbor_degree': avg_neighbor_degree, 'pagerank': pagerank, 'eig_centrality': eig_centrality,
                  'greedy_color': greedy_color, 'triangles': triangles}
                
    for i, train_edge in tqdm(enumerate(train_node_pairs)):
        X_train = fill_param_matrix(i, X_train, train_edge, **parameters)
    if validation:
        avg_neighbor_degree = nx.average_neighbor_degree(G_test)
        pagerank = nx.pagerank(G_test)
        eig_centrality = nx.eigenvector_centrality_numpy(G_test)
        greedy_color = nx.greedy_color(G_test)
        triangles = nx.triangles(G_test)
        parameters = {'avg_neighbor_degree': avg_neighbor_degree, 'pagerank': pagerank, 'eig_centrality': eig_centrality,
                      'greedy_color': greedy_color, 'triangles': triangles}
        np.save('data/X_train_val_param', X_train)
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val = fill_param_matrix(i, X_val, val_edge, **parameters)
        np.save('data/X_val_param', X_val)
    else:
        avg_neighbor_degree = nx.average_neighbor_degree(G_test)
        pagerank = nx.pagerank(G_test)
        eig_centrality = nx.eigenvector_centrality_numpy(G_test)
        greedy_color = nx.greedy_color(G_test)
        triangles = nx.triangles(G_test)
        parameters = {'avg_neighbor_degree': avg_neighbor_degree, 'pagerank': pagerank, 'eig_centrality': eig_centrality,
                      'greedy_color': greedy_color, 'triangles': triangles}
        np.save('data/X_train_param', X_train)
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_param_matrix(i, X_test, test_edge, **parameters)
        np.save('data/X_test_param.npy', X_test)
        
        
def fill_param_matrix(i, X, edge, avg_neighbor_degree, pagerank, eig_centrality, greedy_color, triangles):
    """Fills each line of the Graph parameters feature matrix."""
    
    X[i,0] = avg_neighbor_degree[edge[0]] + avg_neighbor_degree[edge[1]]
    X[i,1] = abs(avg_neighbor_degree[edge[0]] - avg_neighbor_degree[edge[1]])

    X[i,2] = pagerank[edge[0]] + pagerank[edge[1]]
    X[i,3] = abs(pagerank[edge[0]] - pagerank[edge[1]])

    X[i,4] = eig_centrality[edge[0]] + eig_centrality[edge[1]]
    X[i,5] = abs(eig_centrality[edge[0]] - eig_centrality[edge[1]])

    X[i,6] = greedy_color[edge[0]] + greedy_color[edge[1]]
    X[i,7] = abs(greedy_color[edge[0]] - greedy_color[edge[1]])

    X[i,8] = triangles[edge[0]] + triangles[edge[1]]
    X[i,9] = abs(triangles[edge[0]] - triangles[edge[1]])
    
    return X

### Embedding Matrix creation

In [29]:
def create_embeddings_matrix(distance, validation = True):
    """Creates Graph embeddings feature matrix.

    Args:
        distance (scipy spatial distance): Function that takes two 1d arrays
        and outputs a float. Recommended: cosine distance.
        validation (bool, optional): If True, will return validation lists.
        Else, will return test lists. Defaults to True.
    """

    n2v_parameters = {'walk_number': 10, 'walk_length': 15, 'dimensions': 64, 'window_size': 5}
    d2v_parameters = {'vector_size':128, 'window':5, 'min_count':2, 'epochs':100}
    walkets_parameters = {'walk_number': 10, 'walk_length': 80, 'dimensions': 64, 'window_size': 5}

    # d2v = Doc2Vec.load("data/abstracts_embedding_doc2vec_vs{:d}_w{:d}_mc{:d}_e{:d}".format(d2v_parameters['vector_size'], d2v_parameters['window'],
    #                                                                            d2v_parameters['min_count'], d2v_parameters['epochs']))

    specter_emb = np.load('data/embeddings_14_02.npy')
        
    embeddings = [specter_emb]#[d2v]
    
    if validation:
        embeddings_val = [specter_emb]
        
        n2v = np.load('data/embedding_n2v_train_val_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(n2v_parameters['walk_number'], n2v_parameters['walk_length'],
                                                                n2v_parameters['dimensions'], n2v_parameters['window_size']))
        
        walkets = np.load('data/embedding_Walklets_train_val_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(walkets_parameters['walk_number'], walkets_parameters['walk_length'],
                                                                walkets_parameters['dimensions'], walkets_parameters['window_size']))
    

        embeddings.append(n2v)
        embeddings_val.append(n2v)
        embeddings.append(walkets)
        embeddings_val.append(walkets)
        
        n_features = len(embeddings)
        print(f'n_features = {n_features}')
        train_node_pairs, val_node_pairs = load_info(validation = validation)
        X_val = np.zeros((len(val_node_pairs), n_features))
    else:
        embeddings_test = [specter_emb]
        
        n2v_test = np.load('data/embedding_n2v_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(n2v_parameters['walk_number'], n2v_parameters['walk_length'],
                                                                n2v_parameters['dimensions'], n2v_parameters['window_size']))
        n2v = np.load('data/embedding_n2v_train_test_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(n2v_parameters['walk_number'], n2v_parameters['walk_length'],
                                                                n2v_parameters['dimensions'], n2v_parameters['window_size']))
        walkets_test = np.load('data/embedding_Walklets_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(walkets_parameters['walk_number'], walkets_parameters['walk_length'],
                                                                walkets_parameters['dimensions'], 4))
        walkets = np.load('data/embedding_Walklets_train_test_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(walkets_parameters['walk_number'], walkets_parameters['walk_length'],
                                                                walkets_parameters['dimensions'], walkets_parameters['window_size']))
        
        embeddings.append(n2v)
        embeddings_test.append(n2v_test)
        
        embeddings.append(walkets)
        embeddings_test.append(walkets_test)
        
        n_features = len(embeddings)
        print(f'n_features = {n_features}')
        train_node_pairs, test_node_pairs = load_info(validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))           
    X_train = np.zeros((len(train_node_pairs), n_features))



    for i, train_edge in tqdm(enumerate(train_node_pairs)):
        X_train  = fill_embeddings_matrix(i, X_train, train_edge, embeddings, distance)
    if validation:
        np.save('data/X_train_val_embeddings', X_train)
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val = fill_embeddings_matrix(i, X_val, val_edge, embeddings_val, distance)
        np.save('data/X_val_embeddings', X_val)
    else:
        np.save('data/X_train_embeddings', X_train)
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_embeddings_matrix(i, X_test, test_edge,  embeddings_test, distance)
        np.save('data/X_test_embeddings', X_test)


def fill_embeddings_matrix(i, X, edge, embeddings, distance):
    """Fills each line of the Graph embedding feature matrix.

    Args:
        i (int): matrix line.
        G (nx graph): Graph loaded with NetworkX.
        X (np.ndarray): Shape (2 x n_edges, n_features)
        edge (list): list of tupple of nodes.
        embeddings (list): list of np.ndarrays containing embeddings for each node in the Graph
        distance (scipy spatial distance): Function that takes two 1d arrays
        and outputs a float. Recommended: cosine distance.

    Returns:
        np.ndarray: if p=1, returns X, y filled. Elif p=2, returns X filled.
    """

    for j, emb in enumerate(embeddings):
        # if j == 0:
        #     try:
        #         X[i,j] = emb.docvecs.similarity(edge[0], edge[1])
        #     except:
        #         X[i,j] = emb.dv.similarity(edge[0], edge[1])
        # else:
        X[i,j] = 1 - distance(emb[edge[0]], emb[edge[1]])

    return X

### Other features

In [9]:
def create_new_feature_matrix(G_train, G_test, measures, validation = True):
    n_features = len(measures)
    
    if validation:
        train_node_pairs, val_node_pairs = load_info(validation = validation)
        X_val = np.zeros((len(val_node_pairs), n_features))
    else:
        train_node_pairs, test_node_pairs = load_info(validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))           
    X_train = np.zeros((len(train_node_pairs), n_features))
        
    for i, train_edge in tqdm(enumerate(train_node_pairs)):
        X_train = fill_new_feature_matrix(i, G_train, X_train, train_edge, measures)
    
    if validation:
        np.save('data/X_train_val_new_features', X_train)
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val = fill_new_feature_matrix(i, G_test, X_val, val_edge, measures)
        np.save('data/X_val_new_features', X_val)
        return X_train, X_val

    else:
        np.save('data/X_train_new_features', X_train)
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_new_feature_matrix(i, G_test, X_test, test_edge, measures)
        np.save('data/X_test_new_features', X_test)
        return X_train, X_test
        
        
def fill_new_feature_matrix(i, G, X, edge, measures):
    for j, measure in enumerate(measures):
        X[i,j] = measure(edge[0], edge[1], G)
    
    return X

### Matrix concatenation & loading

In [10]:
def concatenate_matrix(default = True, param = True, embeddings = True, new_feature = True, validation = True):
    bool_list = [default, param, embeddings, new_feature]
    
    if validation:
        y_train = np.load('data/y_train_val.npy')
        X_train_default = np.load('data/X_train_val_default.npy')
        X_train_param = np.load('data/X_train_val_param.npy')
        X_train_embeddings = np.load('data/X_train_val_embeddings.npy')
        X_train_new_feature = np.load('data/X_train_val_new_features.npy')
        X_train_list = [X for i,X in enumerate([X_train_default, X_train_param, X_train_embeddings, X_train_new_feature]) if bool_list[i]]
        X_train = np.concatenate(X_train_list, axis=1)
        np.save('data/X_train_val', X_train)
    
        X_val_default = np.load('data/X_val_default.npy')
        X_val_param = np.load('data/X_val_param.npy')
        X_val_embeddings = np.load('data/X_val_embeddings.npy')
        X_val_new_feature = np.load('data/X_val_new_features.npy')
        X_val_list = [X for i,X in enumerate([X_val_default, X_val_param, X_val_embeddings, X_val_new_feature]) if bool_list[i]]
        X_val = np.concatenate(X_val_list, axis=1)
        np.save('data/X_val', X_val)
        y_val = np.load('data/y_val.npy')
        return X_train, X_val, y_train, y_val
    else:
        y_train = np.load('data/y_train.npy')
        X_train_default = np.load('data/X_train_default.npy')
        X_train_param = np.load('data/X_train_param.npy')
        X_train_embeddings = np.load('data/X_train_embeddings.npy')
        X_train_new_feature = np.load('data/X_train_new_features.npy')
        X_train_list = [X for i,X in enumerate([X_train_default, X_train_param, X_train_embeddings, X_train_new_feature]) if bool_list[i]]
        X_train = np.concatenate(X_train_list, axis=1)
        np.save('data/X_train', X_train)
        
        X_test_default = np.load('data/X_test_default.npy')
        X_test_param = np.load('data/X_test_param.npy')
        X_test_embeddings = np.load('data/X_test_embeddings.npy')
        X_test_new_feature = np.load('data/X_test_new_features.npy')
        X_test_list = [X for i,X in enumerate([X_test_default, X_test_param, X_test_embeddings, X_test_new_feature]) if bool_list[i]]
        X_test = np.concatenate(X_test_list, axis=1)
        np.save('data/X_test', X_test)
        return X_train, X_test, y_train

def load_matrix(validation = True):
    if validation:
        y_train = np.load('data/y_train_val.npy')
        X_train = np.load('data/X_train_val.npy')
        X_val = np.load('data/X_val.npy')
        y_val = np.load('data/y_val.npy')
        return X_train, X_val, y_train, y_val
    else:
        y_train = np.load('data/y_train.npy')
        X_train = np.load('data/X_train.npy')
        X_test = np.load('data/X_test.npy')
        return X_train, X_test, y_train

### Functions to add/remove new features

In [11]:
def add_new_feature(G_train, G_test, measure, distance, validation = True):
    n_features = 1
    
    if validation:
        train_node_pairs, val_node_pairs = load_info(validation = validation)
        X_val = np.zeros((len(val_node_pairs), n_features))
    else:
        train_node_pairs, test_node_pairs = load_info(validation = validation)
        X_test = np.zeros((len(test_node_pairs), n_features))           
    X_train = np.zeros((len(train_node_pairs), n_features))
        
    for i, train_edge in tqdm(enumerate(train_node_pairs)):
        X_train, y_train = fill_new_feature(i, G_train, X_train, train_edge, distance, measure)
    
    if validation:
        for i, val_edge in tqdm(enumerate(val_node_pairs)):
            X_val, y_val = fill_new_feature(i, G_test, X_val, val_edge, distance, measure)
        X_val_full = np.load('data/X_val.npy')
        X_train_full = np.load('data/X_train_val.npy')
        return np.concatenate((X_train_full, X_train), axis=1), np.concatenate((X_val_full, X_val), axis=1), y_train, y_val

    else:
        for i, test_edge in tqdm(enumerate(test_node_pairs)):
            X_test = fill_new_feature(i, G_test, X_test, test_edge, distance, measure)
        X_test_full = np.load('data/X_test.npy')
        X_train_full = np.load('data/X_train.npy')
        return np.concatenate((X_train_full, X_train), axis=1), np.concatenate((X_test_full, X_test), axis=1), y_train
        
        
def fill_new_feature(i, G, X, edge, distance, measure):
    X[i,0] = measure(edge[0], edge[1], G, distance)
    
    return X

def remove_last_feature(X_train, X_test):
    return X_train[:,:-1], X_test[:,:-1]
    

## Measures for new features

In [12]:
def shortest_path_length(n1, n2, G, distance=None):
    """
    Computes the shortest path length between two nodes in a graph.
    """
    try:
        length = nx.shortest_path_length(G, n1, n2)
    except:
        length = -1
    return length

def adamic_adar_index(n1, n2, G, distance=None):
    """
    Computes the adamic adar index of two nodes in a graph.
    """
    _, _, index = list(nx.adamic_adar_index(G, [(n1, n2)]))[0]
    return index


def pref_attachment(n1, n2, G, distance=None):
    """ 
    Computes the preferential attachment of two nodes in a graph.
    Useless
    """
    try:
        _, _, p = list(nx.preferential_attachment(G, [(n1, n2)]))[0]
    except:
        p = -1
    return p

def common_neighbor_centrality(n1, n2, G, distance=None):
    """ 
    Computes the common neighbor centrality of two nodes in a graph.
    """
    _, _, p = nx.common_neighbor_centrality(G, [(n1, n2)])
    return p

def ressouce_allocation_index(n1, n2, G, distance=None):
    try:
        _, _, p =  list(nx.resource_allocation_index(G, [(n1, n2)]))[0]
    except:
        p = -1
    return p

def salton_index(node_1, node_2, G, distance=None):
    """ 
    Computes the salton index of two nodes in a graph
    """
    sqrt_prod = np.sqrt(G.degree(node_1) * G.degree(node_1))
    if sqrt_prod == 0:
        return 0
    return len(list(nx.common_neighbors(G, node_1, node_2))) / sqrt_prod

def jaccard_coefficient(node_1, node_2, G, distance=None):
    """
    Computes the jaccard coefficient of two nodes in a graph.
    """
    _, _, coeff = list(nx.jaccard_coefficient(G, [(node_1, node_2)]))[0]
    return coeff

# Validation matrix

For validation, all parameters including embeddings should be computed using the validation graph. Results should be computed again on the full graph to submit a test result.

In [13]:
H_train = nx.read_edgelist('data/edgelist_train.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
H_val = nx.read_edgelist('data/edgelist_train.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)

## Matrix creation/loading and training

In [45]:
create_default_matrix(H_train, H_val, validation = True)

Loading unique authors preprocessed
Loading abstract preprocessed


279538it [00:08, 32799.24it/s]
69885it [00:02, 32698.92it/s]


In [55]:
create_param_matrix(H_train, H_val, validation = True)

  pagerank = nx.pagerank_scipy(G_train)
279538it [00:02, 102114.31it/s]
  pagerank = nx.pagerank_scipy(G_test)
69885it [00:00, 90227.78it/s]


In [30]:
# BAD: chebyshev, braycurtis, canberra, euclidian, jaccard
# TO TEST: cityblock, correlation, jensenshannon, mahalanobis, minkowski, seuclidean, sqeuclidean, wminkowski, dice, hamming
# BEST: cosine
distance = spatial.distance.cosine #
create_embeddings_matrix(distance = distance, validation = True)

n_features = 3


279538it [00:38, 7294.55it/s]
69885it [00:09, 7224.88it/s]


In [25]:
measures = [salton_index, adamic_adar_index, jaccard_coefficient, pref_attachment, shortest_path_length]#, ressouce_allocation_index]
create_new_feature_matrix(H_train, H_val, measures = measures, validation = True)

279538it [01:59, 2347.80it/s]
69885it [00:29, 2349.42it/s]


(array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.00000000e+00,
         8.00000000e+00],
        [2.94117647e-02, 2.81266414e-01, 2.85714286e-02, 6.80000000e+01,
         2.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.00000000e+00,
         7.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.10000000e+02,
         3.00000000e+00],
        [1.25000000e-01, 6.77351235e-01, 1.11111111e-01, 6.40000000e+01,
         2.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.08000000e+02,
         3.00000000e+00]]),
 array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.06400000e+03,
         3.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.40000000e+01,
         3.00000000e+00],
        [1.25000000e-01, 4.02429604e-01, 6.66666667e-02, 6.40000000e+01,
         2.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.00000000e+00,
        

In [31]:
X_train, X_val, y_train, y_val = concatenate_matrix(default = True, param = True, embeddings = True, new_feature = True, validation = True)

In [32]:
y_pred = train(X_train, X_val, y_train, y_val, model = LogisticRegression(max_iter = 300))

Validation loss = 0.1055


In [33]:
df_train = pd.DataFrame(data = X_train)
df_train.head(-40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,253.0,17.0,4.0,6.0,4.0,0.0,10.000000,2.000000,0.000013,0.000007,...,1.0,1.0,0.652660,0.268699,0.577658,0.000000,0.000000,0.000000,5.0,8.0
1,109.0,3.0,7.0,36.0,32.0,0.0,39.500000,8.500000,0.000029,0.000023,...,29.0,29.0,0.889476,0.391978,0.807312,0.029412,0.281266,0.028571,68.0,2.0
2,106.0,22.0,1.0,8.0,6.0,0.0,46.857143,40.857143,0.000009,0.000003,...,1.0,1.0,0.645946,0.147216,0.181125,0.000000,0.000000,0.000000,7.0,7.0
3,203.0,61.0,8.0,55.0,9.0,0.0,194.221467,6.908967,0.000028,0.000005,...,53.0,29.0,0.784023,0.486918,0.797763,0.062500,0.707382,0.037736,736.0,2.0
4,115.0,115.0,0.0,114.0,100.0,0.0,50.347130,12.795728,0.000064,0.000054,...,131.0,125.0,0.733468,0.246951,0.282478,0.000000,0.000000,0.000000,749.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279493,154.0,2.0,5.0,9.0,3.0,0.0,65.666667,7.000000,0.000012,0.000006,...,0.0,0.0,0.837018,0.154593,0.679506,0.000000,0.000000,0.000000,18.0,4.0
279494,107.0,107.0,0.0,6.0,2.0,0.0,13.750000,4.250000,0.000011,0.000005,...,1.0,1.0,0.680740,0.150152,0.546786,0.000000,0.000000,0.000000,8.0,9.0
279495,147.0,35.0,3.0,11.0,5.0,0.0,230.416667,156.916667,0.000009,0.000002,...,0.0,0.0,0.764498,0.229161,0.363083,0.000000,0.000000,0.000000,24.0,4.0
279496,223.0,21.0,12.0,95.0,55.0,0.0,43.523333,4.376667,0.000056,0.000032,...,52.0,2.0,0.894458,0.625933,0.635741,0.100000,0.768794,0.021505,1500.0,2.0


# Test matrix

In [20]:
G_train = nx.read_edgelist('data/edgelist_test.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
G_test = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)


## Matrix creation/loading and submission

In [44]:
create_default_matrix(G_train, G_test, validation = False)

Loading unique authors preprocessed
Loading abstract preprocessed


436781it [00:11, 39462.25it/s]
106692it [00:02, 47671.54it/s]


In [58]:
create_param_matrix(G_train, G_test, validation = False)

  pagerank = nx.pagerank_scipy(G_train)
436781it [00:03, 110282.24it/s]
  pagerank = nx.pagerank_scipy(G_test)
106692it [00:00, 149424.28it/s]


In [34]:
distance = spatial.distance.cosine #spatial.distance.cosine #euclidean_distance #
create_embeddings_matrix(distance = distance, validation = False)

n_features = 3


436781it [00:59, 7305.63it/s]
106692it [00:14, 7353.90it/s]


In [50]:
measures = [salton_index, adamic_adar_index, jaccard_coefficient, pref_attachment, shortest_path_length]
create_new_feature_matrix(G_train, G_test, measures = measures, validation = False)

436781it [03:41, 1972.56it/s]
106692it [00:58, 1819.72it/s]


(array([[1.33333333e-01, 4.18576972e-01, 6.89655172e-02, 2.40000000e+02,
         2.00000000e+00],
        [2.58620690e-02, 7.45454926e-01, 2.22222222e-02, 2.55200000e+03,
         2.00000000e+00],
        [3.55871886e-03, 8.37482479e-01, 3.47624565e-03, 1.93890000e+04,
         2.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.00000000e+01,
         5.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.80000000e+02,
         4.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.90000000e+02,
         4.00000000e+00]]),
 array([[1.61290323e-02, 4.55119613e-01, 1.08695652e-02, 1.92200000e+03,
         2.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.26000000e+02,
         4.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.09600000e+03,
         3.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.50000000e+01,
        

In [35]:
X_train, X_test, y_train = concatenate_matrix(default = True, param = True, embeddings = True, new_feature = True, validation = False)

In [36]:
y_pred = train(X_train, X_test, y_train, y_test = None, model = LogisticRegression(max_iter = 300))

Creating submission
Submision created


In [38]:
df_train = pd.DataFrame(data = X_train)
df_train.head(-40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,168.0,34.0,4.0,31.0,1.0,0.0,496.683333,48.816667,0.000012,1.061758e-06,...,25.0,11.0,0.772145,0.480954,0.728612,0.133333,0.418577,0.068966,240.0,2.0
1,245.0,43.0,14.0,138.0,94.0,0.0,211.132445,62.322100,0.000040,2.429915e-05,...,323.0,283.0,0.801074,0.285760,0.598979,0.025862,0.745455,0.022222,2552.0,2.0
2,237.0,25.0,11.0,866.0,820.0,0.0,202.770643,165.316313,0.000379,3.634374e-04,...,583.0,491.0,0.781474,0.322742,0.305678,0.003559,0.837482,0.003476,19389.0,2.0
3,108.0,42.0,5.0,18.0,8.0,0.0,33.984615,3.215385,0.000014,3.511014e-06,...,7.0,7.0,0.716314,0.102442,0.689577,0.000000,0.000000,0.000000,65.0,3.0
4,174.0,8.0,12.0,20.0,2.0,0.0,40.030303,9.303030,0.000013,1.207451e-07,...,9.0,3.0,0.855762,0.433182,0.863966,0.000000,0.000000,0.000000,99.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436736,202.0,12.0,3.0,46.0,20.0,0.0,484.717949,58.051282,0.000021,9.160719e-07,...,74.0,62.0,0.587851,0.052791,0.258725,0.000000,0.000000,0.000000,429.0,3.0
436737,360.0,220.0,7.0,56.0,52.0,0.0,25.259259,16.259259,0.000029,2.084571e-05,...,173.0,171.0,0.607258,0.190334,0.280383,0.000000,0.000000,0.000000,108.0,6.0
436738,155.0,55.0,2.0,6.0,2.0,0.0,15.000000,2.000000,0.000008,5.767449e-07,...,5.0,3.0,0.701786,0.066005,0.237775,0.000000,0.000000,0.000000,8.0,7.0
436739,122.0,2.0,3.0,16.0,6.0,0.0,347.400000,92.600000,0.000012,3.040237e-06,...,6.0,2.0,0.612892,0.124385,0.354304,0.000000,0.000000,0.000000,55.0,3.0
