## Feature Engineering - Coauthors

In [1]:
import numpy as np
import pandas as pd
import math
import networkx as nx
from itertools import combinations

from joblib import Parallel, delayed
from tqdm import *

In [2]:
# Read both splits; lines=True makes pandas parse one JSON record per line.
data_tran = pd.read_json('data2/data_tran.json', lines=True, orient='records')
data_test = pd.read_json('data2/data_test.json', lines=True, orient='records')

# Number of rows in each split.
n_tran, n_test = len(data_tran), len(data_test)

In [3]:
def get_coauthors_graph(data):
    """Build an undirected, weighted co-occurrence graph of authors.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data['authors']`` must iterate over collections of author
        identifiers (presumably one collection per record -- confirm
        against the input files).

    Returns
    -------
    networkx.Graph
        One edge per pair of authors that appear together in at least one
        collection; the edge's ``'weight'`` attribute counts how many times
        the pair was seen.
    """
    graph = nx.Graph()

    for author_group in data['authors']:
        # Every unordered pair inside the same group is one co-occurrence.
        for first, second in combinations(author_group, 2):
            if not graph.has_edge(first, second):
                # First sighting of this pair: create the edge.
                graph.add_edge(first, second, weight=1)
                continue
            # Pair already linked: bump its co-occurrence count.
            graph[first][second]['weight'] += 1

    return graph

# Build the co-authorship graph from the training split only.
coauthors_graph = get_coauthors_graph(data_tran)

In [4]:
def get_coauthors_vector(graph, start_nodes, n_targets=100):
    """Accumulate depth-discounted edge weights onto low-numbered nodes.

    For every start node present in ``graph`` an iterative depth-first
    traversal is run with its own ``visited`` set. Each time a node is
    expanded at traversal depth ``d`` (the start node has ``d == 1``), every
    incident edge whose other endpoint ``v`` satisfies ``0 <= v < n_targets``
    adds ``edge['weight'] / (d * ln(d + 1))`` to column ``v`` of the result.
    Contributions from all start nodes are summed into the same vector.

    Parameters
    ----------
    graph : networkx.Graph or mapping
        Anything supporting ``node in graph`` and ``graph[node].items()``
        yielding ``(neighbor, attrs)`` pairs where ``attrs['weight']`` is
        numeric. Node labels are compared against integers, so they are
        assumed to be integer ids -- TODO confirm against the data files.
    start_nodes : iterable
        Nodes to start a traversal from. Nodes missing from ``graph`` are
        skipped silently.
    n_targets : int, optional
        Width of the output vector; only neighbors with an id in
        ``[0, n_targets)`` receive weight. Defaults to 100.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_targets)``.

    Notes
    -----
    ``d`` is the depth along the depth-first traversal path, not the
    shortest hop distance from the start node. A node reachable by a short
    path can therefore be expanded at a larger depth, depending on the order
    in which neighbors are iterated.
    NOTE(review): if hop distance was intended, a breadth-first traversal is
    needed; left unchanged here because it would alter the feature values.
    """
    node_array = np.zeros((1, n_targets))

    for start_node in start_nodes:
        if start_node not in graph:
            continue

        stack = [(start_node, 1)]
        visited = set()

        while stack:
            node, depth = stack.pop()

            # A node may be pushed several times before it is first expanded.
            if node in visited:
                continue
            visited.add(node)

            # The discount depends only on the depth, so compute it once per
            # expanded node instead of once per neighbor.
            discount = 1 / (depth * math.log(depth + 1))

            for neighbor, edge in graph[node].items():
                if 0 <= neighbor < n_targets:
                    node_array[0, neighbor] += edge['weight'] * discount
                if neighbor not in visited:
                    stack.append((neighbor, depth + 1))

    return node_array

In [5]:
def get_coauthors_matrix(data, graph):
    """Stack one coauthor feature vector per row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'coauthors'`` column; each value is passed to
        ``get_coauthors_vector`` as its ``start_nodes`` argument.
    graph : networkx.Graph
        Graph handed unchanged to ``get_coauthors_vector`` for every row.

    Returns
    -------
    numpy.ndarray
        The per-row vectors concatenated along axis 0, in row order. For an
        empty ``data`` a zero-row matrix with the same number of columns is
        returned.
    """
    # Iterate the column directly: ``iterrows`` would build a full Series for
    # every row just to read this single field.
    coauthor_lists = data['coauthors']

    vectors_list = Parallel(n_jobs=-1)(
        delayed(get_coauthors_vector)(graph, coauthors)
        for coauthors in tqdm(coauthor_lists, total=len(data))
    )

    if not vectors_list:
        # np.concatenate rejects an empty sequence. Ask the vector builder for
        # an all-zero row and keep none of it, so the width stays in sync with
        # whatever get_coauthors_vector produces.
        return get_coauthors_vector(graph, [])[:0]

    return np.concatenate(vectors_list, axis=0)

In [6]:
# Coauthor features for both splits, computed against the same graph.
x_tran_coauthors = get_coauthors_matrix(data=data_tran, graph=coauthors_graph)
x_test_coauthors = get_coauthors_matrix(data=data_test, graph=coauthors_graph)

# Persist the feature matrices as .npy files.
np.save(file='data2/x_tran_coauthors.npy', arr=x_tran_coauthors)
np.save(file='data2/x_test_coauthors.npy', arr=x_test_coauthors)

100%|██████████| 8460/8460 [06:12<00:00, 22.71it/s]
100%|██████████| 800/800 [00:30<00:00, 25.81it/s]
