<center><h2>ALTEGRAD Project</h2>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez</span><br>
<hr>
</center>

# Imports

In [1]:
import networkx as nx
import os
import csv
import numpy as np
import pandas as pd
from random import randint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

# !pip install pip install karateclub
from gensim.models.doc2vec import Doc2Vec
from os import path
import pickle
from scipy import spatial
import random
try:
    from google.colab import files
except:
    pass


In [2]:
if not path.isdir('data'):
    !mkdir data
    !wget -O altegrad.zip https://www.dropbox.com/sh/fhfjjtk0sr7pmse/AAD4ZEtHv9OI5HfVO22tdMX0a?dl=1
    !unzip altegrad.zip
else:
    print('Data already downloaded')

Data already downloaded


# Preprocessing

In [2]:
# Create a graph
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


In [3]:
# Read the abstract of each paper
try:
    a_file = open("data/authors_preprocessed.pkl", "rb")
    authors = pickle.load(a_file)
    a_file.close()
    print('Authors already preprocessed')
except:
    print('Preprocessing authors')
    authors = dict()
    with open('data/authors.txt', 'r') as f:
        for line in tqdm(f):
            node, author = line.split('|--|')
            # author = author.lower()
            author = author.split(',')
            author[-1] = author[-1].strip()
            authors[int(node)] = set(author)
        # for node in authors:
        #     authors[node] = set(authors[node].split())
        a_file = open("data/authors_preprocessed.pkl", "wb")
        pickle.dump(authors, a_file)
        a_file.close()
    print('Preprocessing Done')

authors[0]

18670it [00:00, 186697.06it/s]

Preprocessing authors


138499it [00:01, 103847.65it/s]

Preprocessing Done





{'George W. Irwin', 'James H. Niblock', 'Jian-Xun Peng', 'Karen R. McMenemy'}

In [18]:
# Read the abstract of each paper
try:
    a_file = open("data/abstract_preprocessed.pkl", "rb")
    abstracts = pickle.load(a_file)
    a_file.close()
    print('Abstract already preprocessed')
except:
    print('Preprocessing abstracts')
    stop_words = stopwords.words('english')
    porter = PorterStemmer()
    abstracts = dict()
    with open('data/abstracts.txt', 'r') as f:
        for line in tqdm(f):
            node, abstract = line.split('|--|')
            abstract = abstract.lower()
            # abstract = "".join([char for char in abstract if char not in string.punctuation])
            abstract = word_tokenize(abstract)
            abstract = [word for word in abstract if word not in stop_words]
            # abstract = [porter.stem(word) for word in abstract]
            abstracts[int(node)] = abstract
    a_file = open("data/abstract_preprocessed.pkl", "wb")
    pickle.dump(abstracts, a_file)
    a_file.close()
    print('Preprocessing Done')

Preprocessing abstracts


138499it [03:09, 730.64it/s] 


Preprocessing Done


In [19]:
abstracts[0][:10]

['development',
 'automated',
 'system',
 'quality',
 'assessment',
 'aerodrome',
 'ground',
 'lighting',
 '(',
 'agl']

# Embeddings

In [50]:
def validation_score(model, X_train, y_train, cv, scoring = 'neg_log_loss', n_jobs = None, verbose = 0):
    """Computes scores using cross validation for a given model.

    Args:
        model: classifier
        X_train (array like): training set.
        y_train (array like): training lavels.
        cv (int): number of splits.
        scoring (str, optional): Metric. Defaults to 'neg_log_loss'.
        n_jobs (int, optional): Number of cores. Defaults to None.
        verbose (int, optional): Verbose level for cross_val_score

    Returns:
        list: list containing scores from cross validation
    """
    print('Cross validation')
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = make_scorer(log_loss, greater_is_better = False, needs_proba = True), n_jobs = n_jobs, verbose = verbose)
    print('Score: {:.2f} ± {:.2f}'.format(-scores.mean(), scores.std()/np.sqrt(cv)))
    
    return scores

def create_submission(model, G, G_params, X_train, y_train):
    # # Read test data. Each sample is a pair of nodes
    print('Creating submission')
    node_pairs = list()
    with open('data/test.txt', 'r') as f:
        for line in f:
            t = line.split(',')
            node_pairs.append((int(t[0]), int(t[1])))

    X_test, _ = create_dataset(G, G_params, edges = node_pairs, training = False)
    
    # Use logistic regression to predict if two nodes are linked by an edge
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    y_pred = y_pred[:,1]
    df = pd.DataFrame(X_test)
    print(df.head(15))

    # Write predictions to a file
    predictions = zip(range(len(y_pred)), y_pred)
    os.remove("data/submission.csv")
    with open("data/submission.csv","w") as pred:
        csv_out = csv.writer(pred)
        csv_out.writerow(['id','predicted'])
        for row in predictions:
            csv_out.writerow(row)
    print('Submision created')


def graph_properties(G):
    print('Computing graph properties')
    avg_neighbor_degree = nx.average_neighbor_degree(G)
    eig_centrality = nx.eigenvector_centrality_numpy(G)
    pagerank = nx.pagerank_scipy(G)
    greedy_color = nx.greedy_color(G)
    triangles = nx.triangles(G)
    core_number = nx.core_number(G)
    onion_number = nx.onion_layers(G)
    degree_centrality = nx.degree_centrality(G)
    clustering = nx.clustering(G)

    return [avg_neighbor_degree, eig_centrality, pagerank, greedy_color, triangles, core_number, onion_number, degree_centrality, clustering]
            
def create_dataset(G, G_params, edges = None, training = True): # 

    if training:
        p = 2
        edges = G.edges()
    else:
        p = 1
    m = len(edges)
    X_train = np.zeros((p*m, 6 + 2*len(G_params)))
    y_train = np.zeros(p*m)
    model = Doc2Vec.load("data/abstracts_embedding_doc2vec_vs128_w5_mc2_e100")
    for i,edge in tqdm(enumerate(edges)):
        # an edge
        X_train[p*i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
        X_train[p*i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
        X_train[p*i,2] = len(set(abstracts[edge[0]]).intersection(set(abstracts[edge[1]])))
        X_train[p*i,3] = G.degree(edge[0]) + G.degree(edge[1])
        X_train[p*i,4] = abs(G.degree(edge[0]) - G.degree(edge[1]))
        # X_train[p*i,5] = len(authors[edge[0]].intersection(authors[edge[1]]))
        cos_distance = spatial.distance.cosine(model[edge[0]], model[edge[1]])
        # print(cos_distance)
        X_train[p*i,5] = cos_distance
        for j in range(6, len(G_params), 2):
            param = G_params[j]
            X_train[p*i,j] = param[edge[0]] + param[edge[1]]
            X_train[p*i,j+1] = abs(param[edge[0]] - param[edge[1]])
        
        if training:
            y_train[2*i] = 1

            # a randomly generated pair of nodes
            random.seed(2*1) 
            n1 = randint(0, n-1)
            random.seed(2*i+1) 
            n2 = randint(0, n-1)
            X_train[2*i+1,0] = len(abstracts[n1]) + len(abstracts[n2])
            X_train[2*i+1,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
            X_train[2*i+1,2] = len(set(abstracts[n1]).intersection(set(abstracts[n2])))
            X_train[2*i+1,3] = G.degree(n1) + G.degree(n2)
            X_train[2*i+1,4] = abs(G.degree(n1) - G.degree(n2))
            # X_train[2*i+1,5] = len(authors[n1].intersection(abstracts[n2]))
            cos_distance = spatial.distance.cosine(model[n1], model[n2])
            # print(cos_distance)
            X_train[2*i+1,5] = cos_distance
            for j in range(6, len(G_params), 2):
                param = G_params[j]
                X_train[2*i+1,j] = param[n1] + param[n2]
                X_train[2*i+1,j+1] = abs(param[n1] - param[n2])

            y_train[2*i+1] = 0
        
    print('Size of the matrix:', X_train.shape)
    
    return X_train, y_train
    


## Create Dataset

In [None]:
# Computes node parameters for graph G and stores them in a list
G_params = graph_properties(G)

In [None]:
X_train, y_train = create_dataset(G, G_params = list())

Visualize dataset

In [None]:
df_train = pd.DataFrame(data=X_train)
df_train.head(10)

Test results with cross-validation

In [None]:
clf = LogisticRegression()
scores = validation_score(clf, X_train, y_train, cv = 4, n_jobs = 4, verbose = 1)

Create submission

In [None]:
clf = LogisticRegression()
create_submission(clf, G, [], X_train, y_train)

## Testing embedding properties and other ideas

In [None]:
def nodes_connected(G, u, v):
    return u in G.neighbors(v)

nodes_connected(G, 17, 0)

In [24]:
d2v = Doc2Vec.load("data/abstracts_embedding_doc2vec_vs64_w5_mc2_e100")

neighbor = nx.single_source_shortest_path_length(G, 1, cutoff=3)
# neighbor = {v: k for k, v in neighbor.items()}

for i in list(neighbor.keys())[1:10]:
    if nodes_connected(G, i, 1):
        print('Connected {:d}-{:d}: {:.2f}, {:d}'.format(i, 0, spatial.distance.cosine(d2v[i], d2v[1]), len(authors[i].intersection(authors[0]))))
    else:
        print('Not Connected {:d}-{:d}: {:.2f}, {:d}'.format(i, 0, spatial.distance.cosine(d2v[i], d2v[1]), len(authors[i].intersection(authors[0]))))

Connected 0-0: 0.09, 4
Connected 3-0: 0.59, 0
Connected 5-0: 1.10, 0
Connected 6-0: 0.65, 0
Connected 7-0: 0.61, 0
Connected 9-0: 0.69, 0
Connected 10-0: 0.70, 1
Connected 11-0: 0.54, 0
Connected 12-0: 0.61, 0


In [25]:
for i in range(0,20):
    # random.seed(i)
    k1 = randint(0, n-1)
    k2 = randint(0, n-1)
    if nodes_connected(G, k1, k2):
        print('Connected {:d}-{:d}: {:.2f}'.format(k1, 0, spatial.distance.cosine(d2v[k1], d2v[k2])))
    else:
        print('Not Connected {:d}-{:d}: {:.2f}'.format(k2, 0, spatial.distance.cosine(d2v[k1], d2v[k2])))

Not Connected 57843-0: 1.05
Not Connected 37538-0: 0.93
Not Connected 65147-0: 0.87
Not Connected 106678-0: 0.93
Not Connected 18223-0: 0.95
Not Connected 89973-0: 0.91
Not Connected 125515-0: 0.95
Not Connected 73222-0: 1.09
Not Connected 64208-0: 0.99
Not Connected 4594-0: 0.94
Not Connected 19913-0: 0.95
Not Connected 57703-0: 0.89
Not Connected 23905-0: 0.99
Not Connected 122321-0: 0.92
Not Connected 9201-0: 1.02
Not Connected 101385-0: 0.98
Not Connected 61178-0: 0.75
Not Connected 88102-0: 0.87
Not Connected 111660-0: 0.78
Not Connected 90555-0: 1.03


In [26]:
for i, edge in enumerate(G.edges()):
    print('Connected {:d}-{:d}: {:.2f}'.format(edge[0], edge[1], spatial.distance.cosine(d2v[edge[0]], d2v[edge[1]])))
    if i==30:
        break

Connected 0-1: 0.09
Connected 0-2: 0.86
Connected 1-3: 0.59
Connected 1-5: 1.10
Connected 1-6: 0.65
Connected 1-7: 0.61
Connected 1-9: 0.69
Connected 1-10: 0.70
Connected 1-11: 0.54
Connected 1-12: 0.61
Connected 1-13: 0.58
Connected 1-14: 0.59
Connected 1-15: 0.67
Connected 1-16: 0.66
Connected 1-17: 0.84
Connected 1-19: 1.00
Connected 1-20: 0.66
Connected 1-21: 0.74
Connected 1-22: 0.60
Connected 1-23: 0.61
Connected 1-24: 0.72
Connected 2-25: 0.82
Connected 2-26: 0.91
Connected 2-27: 0.99
Connected 2-28: 1.00
Connected 2-29: 0.99
Connected 2-30: 1.05
Connected 2-31: 1.04
Connected 2-32: 0.87
Connected 2-33: 0.92
Connected 2-34: 0.93


In [4]:
for i,edge in tqdm(enumerate(G.edges())):
    print(len(set(authors[edge[0]]).intersection(set(authors[edge[1]]))))
    # cos_distance = spatial.distance.cosine(d2v[edge[0]], d2v[edge[1]])
    # print(cos_distance)
   
    if i == 20:
        break

20it [00:00, 4169.29it/s]

1
0
0
0
1
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0



