<center><h2>ALTEGRAD Project </h2>
<h3>PREPROCESSING</h3>

<hr>
<span style="font-variant: small-caps;">Xavier Jiménez, Jean Quentin, Sacha Revol</span><br>
<hr>
</center>

# Imports

In [1]:
import networkx as nx
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from tqdm import tqdm

import pickle
import random

import networkx as nx
from stellargraph import StellarGraph
from stellargraph import data
import pandas as pd

# Preprocessing

Citation Graph is loaded from `edgelist.txt`

In [2]:
# Create a graph
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


## Authors

Preprocess `authors.txt` and save it as a dictionary `authors_preprocessed.pkl`

In [28]:
print('Preprocessing authors')
authors = dict()
punctuation = "!#$%&'()*+-./:<=>?@[\]^_`{|}~"
with open('data/authors.txt', 'r', encoding="utf-8") as f:
    for line in tqdm(f):
        node, author = line.split('|--|')
        author = author.lower()
        author = "".join([char for char in author if char not in punctuation])
        author = author.split(',')        
        author[-1] = author[-1].strip()
        authors[int(node)] = author
    a_file = open("data/authors_preprocessed.pkl", "wb")
    pickle.dump(authors, a_file)
    a_file.close()
print('Preprocessing Done')
authors[0]

20330it [00:00, 203034.92it/s]

Preprocessing authors


138499it [00:01, 130319.48it/s]


Preprocessing Done


['james h niblock', 'jianxun peng', 'karen r mcmenemy', 'george w irwin']

In [3]:
# Read the abstract of each paper
try:
    print('Loading Authors preprocessed')
    a_file = open("data/authors_preprocessed.pkl", "rb")
    authors = pickle.load(a_file)
    a_file.close()
except:
    raise SyntaxError("File 'authors_preprocessed.pkl' was not found in 'data/'")

Loading Authors preprocessed


Create a dictionnary from `authors_preprocessed.pkl` with unique names (e.j. "X. Jimenez" and "Xavier Jimenez" are considered as the same author) and saved it as `unique_authors.pkl`.

In [4]:
from namematcher import NameMatcher
from tqdm import tqdm 

def name_similarity(name1, name2):
    """Computes similarity between two names. If similarity is above 0.9 and both names have the
    same letter for the first name, it will return True if one of the authors first name has only
    one letter. False otherwise. If score is 1, returns True.

    Args:
        name1 (str): author name
        name2 (str): author name
        debug (bool, optional): If True, will print authors name and similarity score. Defaults to False.

    Returns:
        bool: True if names are the same. False otherwise.
    """
    if name1 == name2:
        return True
    elif name1.split(" ")[-1] != name2.split(" ")[-1]:
        return False
    
    name_matcher = NameMatcher()
    score = name_matcher.match_names(name1, name2)

    if score >= 0.9:
        if name1.split(" ")[0][0] == name2.split(" ")[0][0]:
            if len(name1.split(" ")[0]) == 1 and len(name2.split(" ")[0]) > 1:
                return True
            elif len(name2.split(" ")[0]) == 1 and len(name1.split(" ")[0]) > 1:
                return True
            else:
                return False
        else:
            return False
    else:
        return False

In [5]:
authors_list = list()
for i in tqdm(range(len(authors))):
    for author in authors[i]:
        authors_list.append(author)
authors_list = list(set(authors_list))

100%|██████████| 138499/138499 [00:00<00:00, 1257017.84it/s]


In [7]:
authors_dict = dict()
for i in tqdm(range(0, len(authors_list))):
    similar_authors = list()
    for j in range(0, len(authors_list)):
        if name_similarity(authors_list[i], authors_list[j]):
            similar_authors.append(authors_list[j])
        elif i == j:
            similar_authors.append(authors_list[i])
    authors_dict[authors_list[i]] = np.sort(similar_authors)
    
a_file = open("data/similar_authors_dict.pkl", "wb")
pickle.dump(authors_dict, a_file)
a_file.close()

100%|██████████| 146122/146122 [5:00:00<00:00,  8.12it/s]


In [37]:
try:
    print('Loading similar authors')
    a_file = open("data/similar_authors_dict.pkl", "rb")
    authors_dict = pickle.load(a_file)
    a_file.close()
except:
    raise SyntaxError("File 'similar_authors_dict.pkl' was not found in 'data/'")

unique_authors = dict()
for i in tqdm(range(len(authors))):
    authors_list = list()
    for author in authors[i]:
        if len(authors_dict[author]) < 3:
            authors_list.append(authors_dict[author][-1])
        else:
            authors_list.append(author)
    unique_authors[i] = authors_list

a_file = open("data/unique_authors_dict.pkl", "wb")
pickle.dump(unique_authors, a_file)
a_file.close()
        

Loading similar authors


100%|██████████| 138499/138499 [00:01<00:00, 99087.57it/s]


In [38]:
try:
    print('Loading unique authors')
    a_file = open("data/unique_authors_dict.pkl", "rb")
    unique_authors = pickle.load(a_file)
    a_file.close()
except:
    raise SyntaxError("File 'unique_authors_dict.pkl' was not found in 'data/'")

f = open("data/unique_authors.txt", 'w')
for i in tqdm(range(len(unique_authors))):
    f.write('{:d}|--|'.format(i) + ",".join(unique_authors[i]) + '\n')
f.close()

Loading unique authors


100%|██████████| 138499/138499 [00:00<00:00, 604902.17it/s]


In [40]:
authors_list = list()
for i in tqdm(range(len(authors))):
    for author in authors[i]:
        authors_list.append(author)
print(len(authors_list))
print(len(list(set(authors_list))))

100%|██████████| 138499/138499 [00:00<00:00, 1272804.96it/s]

456810
146122





## Abstracts

Preprocess `abstracts.txt` and save it as a dictionary `abstract_preprocessed.pkl`. Common NLP operations are made: lowercase, remove punctuation, tokenize, remove stop words.

In [None]:
print('Preprocessing abstracts')
stop_words = stopwords.words('english')
porter = PorterStemmer()
abstracts = dict()
with open('data/abstracts.txt', 'r', encoding="utf-8") as f:
    for line in tqdm(f):
        node, abstract = line.split('|--|')
        abstract = abstract.lower()
        abstract = "".join([char for char in abstract if char not in string.punctuation])
        abstract = word_tokenize(abstract)
        abstract = [word for word in abstract if word not in stop_words]
        # abstract = [porter.stem(word) for word in abstract]
        abstracts[int(node)] = abstract
a_file = open("data/abstract_preprocessed.pkl", "wb")
pickle.dump(abstracts, a_file)
a_file.close()
print('Preprocessing Done')
abstracts[0][:10]

## Dataset split

Citation Graph is loaded from `edgelist.txt` and random edges are removed from it (p=5%). These removed edges will be used to create a validation set that mirrors the test set `test.txt`. New graph is saved as `edgelist_val.txt` and will be loaded as H instead of G. If p is changed, the value should be changed as well on other .ipynb files.

In [None]:
nx_G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
G = StellarGraph.from_networkx(nx_G, node_type_default="paper", edge_type_default="cites")

In [None]:
# Define an edge splitter on the original graph:
edge_splitter_test = data.EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
graph_train_test, examples_test, labels_test = edge_splitter_test.train_test_split(p=0.25, 
                                                                             method="global", 
                                                                             keep_connected=True, 
                                                                             seed=42)

nx.write_edgelist(graph_train_test.to_networkx(),
                  'data/edgelist_test.txt',
                  delimiter=',',
                  data=False)

# Save test pairs
pd.DataFrame(examples_test).to_csv('data/train_test_node_pairs.csv', index=False, header=False)
# Save test targets
pd.DataFrame(labels_test).to_csv('data/train_test_labels.csv', index=False, header=False)

In [None]:
# Do the same process to compute a training subset from within the test graph
edge_splitter_train = data.EdgeSplitter(graph_train_test, G)
graph_train_val, examples, labels = edge_splitter_train.train_test_split(p=0.2, 
                                                                     method="global", 
                                                                     keep_connected=True, 
                                                                     seed=42)

nx.write_edgelist(graph_train_val.to_networkx(),
                  'data/edgelist_train.txt',
                  delimiter=',',
                  data=False)

# Save training pairs
pd.DataFrame(examples).to_csv('data/train_val_node_pairs.csv', index=False, header=False)
# Save training targets
pd.DataFrame(labels).to_csv('data/train_val_labels.csv', index=False, header=False)