# Node Classification

In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import arxiv

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from node2vec import Node2Vec as n2v

In [4]:
# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

## Fetch Data

In [6]:
def search_arxiv(queries, max_results = 100):
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'all_topics' : res.categories,
                'authors' : res.authors
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    d['year'] = pd.DatetimeIndex(d['date']).year
    
    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art:idx for idx,art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d

In [7]:
%%time
research_df = search_arxiv(
    queries = queries,
    max_results = 100
)
research_df.shape

CPU times: user 1.21 s, sys: 200 ms, total: 1.41 s
Wall time: 8.84 s


(646, 8)

## Create Network

In [2]:
def generate_network(df, node_col, edge_col):
    '''
    This function will generate a article to article network given an input DataFrame.
    It will do so by creating an edge_dictionary where each key is going to be a node
    referenced by unique values in node_col and the values will be a list of other nodes
    connected to the key through the edge_col.
    
    params:
        df (DataFrame) : The dataset which holds the node and edge columns
        node_col (String) : The column name associated to the nodes of the network
        edge_col (String) : The column name associated to the edges of the network
        
    returns:
        A networkx graph corresponding to the input dataset
        
    example:
        generate_network(
            research_df,
            node_col = 'article_id',
            edge_col = 'main_topic'
        )
    '''
    edge_dct = {}
    for i,g in df.groupby(node_col):
        topics = g[edge_col].unique()
        edge_df = df[(df[node_col] != i) & (df[edge_col].isin(topics))]
        edges = list(edge_df[node_col].unique())
        edge_dct[i] = edges
    
    # create nx network
    g = nx.Graph(edge_dct, create_using = nx.MultiGraph)
    return g

In [8]:
all_tp = research_df.explode('all_topics').copy()

In [9]:
%%time
tp_nx = generate_network(
    all_tp, 
    node_col = 'article_id', 
    edge_col = 'all_topics'
)

CPU times: user 400 ms, sys: 38.6 ms, total: 439 ms
Wall time: 588 ms


In [10]:
print(nx.info(tp_nx))

Name: 
Type: Graph
Number of nodes: 554
Number of edges: 27745
Average degree: 100.1625


## Apply Node2Vec

In [11]:
%time g_emb = n2v(tp_nx, dimensions=16)

Computing transition probabilities:   0%|          | 0/554 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:13<00:00,  1.33s/it]

CPU times: user 33.1 s, sys: 485 ms, total: 33.6 s
Wall time: 34.5 s





In [12]:
WINDOW = 1 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

In [13]:
mdl = g_emb.fit(
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

In [14]:
emb_df = (
    pd.DataFrame(
        [mdl.wv.get_vector(str(n)) for n in tp_nx.nodes()],
        index = tp_nx.nodes
    )
)

In [15]:
emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.207463,0.062599,0.316774,0.759136,0.531327,0.218926,1.292739,0.829721,0.227742,0.811851,0.184792,-0.461864,-0.256434,-0.607792,-0.297039,0.720587
1,0.18353,-0.303806,-0.191326,0.695051,0.004614,0.516913,1.419496,0.71216,-0.516897,-0.066074,-0.123888,-0.263573,0.205573,-0.329295,-0.67991,0.548596
2,0.136323,-0.342012,-0.203393,0.679367,-0.072208,0.576346,1.321501,0.549285,-0.382767,0.075622,-0.218555,-0.346983,0.309274,-0.335234,-0.661385,0.696024
3,-0.071042,-0.024264,0.374454,0.676078,0.13281,0.264118,1.28135,0.693255,-0.042121,0.912933,0.303049,-0.381561,-0.430063,-0.631585,-0.392021,0.633346
4,0.161672,-0.338293,-0.258853,0.834578,-0.090691,0.538172,1.25841,0.596693,-0.534008,0.191207,-0.226571,-0.304185,0.35893,-0.472012,-0.726286,0.616967


## Train Model

## Evaluate Model

## Predictions

## Concluding Remarks


## Resources

---