# Node Classification

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import arxiv

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from node2vec import Node2Vec as n2v

In [2]:
# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

## Fetch Data

In [3]:
def search_arxiv(queries, max_results = 100):
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'all_topics' : res.categories,
                'authors' : res.authors
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    d['year'] = pd.DatetimeIndex(d['date']).year
    
    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art:idx for idx,art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d

In [4]:
%%time
research_df = search_arxiv(
    queries = queries,
    max_results = 100
)
research_df.shape

CPU times: user 1.21 s, sys: 84.3 ms, total: 1.29 s
Wall time: 10 s


(646, 8)

## Create Network

In [5]:
def generate_network(df, node_col, edge_col):
    '''
    This function will generate a article to article network given an input DataFrame.
    It will do so by creating an edge_dictionary where each key is going to be a node
    referenced by unique values in node_col and the values will be a list of other nodes
    connected to the key through the edge_col.
    
    params:
        df (DataFrame) : The dataset which holds the node and edge columns
        node_col (String) : The column name associated to the nodes of the network
        edge_col (String) : The column name associated to the edges of the network
        
    returns:
        A networkx graph corresponding to the input dataset
        
    example:
        generate_network(
            research_df,
            node_col = 'article_id',
            edge_col = 'main_topic'
        )
    '''
    edge_dct = {}
    for i,g in df.groupby(node_col):
        topics = g[edge_col].unique()
        edge_df = df[(df[node_col] != i) & (df[edge_col].isin(topics))]
        edges = list(edge_df[node_col].unique())
        edge_dct[i] = edges
    
    # create nx network
    g = nx.Graph(edge_dct, create_using = nx.MultiGraph)
    return g

In [6]:
%%time
tp_nx = generate_network(
    research_df, 
    node_col = 'article_id', 
    edge_col = 'main_topic'
)

CPU times: user 321 ms, sys: 7.12 ms, total: 328 ms
Wall time: 336 ms


In [7]:
print(nx.info(tp_nx))

Name: 
Type: Graph
Number of nodes: 554
Number of edges: 11689
Average degree:  42.1986


## Apply Node2Vec

In [8]:
%time g_emb = n2v(tp_nx, dimensions=16)

Computing transition probabilities:   0%|          | 0/554 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:09<00:00,  1.05it/s]

CPU times: user 13.7 s, sys: 107 ms, total: 13.8 s
Wall time: 13.8 s





In [9]:
WINDOW = 1 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

In [10]:
mdl = g_emb.fit(
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

In [11]:
emb_df = (
    pd.DataFrame(
        [mdl.wv.get_vector(str(n)) for n in tp_nx.nodes()],
        index = tp_nx.nodes
    )
)

In [12]:
emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.838726,-0.235961,0.678999,0.272436,1.040902,0.545415,-0.076529,-0.823694,-0.180377,-0.419546,-0.37856,-1.136071,0.917937,0.525844,-0.399737,0.757854
1,-0.854508,-0.151853,0.553616,0.195111,1.057672,0.700733,-0.054394,-0.762913,-0.143613,-0.328154,-0.380083,-1.128442,1.001008,0.498544,-0.368707,0.743062
2,-0.926988,-0.10457,0.664905,0.264477,1.10911,0.617614,-0.048608,-0.807432,-0.150094,-0.314221,-0.404238,-1.130124,0.93744,0.488027,-0.309991,0.691492
3,-0.780072,-0.260421,0.672069,0.253161,1.051173,0.685588,-0.089009,-0.754997,-0.209139,-0.311743,-0.292209,-1.033492,0.95319,0.570085,-0.399655,0.817219
4,-0.912576,-0.186411,0.681746,0.168253,1.044887,0.617192,0.084998,-0.852501,-0.145857,-0.282448,-0.437965,-1.112469,0.985778,0.519955,-0.390988,0.657512


In [20]:
emb_df = emb_df.merge(
    research_df[['article_id', 'main_topic']].set_index('article_id'),
    left_index = True,
    right_index = True
)

In [24]:
ft_cols = emb_df.drop(columns = ['main_topic']).columns.tolist()
target_col = 'main_topic'

In [35]:
# train test split
x = emb_df[ft_cols].values
y = emb_df[target_col].values

x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y,
    test_size = 0.3
)

## Train Model

In [36]:
%%time
# GBC classifier
clf = GradientBoostingClassifier()

# train the model
clf.fit(x_train, y_train)

CPU times: user 14.3 s, sys: 65.8 ms, total: 14.3 s
Wall time: 14.5 s


GradientBoostingClassifier()

## Evaluate Model

In [37]:
y_pred = clf.predict(x_test)
y_true = y_test

In [38]:
y_pred = clf.predict(x_test)
x_pred = clf.predict(x_train)
test_acc = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, x_pred)
print("Testing Accuracy : ", test_acc)
print("Training Accuracy : ", train_acc)

Testing Accuracy :  0.8350515463917526
Training Accuracy :  1.0


In [39]:
print("MCC Score : ", matthews_corrcoef(y_true, y_pred))


MCC Score :  0.8222440064195689


In [40]:
print("Test Classification Report : ")
print(classification_report(y_test, clf.predict(x_test)))

Test Classification Report : 
                   precision    recall  f1-score   support

      astro-ph.CO       1.00      1.00      1.00         2
      astro-ph.EP       1.00      1.00      1.00         3
      astro-ph.GA       1.00      1.00      1.00         1
      astro-ph.HE       0.00      0.00      0.00         1
      astro-ph.IM       0.00      0.00      0.00         0
      astro-ph.SR       0.00      0.00      0.00         2
  cond-mat.dis-nn       1.00      1.00      1.00         1
cond-mat.mes-hall       0.00      0.00      0.00         2
cond-mat.mtrl-sci       1.00      1.00      1.00         1
    cond-mat.soft       0.00      0.00      0.00         0
  cond-mat.str-el       0.00      0.00      0.00         0
cond-mat.supr-con       0.00      0.00      0.00         2
            cs.AI       1.00      1.00      1.00         3
            cs.CE       1.00      1.00      1.00         2
            cs.CG       0.00      0.00      0.00         1
            cs.CL       1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
print(confusion_matrix(y_pred,y_test))

[[2 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 2]]


## Predictions

In [45]:
pred_ft = [mdl.wv.get_vector(str('21'))]
clf.predict(pred_ft)[0]

'cs.LG'

## Concluding Remarks


## Resources

---