# Link Prediction w/ n2v

In [17]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-1.4.2-py3-none-any.whl (11 kB)
Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 1.7 MB/s eta 0:00:01
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25ldone
[?25h  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=9628adb2c5f911b2608828809f9555a106d3a0b745a02b4cf5c0b995c871b39d
  Stored in directory: /Users/vatsalpatel/Library/Caches/pip/wheels/83/63/2f/117884c3b19d46b64d3d61690333aa80c88dc14050e269c546
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-1.4.2 feedparser-6.0.8 sgmllib3k-1.0.0


In [1]:
import networkx as nx
import scipy as sp
import pandas as pd
import numpy as np
import arxiv

from node2vec import node2vec as n2v

In [2]:
# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

# Fetch Data

In [3]:
def search_arxiv(queries, max_results = 1000):
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'all_topics' : res.categories
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    return d

In [4]:
%%time
research_df = search_arxiv(
    queries = queries,
    max_results = 1000
)
research_df.shape

CPU times: user 7.99 s, sys: 421 ms, total: 8.41 s
Wall time: 4min 1s


(5332, 6)

In [5]:
research_df

Unnamed: 0,title,date,article_id,url,main_topic,all_topics
0,Review of automated time series forecasting pi...,2022-02-03 17:26:27+00:00,http://arxiv.org/abs/2202.01712v1,http://arxiv.org/pdf/2202.01712v1,cs.LG,[cs.LG]
1,Hubble Asteroid Hunter: I. Identifying asteroi...,2022-02-01 06:56:20+00:00,http://arxiv.org/abs/2202.00246v1,http://arxiv.org/pdf/2202.00246v1,astro-ph.EP,"[astro-ph.EP, astro-ph.IM]"
2,NAS-Bench-Suite: NAS Evaluation is (Now) Surpr...,2022-01-31 18:02:09+00:00,http://arxiv.org/abs/2201.13396v1,http://arxiv.org/pdf/2201.13396v1,cs.LG,"[cs.LG, cs.AI, stat.ML]"
3,Online AutoML: An adaptive AutoML framework fo...,2022-01-24 15:37:20+00:00,http://arxiv.org/abs/2201.09750v1,http://arxiv.org/pdf/2201.09750v1,cs.LG,"[cs.LG, cs.AI]"
4,Automated Reinforcement Learning (AutoRL): A S...,2022-01-11 12:41:43+00:00,http://arxiv.org/abs/2201.03916v1,http://arxiv.org/pdf/2201.03916v1,cs.LG,[cs.LG]
...,...,...,...,...,...,...
5327,Reinforcement Learning-Based Deadline and Batt...,2022-01-25 14:42:29+00:00,http://arxiv.org/abs/2201.10361v2,http://arxiv.org/pdf/2201.10361v2,cs.NI,[cs.NI]
5328,Resource-efficient Deep Neural Networks for Au...,2022-01-25 14:41:08+00:00,http://arxiv.org/abs/2201.10360v1,http://arxiv.org/pdf/2201.10360v1,eess.SP,"[eess.SP, cs.CV]"
5329,Ultra Low-Parameter Denoising: Trainable Bilat...,2022-01-25 14:33:56+00:00,http://arxiv.org/abs/2201.10345v1,http://arxiv.org/pdf/2201.10345v1,eess.IV,"[eess.IV, cs.CV]"
5330,Distributed Image Transmission using Deep Join...,2022-01-25 14:25:26+00:00,http://arxiv.org/abs/2201.10340v1,http://arxiv.org/pdf/2201.10340v1,cs.IT,"[cs.IT, cs.LG, math.IT]"


## Generate Network