# Supervised & Unsupervised Topic Modelling

In [1]:
import arxiv
import numpy as np
import pandas as pd

from typing import List

In [2]:
# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

## Load Data

In [3]:
def search_arxiv(queries: List[str], max_results: int = 100) -> pd.DataFrame:
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'all_topics' : res.categories,
                'authors' : res.authors
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    d['year'] = pd.DatetimeIndex(d['date']).year
    
    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art:idx for idx,art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d


In [4]:
# fetch data from arXiv
research_df = search_arxiv(
    queries = queries,
    max_results = 100
)
print(research_df.shape)

(647, 8)


In [5]:
research_df.head()

Unnamed: 0,title,date,article_id,url,main_topic,all_topics,authors,year
0,Multi-objective Tree-structured Parzen Estimat...,2022-12-13 17:33:02+00:00,0,http://arxiv.org/pdf/2212.06751v1,cs.LG,"[cs.LG, cs.AI]","[Shuhei Watanabe, Noow Awad, Masaki Onishi, Fr...",2022
1,POPNASv3: a Pareto-Optimal Neural Architecture...,2022-12-13 17:14:14+00:00,1,http://arxiv.org/pdf/2212.06735v1,cs.LG,"[cs.LG, cs.AI, cs.CV, cs.NE]","[Andrea Falanti, Eugenio Lomurno, Danilo Ardag...",2022
2,AutoPINN: When AutoML Meets Physics-Informed N...,2022-12-08 03:44:08+00:00,2,http://arxiv.org/pdf/2212.04058v1,cs.LG,"[cs.LG, cs.AI]","[Xinle Wu, Dalin Zhang, Miao Zhang, Chenjuan G...",2022
3,Benchmarking AutoML algorithms on a collection...,2022-12-06 01:53:50+00:00,3,http://arxiv.org/pdf/2212.02704v2,cs.LG,[cs.LG],"[Pedro Henrique Ribeiro, Patryk Orzechowski, J...",2022
4,NAS-LID: Efficient Neural Architecture Search ...,2022-11-23 08:08:17+00:00,4,http://arxiv.org/pdf/2211.12759v2,cs.CV,"[cs.CV, cs.AI, cs.LG]","[Xin He, Jiangchao Yao, Yuxin Wang, Zhenheng T...",2022


## Data Cleaning

In [None]:
def remove_stopwords():
    raise NotImplemented
    
def remove_punct():
    raise NotImplemented
    
def unidecode():
    raise NotImplemented
    
def clean():
    raise NotImplemented

# Unsupervised Learning - Topic Modelling

## Evaluation

# Supervised Learning - Topic Modelling

## Evaluation

---