# Supervised & Unsupervised Topic Modelling

In [25]:
import arxiv
import string
import unidecode
import numpy as np
import pandas as pd

from typing import List

try:
    from nltk.corpus import stopwords
except:
    import nltk
    nltk.download('stopwords')
finally:
    from nltk.corpus import stopwords

In [2]:
# constants
queries = [
    'automl', 'machinelearning', 'data', 'phyiscs','mathematics', 'recommendation system', 'nlp', 'neural networks'
]

## Load Data

In [3]:
def search_arxiv(queries: List[str], max_results: int = 100) -> pd.DataFrame:
    '''
    This function will search arxiv associated to a set of queries and store
    the latest 10000 (max_results) associated to that search.
    
    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on Arxiv
        max_results (Int) : The maximum number of results you want to see associated
                            to your search. Default value is 1000, capped at 300000
                            
    returns:
        This function will return a DataFrame holding the following columns associated
        to the queries the user has passed. 
            `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`
    
    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    d = []
    searches = []
    # hitting the API
    for query in queries:
        search = arxiv.Search(
          query = query,
          max_results = max_results,
          sort_by = arxiv.SortCriterion.SubmittedDate,
          sort_order = arxiv.SortOrder.Descending
        )
        searches.append(search)
    
    # Converting search result into df
    for search in searches:
        for res in search.results():
            data = {
                'title' : res.title,
                'date' : res.published,
                'article_id' : res.entry_id,
                'url' : res.pdf_url,
                'main_topic' : res.primary_category,
                'summary' : res.summary,
                'all_topics' : res.categories,
                'authors' : res.authors
            }
            d.append(data)
        
    d = pd.DataFrame(d)
    d['year'] = pd.DatetimeIndex(d['date']).year
    
    # change article id from url to integer
    unique_article_ids = d.article_id.unique()
    article_mapping = {art:idx for idx,art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d


In [4]:
# fetch data from arXiv
research_df = search_arxiv(
    queries = queries,
    max_results = 100
)
print(research_df.shape)

(647, 9)


In [5]:
research_df.head()

Unnamed: 0,title,date,article_id,url,main_topic,summary,all_topics,authors,year
0,Multi-objective Tree-structured Parzen Estimat...,2022-12-13 17:33:02+00:00,0,http://arxiv.org/pdf/2212.06751v1,cs.LG,Hyperparameter optimization (HPO) is essential...,"[cs.LG, cs.AI]","[Shuhei Watanabe, Noow Awad, Masaki Onishi, Fr...",2022
1,POPNASv3: a Pareto-Optimal Neural Architecture...,2022-12-13 17:14:14+00:00,1,http://arxiv.org/pdf/2212.06735v1,cs.LG,The automated machine learning (AutoML) field ...,"[cs.LG, cs.AI, cs.CV, cs.NE]","[Andrea Falanti, Eugenio Lomurno, Danilo Ardag...",2022
2,AutoPINN: When AutoML Meets Physics-Informed N...,2022-12-08 03:44:08+00:00,2,http://arxiv.org/pdf/2212.04058v1,cs.LG,Physics-Informed Neural Networks (PINNs) have ...,"[cs.LG, cs.AI]","[Xinle Wu, Dalin Zhang, Miao Zhang, Chenjuan G...",2022
3,Benchmarking AutoML algorithms on a collection...,2022-12-06 01:53:50+00:00,3,http://arxiv.org/pdf/2212.02704v2,cs.LG,Automated machine learning (AutoML) algorithms...,[cs.LG],"[Pedro Henrique Ribeiro, Patryk Orzechowski, J...",2022
4,NAS-LID: Efficient Neural Architecture Search ...,2022-11-23 08:08:17+00:00,4,http://arxiv.org/pdf/2211.12759v2,cs.CV,One-shot neural architecture search (NAS) subs...,"[cs.CV, cs.AI, cs.LG]","[Xin He, Jiangchao Yao, Yuxin Wang, Zhenheng T...",2022


## Data Cleaning

In [15]:
def remove_stopwords(text: str, sw: List[str] = stopwords) -> str:
    '''
    This function will remove stopwords from the text
    
    Args:
        text: String of data you want to remove stopwords from
        sw: List of strings indicating the list of stopwords
        
    Returns:
        The input string with the stopwords removed.
    '''
    return ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    
def remove_punctuation(text: str, punct: str = string.punctuation) -> str:
    '''
    This function will remove punctuations from the text.
    
    Args:
        text: String of data you want to remove punctuations from
        punct: String of punctuations
    
    Returns:
        The input string with the punctuations removed.
    '''
    cleaned_text = ''.join([char for char in text if char not in punct])
    return cleaned_text
    
def unicode(text: str) -> str:
    '''
    This function will make all the data unicoded. Meaning Â -> A
    
    Args:
        text: String of data you want to unicode
    
    Returns:
        The input string unicoded.
    '''
    return unidecode.unidecode(text)
    
def clean(text: str) -> str:
    '''
    This method will clean the input text through unidecoding and stopword and punctuation 
    removal.
    
    Args:
        text: String indicating the body of text you want to clean
    
    Returns:
        A string corresponding to the cleaned version of the input string.
    '''
    text = unicode(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    return text.lower()

In [16]:
%time research_df['cleaned_summary'] = research_df['summary'].apply(clean)

CPU times: user 3.88 s, sys: 772 ms, total: 4.65 s
Wall time: 4.66 s


In [17]:
research_df.head()

Unnamed: 0,title,date,article_id,url,main_topic,summary,all_topics,authors,year,cleaned_summary
0,Multi-objective Tree-structured Parzen Estimat...,2022-12-13 17:33:02+00:00,0,http://arxiv.org/pdf/2212.06751v1,cs.LG,Hyperparameter optimization (HPO) is essential...,"[cs.LG, cs.AI]","[Shuhei Watanabe, Noow Awad, Masaki Onishi, Fr...",2022,hyperparameter optimization hpo essential bett...
1,POPNASv3: a Pareto-Optimal Neural Architecture...,2022-12-13 17:14:14+00:00,1,http://arxiv.org/pdf/2212.06735v1,cs.LG,The automated machine learning (AutoML) field ...,"[cs.LG, cs.AI, cs.CV, cs.NE]","[Andrea Falanti, Eugenio Lomurno, Danilo Ardag...",2022,the automated machine learning automl field be...
2,AutoPINN: When AutoML Meets Physics-Informed N...,2022-12-08 03:44:08+00:00,2,http://arxiv.org/pdf/2212.04058v1,cs.LG,Physics-Informed Neural Networks (PINNs) have ...,"[cs.LG, cs.AI]","[Xinle Wu, Dalin Zhang, Miao Zhang, Chenjuan G...",2022,physicsinformed neural networks pinns recently...
3,Benchmarking AutoML algorithms on a collection...,2022-12-06 01:53:50+00:00,3,http://arxiv.org/pdf/2212.02704v2,cs.LG,Automated machine learning (AutoML) algorithms...,[cs.LG],"[Pedro Henrique Ribeiro, Patryk Orzechowski, J...",2022,automated machine learning automl algorithms g...
4,NAS-LID: Efficient Neural Architecture Search ...,2022-11-23 08:08:17+00:00,4,http://arxiv.org/pdf/2211.12759v2,cs.CV,One-shot neural architecture search (NAS) subs...,"[cs.CV, cs.AI, cs.LG]","[Xin He, Jiangchao Yao, Yuxin Wang, Zhenheng T...",2022,oneshot neural architecture search nas substan...


# Unsupervised Learning - LDA

In [22]:
import nltk
from gensim import models, corpora

In [28]:
article_summaries = research_df['cleaned_summary'].values
article_summaries = [[word for word in nltk.word_tokenize(article)] for article in article_summaries]

In [33]:
# Create a dictionary of all the words in the dataset
dictionary = corpora.Dictionary(article_summaries)

# Create a corpus
corpus = [dictionary.doc2bow(article) for article in article_summaries]

In [35]:
%%time
# Create an LDA model
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20)

CPU times: user 4.69 s, sys: 45.2 ms, total: 4.74 s
Wall time: 4.75 s


In [37]:
# Get the topic distribution for each document
for i, article in enumerate(article_summaries[0:10]):
    print(f"Article {i+1}: {lda.get_document_topics(corpus[i])}")

# Print the top words for each topic
print(lda.print_topics())

Article 1: [(6, 0.54790044), (8, 0.44401333)]
Article 2: [(4, 0.26697603), (6, 0.65080774), (7, 0.07738426)]
Article 3: [(0, 0.4645687), (6, 0.39220694), (7, 0.13915145)]
Article 4: [(6, 0.9916613)]
Article 5: [(4, 0.87794346), (6, 0.11565114)]
Article 6: [(6, 0.39041102), (7, 0.6014999)]
Article 7: [(0, 0.5545885), (1, 0.035100326), (6, 0.40523368)]
Article 8: [(0, 0.15213814), (6, 0.8432332)]
Article 9: [(0, 0.3477852), (6, 0.64428633)]
Article 10: [(6, 0.42433938), (7, 0.56949884)]
[(0, '0.013*"data" + 0.010*"the" + 0.008*"model" + 0.008*"we" + 0.007*"in" + 0.005*"systems" + 0.004*"results" + 0.004*"two" + 0.004*"using" + 0.004*"models"'), (1, '0.010*"data" + 0.006*"we" + 0.006*"work" + 0.005*"time" + 0.005*"in" + 0.005*"the" + 0.005*"results" + 0.004*"analysis" + 0.004*"users" + 0.004*"business"'), (2, '0.010*"we" + 0.009*"systems" + 0.006*"transmission" + 0.006*"the" + 0.004*"networks" + 0.004*"study" + 0.003*"neural" + 0.003*"data" + 0.003*"coherent" + 0.003*"system"'), (3, '0.00

## Visualization

In [54]:
# Create an empty dataframe
df = pd.DataFrame(columns=["article", "topic", "probability"])

# Loop over the articles and get the top 15 topics for each one
for i, article in enumerate(article_summaries):
    topics = lda.get_document_topics(corpus[i])
    topics = sorted(topics, key=lambda x: x[1], reverse=True)[:15]
    for topic in topics:
        df = df.append({"article": i, "topic": topic[0], "probability": topic[1]}, ignore_index=True)

In [55]:
df

Unnamed: 0,article,topic,probability
0,0.0,6.0,0.547899
1,0.0,8.0,0.444015
2,1.0,6.0,0.650804
3,1.0,4.0,0.266983
4,1.0,7.0,0.077380
...,...,...,...
912,644.0,6.0,0.033557
913,645.0,3.0,0.994036
914,646.0,3.0,0.496334
915,646.0,4.0,0.351349


In [82]:
# Get the list of top words for each topic
topics = lda.print_topics(num_words=5)

# Create a dictionary that maps the topic numbers to the topic names
topic_names = {}
for topic in topics:
    topic_num = topic[0]
    split_w = [w for w in topic[1].split('+')]
    topic_words = [w.split('*')[1] for w in split_w]
    topic_name = ",".join(topic_words)
    topic_names[topic_num] = topic_name

# Add a new column to the dataframe with the topic names
df["topic_name"] = df["topic"].map(topic_names)


In [83]:
df

Unnamed: 0,article,topic,probability,topic_name
0,0.0,6.0,0.547899,"""automl"" ,""learning"" ,""we"" ,""data"" ,""models"""
1,0.0,8.0,0.444015,"""learning"" ,""in"" ,""we"" ,""matching"" ,""reinforce..."
2,1.0,6.0,0.650804,"""automl"" ,""learning"" ,""we"" ,""data"" ,""models"""
3,1.0,4.0,0.266983,"""models"" ,""model"" ,""data"" ,""we"" ,""language"""
4,1.0,7.0,0.077380,"""data"" ,""learning"" ,""image"" ,""the"" ,""time"""
...,...,...,...,...
912,644.0,6.0,0.033557,"""automl"" ,""learning"" ,""we"" ,""data"" ,""models"""
913,645.0,3.0,0.994036,"""we"" ,""the"" ,""models"" ,""this"" ,""model"""
914,646.0,3.0,0.496334,"""we"" ,""the"" ,""models"" ,""this"" ,""model"""
915,646.0,4.0,0.351349,"""models"" ,""model"" ,""data"" ,""we"" ,""language"""


In [52]:
from pyLDAvis import gensim


ImportError: cannot import name 'gensim' from 'pyLDAvis' (/Users/vatsalpatel/opt/miniconda3/envs/test_env/lib/python3.10/site-packages/pyLDAvis/__init__.py)

In [51]:
import pyLDAvis.gensim


ModuleNotFoundError: No module named 'pyLDAvis.gensim'

In [None]:
lda.id2word

In [None]:
body_vis = pyLDAvis.gensim.prepare(body_lda_model_25, body_corpus_25, dictionary=body_lda_model_25.id2word,mds='mmds',sort_topics=False)


## Topic Analysis

## Word Cloud

# Supervised Learning - Binary Classification

In [None]:
# generate labels

## Evaluation

# Supervised Learning - Multi-Class Classification

---