<a href="https://colab.research.google.com/github/tranhoangnguyen03/DataProjects/blob/master/BERTopic/OF_Topic_Modeling_with_BERTopic_v_bi_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instant Topic Modeling with BERTopic

[More on BERTopic here](https://https://maartengr.github.io/BERTopic/index.html#:~:text=BERTopic%20is%20a%20topic%20modeling,supervised%2C%20and%20dynamic%20topic%20modeling.)

### Preparation

In [1]:
#@title Setup & ingest dataset
import os

try:
    from bertopic import BERTopic
except:
    os.system('pip install bertopic')
    from bertopic import BERTopic

try:
    from stop_words import get_stop_words
except:
    os.system('pip install stop-words')
    from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')


import pandas as pd 
import numpy as np

url = 'https://raw.githubusercontent.com/joon-solutions/unsubscription_analysis/master/Example_Data_Pull_Subscription_Cancellation_Reasons_(Recharge).csv?token=GHSAT0AAAAAABSPDOHLGYSGDQJE56MS3T64YSNQHOA'
df = pd.read_csv(url)
df.head(3)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,subscription_id,customer_id,customer_email,address_id,status,product_title,variant_title,recurring_price,price,quantity,...,order_interval_unit,charge_day_of_month,charge_day_of_week,properties,cancelled_at,cancellation_reason,cancellation_reason_comments,created_at,deleted_at,updated_at
0,42453790,28624583,kyle.gallant@diffagency.com,32221075,CANCELLED,Homestead Turkey & Chicken,4.5LB,28.49,28.49,1,...,week,0.0,,"[{""name"": ""shipping_interval_frequency"", ""valu...",2019-05-02 13:15,Other reason,,2019-05-02 13:12,,2021-10-04 21:23
1,42613662,28721717,megan@openfarmpet.com,32327967,CANCELLED,Senior Recipe,4.5LB,28.49,29.99,1,...,week,0.0,,"[{""name"": ""shipping_interval_frequency"", ""valu...",2019-05-07 14:01,Other reason,test,2019-05-06 19:30,,2020-09-16 16:44
2,42684037,28765173,blamondin@live.com,32376368,CANCELLED,Wild-Caught Salmon,8LB,51.29,,3,...,week,,,,,,,2019-05-08 12:02,,2020-09-16 16:52


In [11]:
#@title Preprocess data & Training

lemmatizer = WordNetLemmatizer()
stop_words = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english')) #About 150 stopwords
stop_words.extend(nltk_words)

class Prep:
    def remove_stop_words(sentence):
        words = sentence.split(' ')
        return ' '.join([word for word in words if word not in stop_words])

    def remove_punctuations(sentence):
        words = sentence.split(' ')
        return ' '.join([word for word in words if word.isalpha()])

    def lemmatize_all(sentence):
        words = sentence.split(' ')
        for word_type in ['n','v','a','r','s']:
            words = [lemmatizer.lemmatize(word, word_type) for word in words]
        return ' '.join(words)

class Model():
    def __init__(self, df):
        
        self.df = df
        self.model = BERTopic(top_n_words=10
            ,n_gram_range=(1,3)
            ,verbose=True
            ,nr_topics="auto")

    
    def preprocess(self, df, comment_column):
        mask_invalid = df[comment_column].fillna('test').str.lower().str.contains('test')
        
        docs = (df[~mask_invalid][comment_column]
                .str.lower()
                .apply(Prep.remove_punctuations)
                .apply(Prep.lemmatize_all)
                #.apply(Prep.remove_stop_words)
                )
        return docs
        
    def train(self, comment_column='cancellation_reason_comments'):
        self.docs = self.preprocess(self.df, comment_column)
        sentences = self.docs.tolist()
        self.topics, self.probabilities = self.model.fit_transform(sentences)
        print(f'Training Result: {len(set(self.topics))} topics')
        
        topic_detail = {
            i : (v, [j[0] for j in self.model.get_topic(v)] ) 
                for i,v in enumerate(self.topics)
        }
        df_topics = pd.DataFrame.from_dict(topic_detail, orient='index').rename(columns={0:'Topic'})

        self.docs_transformed = df[[comment_column]].merge(
            pd.concat([self.docs.rename('Processed Comments').reset_index()
                ,df_topics['Topic']
                ,df_topics[1].apply(lambda x: ','.join(x)).str.split(',',expand=True)
                ],axis=1
            ).set_index('index')
            ,left_index=True,right_index=True
        )

    def predict(self, df, comment_column = 'cancellation_reason_comments'):
        df_ = self.preprocess(df, comment_column)
        sentences = df_.tolist()

        topics_int = self.model.transform(sentences)[0]
        topic_detail = {
            i : (v, [j[0] for j in self.model.get_topic(v)] ) 
                for i,v in enumerate(topics_int)
        }
        df_topics = pd.DataFrame.from_dict(topic_detail, orient='index').rename(columns={0:'Topic'})

        res = df[[comment_column]].merge(
            pd.concat([df_.rename('Processed Comments').reset_index()
                ,df_topics['Topic']
                ,df_topics[1].apply(lambda x: ','.join(x)).str.split(',',expand=True)
                ],axis=1
            ).set_index('index')
            ,left_index=True,right_index=True
        )
        
        return res

model = Model(df)
model.train()

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

2022-04-06 11:06:26,969 - BERTopic - Transformed documents to Embeddings
2022-04-06 11:06:34,942 - BERTopic - Reduced dimensionality with UMAP
2022-04-06 11:06:35,017 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-06 11:06:37,072 - BERTopic - Reduced number of topics from 36 to 31


Training Result: 31 topics


### Model Analysis

In [16]:
#@title Model Analysis
from google.colab import widgets

tabs_list = [
    'Topic Freq'
    ,'Topic Viz'
    ,'Topic BarChart'
    ,'Topic HeatMap'
    ,'Full df']
tabs = widgets.TabBar(tabs_list)

with tabs.output_to(tabs_list[0]):
    display(
        (model.model.get_topic_freq()
            .merge(
                pd.DataFrame.from_dict(
                    model.model.get_topics()
                    ,orient='index'
                    ).applymap(lambda x: x[0])
                ,left_on='Topic', right_index=True
            )
        )
    )

with tabs.output_to(tabs_list[1]):
    display(model.model.visualize_topics())

with tabs.output_to(tabs_list[2]):
    display(model.model.visualize_barchart(top_n_topics=len(set(model.topics))+1, n_words=10))

with tabs.output_to(tabs_list[3]):
    display(model.model.visualize_heatmap())

with tabs.output_to(tabs_list[4]):
    display(model.docs_transformed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Topic,Count,0,1,2,3,4,5,6,7,8,9
0,0,215,cat,my,dog,eat,my cat,food,like,the,not,be
1,-1,207,the,to,be,for,and,dog,my,food,it,have
2,1,71,too,because too,too confuse,too because too,too because,no too confuse,no too,too confuse too,gvhg no too,gvhg no
3,2,69,subscription,the,to,bag,and,of,the subscription,from,my,be
4,3,49,order,to order,want to order,when,want,when need,want to,order when,need,to
5,4,38,locally,store,your product,product,find,sell,that sell,local,your,carry
6,5,38,raw,food,switch,to raw,switch to,diet,switch to raw,change,raw food,to
7,6,36,pet,local pet,local,pet store,store,local pet store,buy,my local pet,at,my local
8,7,35,in store,shop,shop in,shop in store,store,in,to shop in,to shop,purchase in store,purchase in
9,8,33,delay,ship,post,canada,too long,canada post,too,transit,transit delay,post office


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,cancellation_reason_comments,Processed Comments,Topic,0,1,2,3,4,5,6,7,8,9
6,I just placed an order.,i just place an,3,order,to order,want to order,when,want,when need,want to,order when,need,to
7,Vet Diet Change,vet diet change,-1,the,to,be,for,and,dog,my,food,it,have
10,I cancelled this subscription over a month ago...,i cancel this subscription over a month ago th...,18,cancel,subscription,cancel subscription,subscription cancel subscription,cancel subscription cancel,subscription cancel,to,yet,have,the
16,"delivery issues (e.g., item held at post offic...",delivery issue item hold at post office instea...,8,delay,ship,post,canada,too long,canada post,too,transit,transit delay,post office
18,purchasing in store,purchase in store,7,in store,shop,shop in,shop in store,store,in,to shop in,to shop,purchase in store,purchase in
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4106,Cats do not enjoy the recipes,cat do not enjoy the recipe,0,cat,my,dog,eat,my cat,food,like,the,not,be
4115,Merging under one,merge under one,-1,the,to,be,for,and,dog,my,food,it,have
4119,Shopping in store instead.,shop in store,7,in store,shop,shop in,shop in store,store,in,to shop in,to shop,purchase in store,purchase in
4125,Diet Change,diet change,5,raw,food,switch,to raw,switch to,diet,switch to raw,change,raw food,to


<IPython.core.display.Javascript object>

In [17]:
#@title Manual Test
sentence = "I've switch to shop nearby" #@param {type:"string"}
column = 'comment'

df_test = pd.DataFrame({column:sentence},index=[0])

df_res = model.predict(df_test, column)
similar_comments = model.docs_transformed.loc[
    model.docs_transformed['Topic']==df_res.Topic.iloc[0]
    ,model.docs_transformed.columns[0]
].unique()

from google.colab import widgets

tabs_list = [
    'Topic Result'
    ,'Similar Comments'
]
tabs = widgets.TabBar(tabs_list)

with tabs.output_to(tabs_list[0]):
    display(df_res)
with tabs.output_to(tabs_list[1]):
    display(similar_comments)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-04-06 11:08:19,561 - BERTopic - Reduced dimensionality with UMAP
2022-04-06 11:08:19,564 - BERTopic - Predicted clusters with HDBSCAN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,comment,Processed Comments,Topic,0,1,2,3,4,5,6,7,8,9
0,I've switch to shop nearby,switch to shop nearby,7,in store,shop,shop in,shop in store,store,in,to shop in,to shop,purchase in store,purchase in


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array(['purchasing in store', 'Buying locally.',
       'I can buy this for less in store',
       'Wants to shop in store instead. ', 'This is now sold in store.',
       'will be switching to purchasing in store locally',
       'easier to purchase it in store',
       'Wishes to shop in store instead', 'shop in store',
       'Prefers shopping instore ', 'Shopping in store',
       'I moved to a new location where I can purchase in store now!',
       "I can shop in store at Ren's and no longer need the mailed to me",
       'Wants to purchase in store because of website ', 'Shops in Store',
       'Buying in store', 'wishes to purchase in store as well.',
       'Wish to shop in store instead. ', 'Buying fresh food.',
       'Wants to shop in store instead', 'Will shop instore instead ',
       'Shop in store', 'Shopping in Store',
       'Wants to shop in store. Delivery is unreliable. ',
       'Shopping in store instead. '], dtype=object)

<IPython.core.display.Javascript object>