<a href="https://colab.research.google.com/github/tranhoangnguyen03/DataProjects/blob/master/BERTopic/OF_Topic_Modeling_with_BERTopic_v_tri_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instant Topic Modeling with BERTopic

[More on BERTopic here](https://https://maartengr.github.io/BERTopic/index.html#:~:text=BERTopic%20is%20a%20topic%20modeling,supervised%2C%20and%20dynamic%20topic%20modeling.)

### Preparation

In [101]:
#@title Setup & ingest dataset
import os, re, pickle

try:
    from bertopic import BERTopic
except:
    os.system('pip install bertopic')
    from bertopic import BERTopic

try:
    from stop_words import get_stop_words
except:
    os.system('pip install stop-words')
    from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')


import pandas as pd 
import numpy as np
import warnings
warnings.simplefilter('ignore')

topics = list(set('''Excess
Excess
Swap
Price
In Store
Abandon
Site issues
Duplicate
Refusal to eat
Medical Reason
Medical Reason
Pet passed away
OOS
Reaction
Delivery Time'''.split('\n')))


url = 'https://raw.githubusercontent.com/joon-solutions/unsubscription_analysis/a87664fc6771fe641c260987dce2f17d868938b6/Example_Data_Pull_Subscription_Cancellation_Reasons_(Recharge).csv?token=GHSAT0AAAAAABSPDOHKZGR4P3AAWA2LNH4YYTBERLA'
df = pd.read_csv(url)
df = df[
        (~df.customer_email.str.contains('openfarmpet.com'))
        & (~df.cancelled_at.isnull())]
df['cancellation_reason_comments'] = (df
    .loc[:,['cancellation_reason','cancellation_reason_comments']]
    .apply(
            lambda x: x[0] if pd.isnull(x[1]) else x[1] 
        ,axis = 1
    )
    .str.replace('’',"'")
)
df.head(3)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,subscription_id,customer_id,customer_email,address_id,status,product_title,variant_title,recurring_price,price,quantity,...,order_interval_unit,charge_day_of_month,charge_day_of_week,properties,cancelled_at,cancellation_reason,cancellation_reason_comments,created_at,deleted_at,updated_at
0,42453790,28624583,kyle.gallant@diffagency.com,32221075,CANCELLED,Homestead Turkey & Chicken,4.5LB,28.49,28.49,1,...,week,0.0,,"[{""name"": ""shipping_interval_frequency"", ""valu...",2019-05-02 13:15,Other reason,Other reason,2019-05-02 13:12,,2021-10-04 21:23
3,42684039,28765177,cberisoff@hotmail.com,32376373,CANCELLED,Homestead Turkey Recipe,13.5OZ,37.04,,1,...,week,,,,2019-09-20 13:18,I already have more than I need,I already have more than I need,2019-05-08 12:02,,2020-09-16 16:48
4,42684040,28765174,cleorock@yahoo.ca,32376369,CANCELLED,Grass-Fed Beef Recipe,13.5OZ,37.04,,2,...,week,,,,2020-01-25 22:52,I want a different product or variety,I want a different product or variety,2019-05-08 12:02,,2020-09-16 16:48


In [102]:
#@title Preprocess data & Training

lemmatizer = WordNetLemmatizer()
stop_words = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english')) #About 150 stopwords
stop_words.extend(nltk_words)

contractions_dict = {"I've": 'I have', "ain't": "are not", "'s":" is", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "‘cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "that'd": "that would", "that'd've": "that would have", "there'd": "there would", "there'd've": "there would have", "they'd": "they would", "they'd've": "they would have","they'll": "they will",
 "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not","what'll": "what will", "what'll've": "what will have", "what're": "what are", "what've": "what have", "when've": "when have", "where'd": "where did", "where've": "where have",
 "who'll": "who will", "who'll've": "who will have", "who've": "who have", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
 ,"ought to":"should"}

negatives_dict = {'are not':'not','am not':'not','is not':'not','was not':'not','were not':'not','have not':'not','has not':'not','had not'
    :'not','do not':'not','does not':'not','did not':'not','cannot':'not','can not':'not','could not':'not','should not':'not','ought not to':'not','shall not':'not','need not':'not','must not'
    :'not','might not':'not','may not':'not','maybe not':'not','will not':'not','would not':'not'}

class Prep:
    def expand_contractions(sentence, contractions_dict=contractions_dict):
        contractions_re = re.compile('(%s)'%'|'.join(contractions_dict.keys()))

        def replace(match):
            return contractions_dict[match.group(0)]
        return contractions_re.sub(replace, sentence)

    def transform_negatives(sentence, negatives_dict=negatives_dict):
        neg_re = re.compile('(%s)'%'|'.join(negatives_dict.keys()))

        def replace(match):
            return negatives_dict[match.group(0)]
        return neg_re.sub(replace, sentence)

    def remove_stop_words(sentence):
        words = sentence.split(' ')
        return ' '.join([word for word in words if word not in stop_words])

    def remove_punctuations(sentence):
        words = sentence.split(' ')
        return ' '.join([word for word in words if word.isalpha()])

    def lemmatize_all(sentence):
        words = sentence.split(' ')
        for word_type in ['n','v','a','r','s']:
            words = [lemmatizer.lemmatize(word, word_type) for word in words]
        return ' '.join(words)

class Model():
    def __init__(self, df):
        
        self.df = df
        self.model = BERTopic(top_n_words=10
            ,n_gram_range=(1,3)
            ,verbose=True
            ,nr_topics="auto")

    def save(self, filepath):
        with open(filepath, "wb") as f:
            pickle.dump(self, f)

    def load(self, filepath):
        with open(filepath, "rb") as f:
            pik = pickle.load(f)
        self.__dict__.update(pik.__dict__)
    
    def preprocess(self, df, comment_column):
        mask_invalid = df[comment_column].fillna('test').str.lower().str.contains('test')
        
        docs = (df[~mask_invalid][comment_column]
                .str.lower()
                .apply(Prep.expand_contractions)
                .apply(Prep.transform_negatives)
                .apply(Prep.remove_punctuations)
                .apply(Prep.lemmatize_all)
                #.apply(Prep.remove_stop_words)
                )
        return docs
        
    def train(self, comment_column='cancellation_reason_comments'):
        self.docs = self.preprocess(self.df, comment_column)
        sentences = self.docs.tolist()
        self.topics, self.probabilities = self.model.fit_transform(sentences)
        print(f'Training Result: {len(set(self.topics))} topics')
        
        topic_detail = {
            i : (v, [j[0] for j in self.model.get_topic(v)] ) 
                for i,v in enumerate(self.topics)
        }
        df_topics = pd.DataFrame.from_dict(topic_detail, orient='index').rename(columns={0:'Topic'})

        self.docs_transformed = df[[comment_column]].merge(
            pd.concat([self.docs.rename('Processed Comments').reset_index()
                ,df_topics['Topic']
                ,df_topics[1].apply(lambda x: ','.join(x)).str.split(',',expand=True)
                ],axis=1
            ).set_index('index')
            ,left_index=True,right_index=True
        )

    def predict(self, df, comment_column = 'cancellation_reason_comments'):
        df_ = self.preprocess(df, comment_column)
        sentences = df_.tolist()

        topics_int = self.model.transform(sentences)[0]
        topic_detail = {
            i : (v, [j[0] for j in self.model.get_topic(v)] ) 
                for i,v in enumerate(topics_int)
        }
        df_topics = pd.DataFrame.from_dict(topic_detail, orient='index').rename(columns={0:'Topic'})

        res = df[[comment_column]].merge(
            pd.concat([df_.rename('Processed Comments').reset_index()
                ,df_topics['Topic']
                ,df_topics[1].apply(lambda x: ','.join(x)).str.split(',',expand=True)
                ],axis=1
            ).set_index('index')
            ,left_index=True,right_index=True
        )
        
        return res

model = Model(df)
model.train()

Batches:   0%|          | 0/129 [00:00<?, ?it/s]

2022-04-21 08:34:49,454 - BERTopic - Transformed documents to Embeddings
2022-04-21 08:35:11,136 - BERTopic - Reduced dimensionality with UMAP
2022-04-21 08:35:11,337 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-21 08:35:16,342 - BERTopic - Reduced number of topics from 88 to 48


Training Result: 48 topics


### Model Analysis

In [103]:
#@title Model Analysis
from google.colab import widgets

tabs_list = [
    'Topic Freq'
    ,'Topic Viz'
    ,'Topic BarChart'
    ,'Topic HeatMap'
    ,'Full df']
tabs = widgets.TabBar(tabs_list)

with tabs.output_to(tabs_list[0]):
    display(
        (model.model.get_topic_freq()
            .merge(
                pd.DataFrame.from_dict(
                    model.model.get_topics()
                    ,orient='index'
                    ).applymap(lambda x: x[0])
                ,left_on='Topic', right_index=True
            )
        )
    )

with tabs.output_to(tabs_list[1]):
    display(model.model.visualize_topics())

with tabs.output_to(tabs_list[2]):
    display(model.model.visualize_barchart(top_n_topics=len(set(model.topics))+1, n_words=10))

with tabs.output_to(tabs_list[3]):
    display(model.model.visualize_heatmap())

with tabs.output_to(tabs_list[4]):
    display(model.docs_transformed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Topic,Count,0,1,2,3,4,5,6,7,8,9
0,0,544,other reason,reason,reason other,reason other reason,other reason other,other,reason because other,reason because,because other reason,other reason because
1,-1,499,the,to,not,be,and,my,it,for,dog,subscription
2,1,370,have more than,have more,than need,more than need,already have,more than,already have more,already,than,need already have
3,2,339,dog,cat,my,not,like,the,food,not like,eat,dog not
4,3,289,subscription,order,to,modify,the,to order,not,when,new,when need
5,4,169,charge attempt,attempt reach,max,of charge attempt,of charge,number of charge,charge attempt reach,max number of,max number,reach max
6,5,162,long use,no long use,long use this,use this,product no,product no long,use this product,this product no,this product,use
7,6,144,store,local,buy,pet,shop,in store,locally,in,local pet,pet store
8,7,141,accident,create by accident,create by,by accident,wa create,wa create by,this wa create,accident this wa,by accident this,accident this
9,8,132,this be too,expensive this be,too expensive this,expensive this,be too expensive,be too,this be,too expensive,expensive,too


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,cancellation_reason_comments,Processed Comments,Topic,0,1,2,3,4,5,6,7,8,9
0,Other reason,other reason,0,other reason,reason,reason other,reason other reason,other reason other,other,reason because other,reason because,because other reason,other reason because
3,I already have more than I need,i already have more than i need,1,have more than,have more,than need,more than need,already have,more than,already have more,already,than,need already have
4,I want a different product or variety,i want a different product or variety,10,different product or,or variety,product or,product or variety,want different product,want different,or variety want,variety want,variety,variety want different
5,I already have more than I need,i already have more than i need,1,have more than,have more,than need,more than need,already have,more than,already have more,already,than,need already have
6,I just placed an order.,i just place an,3,subscription,order,to,modify,the,to order,not,when,new,when need
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4152,Reorder,reorder,3,subscription,order,to,modify,the,to order,not,when,new,when need
4154,I'm purchasing from a store instead,purchase from a store instead,6,store,local,buy,pet,shop,in store,locally,in,local pet,pet store
4155,I'm opting to order as needed instead,opt to order a need instead,30,opt to order,order need instead,opt to,need instead,opt,to order need,need instead opt,instead opt to,instead opt,order need
4156,This is too expensive,this be too expensive,8,this be too,expensive this be,too expensive this,expensive this,be too expensive,be too,this be,too expensive,expensive,too


<IPython.core.display.Javascript object>

In [107]:
#@title Manual Test - Verbatim { vertical-output: true }
sentence = "i cannot afford it" #@param {type:"string"}
column = 'comment'

df_test = pd.DataFrame({column:sentence},index=[0])

df_res = model.predict(df_test, column)
similar_comments = model.docs_transformed.loc[
    model.docs_transformed['Topic']==df_res.Topic.iloc[0]
    ,model.docs_transformed.columns[0]
].unique()

from google.colab import widgets

tabs_list = [
    'Topic Result'
    ,'Similar Comments'
]
tabs = widgets.TabBar(tabs_list)

with tabs.output_to(tabs_list[0]):
    display(df_res)
with tabs.output_to(tabs_list[1]):
    display(similar_comments)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-04-21 09:03:29,868 - BERTopic - Reduced dimensionality with UMAP
2022-04-21 09:03:29,870 - BERTopic - Predicted clusters with HDBSCAN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,comment,Processed Comments,Topic,0,1,2,3,4,5,6,7,8,9
0,i cannot afford it,i not afford it,41,financial,afford,money,no money,no long afford,can no long,can no,long afford,no money no,afford financial


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array(['You have a great product, but can no longer afford it',
       "Can't afford", 'Financial',
       "Can't afford to spend 25$ more, will have to walk uphill across the city with it.",
       'More money than I thought. As a topper it only lasts 2 weeks.',
       'My financial status has very recently changed and I can no longer afford this.  My dogs LOVE the food.  Once my finances change I will be back!',
       'can no longer afford', 'Financial instability', 'no money'],
      dtype=object)

<IPython.core.display.Javascript object>

In [105]:
#@title Manual Test - Verbatim { run: "auto", vertical-output: true }
topic =  45#@param {type:"integer"}
column = 'comment'

similar_comments = (model.docs_transformed
    .loc[model.docs_transformed.Topic == topic
        ,model.docs_transformed.columns[0]
    ].unique()
)

from google.colab import widgets

tabs_list = [
    'Topic Result'
    ,'Similar Comments'
]
tabs = widgets.TabBar(tabs_list)

with tabs.output_to(tabs_list[0]):
    display(model.docs_transformed
        .loc[model.docs_transformed.Topic == topic
            ,model.docs_transformed.columns[2:]
        ].drop_duplicates()
    )
with tabs.output_to(tabs_list[1]):
    display(similar_comments)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Topic,0,1,2,3,4,5,6,7,8,9
358,45,long have,no long have,long have dog,have dog,have dog no,puppy,dog no,no long puppy,long puppy,no


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array(['No longer have a dog',
       'We no longer have a cat, he went missing a few weeks ago.',
       'No longer have dog', 'no dog anymore',
       'Changing foods. Pup not enjoying. ', 'No longer puppy',
       'No longer a puppy',
       'We were getting a puppy today but the adoption fell through ?',
       'Older dog not a puppy'], dtype=object)

<IPython.core.display.Javascript object>