In [2]:
os.chdir("../../")

In [3]:
os.getcwd()

'/Users/usman_s/Dev/data-dialogue'

In [9]:
import os

import pandas as pd

from src.data.preprocess import Preprocessor
from src.models.topic_modelling.bert_topic import BertTopic
from src.models.topic_modelling.LDA import LDAGensim
from src.models.topic_modelling.LSA import LSAModel
from src.models.topic_modelling.NMF import NMFModel


def run_training_pipeline(model_choice, pre_processed_df, num_topics=10, tags=None):
    """
    Train the specified topic modeling algorithm on the given pre-processed DataFrame.

    Parameters:
        num_topics (int): Number of topics
        tags (list): Nouns, Adjectives and Verbs
        model_choice (str): The topic modeling algorithm to use, options are: "lda", "lsa", "nmf", "bertopic".
        pre_processed_df (pd.DataFrame): The pre-processed input data to train the topic modeling algorithm on.

    Returns:
        topics_dict (dict)A dictionary containing the topics generated by the specified topic modeling algorithm.

    Raises
    ValueError
        If an invalid model_choice is provided.
    """
    if model_choice:
        if model_choice == "lda":
            lda_model = LDAGensim(pre_processed_df, num_topics=num_topics, tags=tags)
            print("Running LDA Model...")
            topics_dict = lda_model.get_topics()
        elif model_choice == "lsa":
            lsa_model = LSAModel(pre_processed_df, num_topics=num_topics, tags=tags)
            print("Running LSA Model...")
            topics_dict = lsa_model.get_topics()
        elif model_choice == "nmf":
            nmf_model = NMFModel(pre_processed_df)
            print("Running NMF Model...")
            nmf_model.fit_transform()
            topics_dict = nmf_model.get_topic_terms()
        elif model_choice == "bertopic":
            bertopic_model = BertTopic(pre_processed_df)
            print("Running BertTopic Model...")
            bertopic_model.prepare_embeddings()
            bertopic_model.run_bertopic()
            topics_dict = bertopic_model.get_topics()
    else:
        raise ValueError("Please specify a model to run.")

    return topics_dict


def topics_dict_to_df(model_choice, topics_dict):
    """
    Convert the topics dictionary to a DataFrame.

    Parameters:
        model_choice (str): The topic modeling algorithm used to generate the topics_dict.
        topics_dict (dict): A dictionary containing the topics generated by the specified topic modeling algorithm.

    Returns:
        pivoted_df (pd.DataFrame): A DataFrame with the topic_id, words and corresponding scores for each topic.
    """
    topics_df = pd.DataFrame.from_dict(
        {(i, j): topics_dict[i][j] for i in topics_dict.keys() for j in topics_dict[i].keys()},
        orient="index",
        columns=["value"],
    )

    # split the row index into two separate columns
    topics_df.index = pd.MultiIndex.from_tuples(topics_df.index, names=["topic", "word"])

    # reset the index to turn the MultiIndex into columns
    topics_df = topics_df.reset_index()

    if model_choice == "lda":
        score_col = "probability"
    elif model_choice == "lsa":
        score_col = "svd_score"
    elif model_choice == "nmf":
        score_col = "tfidf_score"
    elif model_choice == "bertopic":
        score_col = "c-tfidf_score"

    topics_df = topics_df.rename(columns={"value": score_col})

    pivoted_df = topics_df.pivot_table(
        index="topic",
        columns=topics_df.groupby(["topic"]).cumcount() + 1,
        values=["word", score_col],
        aggfunc="first",
    ).reset_index()

    pivoted_df.columns = ["_".join(map(str, col)).strip() for col in pivoted_df.columns.values]
    pivoted_df = pivoted_df.rename(columns={"topic_": "topic_id"})

    column_order = ["topic_id"]
    for i in range(1, len(pivoted_df.columns[2:]), 2):
        column_order += [f"word_{i // 2 + 1}", f"{score_col}_{i // 2 + 1}"]

    pivoted_df = pivoted_df.reindex(columns=column_order)

    return pivoted_df

In [12]:
data = pd.read_csv("data/raw/reviews.csv", parse_dates=["Time"])
preprocessor = Preprocessor(data)
preprocessor.clean_csv()
pre_processed_df = preprocessor.clean_df

In [13]:
topics_dict = run_training_pipeline("bertopic", pre_processed_df)
pivoted_df_bertopic = topics_dict_to_df("bertopic", topics_dict)
pivoted_df_bertopic

Running BertTopic Model...
Loading existing embeddings...


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Unnamed: 0,topic_id,word_1,c-tfidf_score_1,word_2,c-tfidf_score_2,word_3,c-tfidf_score_3,word_4,c-tfidf_score_4,word_5,c-tfidf_score_5,word_6,c-tfidf_score_6,word_7,c-tfidf_score_7,word_8,c-tfidf_score_8,word_9,c-tfidf_score_9
0,-1,not,0.02802,taste,0.023138,my,0.021959,but,0.021287,like,0.019599,great,0.019227,good,0.0192,product,0.018608,flavor,0.018452
1,0,coffee,0.125342,cup,0.059497,not,0.027689,taste,0.026875,my,0.025352,roast,0.024339,flavor,0.02331,but,0.022369,like,0.022214
2,1,sauce,0.044464,soup,0.033543,noodle,0.033496,pasta,0.029924,not,0.024354,use,0.023337,chicken,0.023269,rice,0.022272,cook,0.020872
3,2,price,0.036529,box,0.030241,product,0.029799,not,0.028664,candy,0.028271,order,0.028127,buy,0.026924,store,0.023921,get,0.023842
4,3,tea,0.179459,green,0.031936,taste,0.029419,not,0.027178,drink,0.025867,like,0.025148,flavor,0.02287,my,0.022512,but,0.020881
5,4,dog,0.113213,treat,0.047125,food,0.045221,my,0.035419,china,0.031445,pet,0.029852,she,0.025087,not,0.024694,product,0.02374
6,5,gluten,0.06652,cake,0.05531,free,0.048723,mix,0.047531,pancake,0.036596,make,0.035302,bread,0.034654,not,0.02818,flour,0.026058
7,6,drink,0.074421,orange,0.057521,juice,0.057453,soda,0.054265,water,0.048409,tangerine,0.037472,taste,0.036581,not,0.03086,flavor,0.030211
8,7,snack,0.052607,baby,0.046251,my,0.02601,love,0.025992,food,0.025185,not,0.024783,but,0.022703,stool,0.020755,she,0.020642
9,8,chocolate,0.141143,hot,0.075328,cocoa,0.074148,not,0.038582,taste,0.033418,milk,0.02814,like,0.026121,dark,0.024745,cup,0.024336


In [14]:
topics_dict = run_training_pipeline("lda", pre_processed_df)
pivoted_df_lda = topics_dict_to_df("lda", topics_dict)
pivoted_df_lda

Running LDA Model...


Unnamed: 0,topic_id,word_1,probability_1,word_2,probability_2,word_3,probability_3,word_4,probability_4,word_5,probability_5,word_6,probability_6,word_7,probability_7,word_8,probability_8,word_9,probability_9
0,0,not,0.030836,bag,0.01813,buy,0.015524,bean,0.015367,open,0.013768,out,0.012479,use,0.009233,but,0.008797,no,0.008196
1,1,coffee,0.062392,cup,0.031824,not,0.024142,taste,0.022073,my,0.017574,flavor,0.015865,like,0.014552,but,0.01368,try,0.01142
2,2,not,0.033842,taste,0.030475,but,0.023787,chip,0.023149,good,0.021539,great,0.017463,like,0.017068,flavor,0.016423,salt,0.012802
3,3,tea,0.031522,not,0.030787,taste,0.02322,my,0.018827,like,0.018232,but,0.016931,flavor,0.013319,product,0.010994,very,0.010002
4,4,use,0.018665,great,0.015133,sugar,0.015071,milk,0.01484,oil,0.013255,make,0.012713,taste,0.012488,mix,0.012281,add,0.009956
5,5,water,0.017663,product,0.016144,jerky,0.01452,use,0.014257,not,0.011972,can,0.011634,bottle,0.009508,up,0.009487,pork,0.008062
6,6,food,0.042827,dog,0.040572,my,0.023961,cat,0.022933,treat,0.020602,free,0.019674,gluten,0.019327,she,0.019102,not,0.017029
7,7,my,0.037252,love,0.031351,very,0.017687,great,0.015945,make,0.012661,not,0.011125,snack,0.011123,eat,0.011094,good,0.011023
8,8,not,0.053335,but,0.028189,like,0.021571,taste,0.021004,good,0.014389,flavor,0.011667,drink,0.010369,very,0.008822,be,0.008657
9,9,price,0.024633,my,0.024145,not,0.023321,amazon,0.020168,product,0.019177,buy,0.01622,get,0.015788,store,0.014377,but,0.013959


In [15]:
topics_dict = run_training_pipeline("lsa", pre_processed_df)
pivoted_df_lsa = topics_dict_to_df("lsa", topics_dict)
pivoted_df_lsa

Running LSA Model...


Unnamed: 0,topic_id,word_1,svd_score_1,word_2,svd_score_2,word_3,svd_score_3,word_4,svd_score_4,word_5,svd_score_5,word_6,svd_score_6,word_7,svd_score_7,word_8,svd_score_8,word_9,svd_score_9
0,0,taste,1.0,like,6.282117e-15,good,7.943588e-17,make,9.907246000000001e-33,love,4.0461219999999995e-38,use,2.8393849999999996e-38,try,1.9535699999999997e-44,tea,4.735315e-53,food,6.590657e-56
1,1,like,1.0,good,2.12806e-15,great,5.976261e-19,healthy,7.737317e-21,product,4.788377e-25,really,1.986234e-28,tea,1.311167e-29,actually,1.2810149999999999e-30,able,7.888552000000001e-31
2,2,good,1.0,absolutely,4.3056249999999996e-19,flavor,1.714295e-22,really,6.173895e-25,tea,4.09492e-26,actually,1.327308e-26,eat,6.531213000000001e-29,order,1.18853e-30,able,3.006014e-33
3,3,great,1.0,product,1.095949e-13,good,1.316065e-14,really,1.009854e-16,tea,6.778235e-18,actually,1.482882e-18,eat,1.036876e-20,healthy,5.312923e-22,order,1.872791e-22
4,4,product,1.0,flavor,7.761351e-12,food,9.644800000000001e-17,actually,3.27996e-17,actual,6.006682999999999e-19,healthy,1.1824279999999999e-20,good,1.083578e-20,love,4.372236e-23,time,5.872763e-24
5,5,flavor,1.0,great,7.988106e-15,food,1.844357e-16,actually,3.589226e-18,make,3.021123e-18,healthy,1.248196e-21,use,1.142272e-22,time,8.045872000000001e-23,buy,1.548818e-23
6,6,make,1.0,use,3.3498960000000004e-17,product,1.119376e-17,buy,4.892745e-18,great,1.1276909999999999e-20,really,1.818857e-21,dog,1.8587630000000002e-22,tea,1.736565e-22,small,5.762373000000001e-23
7,7,use,1.0,buy,5.878217e-15,food,7.422174e-16,small,2.6159980000000003e-17,tea,1.58931e-17,addition,6.5887349999999995e-19,time,3.6355439999999997e-19,amazon,5.7909e-21,added,5.949079e-22
8,8,love,1.0,buy,1.770961e-12,use,9.735723e-13,try,1.113198e-15,coffee,7.976234e-16,food,2.69186e-16,small,7.704666000000001e-17,price,1.0665190000000001e-17,eat,1.587011e-18
9,9,buy,1.0,try,4.696152e-16,coffee,4.014894e-16,addict,3.409484e-17,price,4.938467e-18,eat,5.660572999999999e-19,order,1.8429559999999998e-20,drink,1.329232e-21,added,7.903189e-22


In [16]:
topics_dict = run_training_pipeline("nmf", pre_processed_df)
pivoted_df_nmf = topics_dict_to_df("nmf", topics_dict)
pivoted_df_nmf

Running NMF Model...


Unnamed: 0,topic_id,word_1,tfidf_score_1,word_2,tfidf_score_2,word_3,tfidf_score_3,word_4,tfidf_score_4,word_5,tfidf_score_5,word_6,tfidf_score_6,word_7,tfidf_score_7,word_8,tfidf_score_8,word_9,tfidf_score_9
0,0,taste,2.627288,like,2.287856,good,2.035633,really,1.031693,try,0.520927,bad,0.482532,buy,0.411547,think,0.322734,know,0.246372
1,1,coffee,2.845298,cup,0.871045,strong,0.278883,roast,0.273839,bean,0.2629,brew,0.247357,flavor,0.231823,blend,0.209479,keurig,0.19344
2,2,tea,2.592868,green,0.350734,bag,0.178501,black,0.148369,drink,0.139501,iced,0.135607,ice,0.131163,cup,0.128397,brew,0.128151
3,3,price,1.560091,store,0.979057,amazon,0.902097,buy,0.838037,order,0.601464,grocery,0.527729,local,0.515014,box,0.505404,good,0.339445
4,4,great,2.225554,taste,0.552187,price,0.146747,work,0.145819,easy,0.114071,snack,0.106727,add,0.090838,little,0.088986,day,0.087695
5,5,love,2.563444,kid,0.398802,old,0.238195,son,0.238119,family,0.193926,try,0.191623,flavor,0.187745,year,0.179528,eat,0.159794
6,6,dog,1.622049,food,1.367988,treat,0.696253,cat,0.598278,eat,0.403494,china,0.299311,make,0.246392,pet,0.228374,chicken,0.211606
7,7,product,1.934616,use,0.92514,make,0.566229,purchase,0.325223,time,0.298985,easy,0.286761,quality,0.285431,oil,0.21882,sauce,0.213923
8,8,flavor,1.397245,drink,1.214563,water,0.92724,sugar,0.612582,juice,0.533786,add,0.513239,orange,0.504766,soda,0.489021,bottle,0.331083
9,9,bar,0.92387,snack,0.864615,chip,0.770121,tasty,0.569708,eat,0.532733,chocolate,0.528581,healthy,0.516477,free,0.462991,cooky,0.403897
