In [None]:
import pandas as pd
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Import our Python Classes
from src.models.topic_modelling.NMF import NMFModel
from src.data.preprocess import Preprocessor


In [None]:
# Read the raw reviews, then run them through the project's Preprocessor.
raw_reviews = pd.read_csv("../../data/raw/reviews.csv")
preprocessor = Preprocessor(raw_reviews)
preprocessor.clean_csv()

# `clean_df` is read back from the preprocessor after `clean_csv()`.
# Presumably it holds the cleaned reviews; it must provide the `cleaned_text`
# column that the TF-IDF step reads -- confirm in src/data/preprocess.py.
df = preprocessor.clean_df
df.head()

# Topic modelling with NMF (Non-negative Matrix Factorization)

In [None]:
# Build the TF-IDF document-term matrix from the cleaned review text.
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_features=1000,     # keep top 1000 terms (ranked by corpus term frequency)
    max_df=0.5,            # ignore terms that occur in more than half of the documents
    smooth_idf=True,
)

X = vectorizer.fit_transform(df['cleaned_text'])

X.shape  # check shape of the document-term matrix

In [None]:
# Factorise the TF-IDF matrix into 10 non-negative components; each component
# is treated as one topic. The seed is pinned so reruns give the same topics.
nmf_model = NMF(n_components=10, random_state=5).fit(X)

# Project the same TF-IDF matrix onto the fitted topics: nmf_features
nmf_features = nmf_model.transform(X)

# Number of topics (rows of the component matrix)
nmf_model.components_.shape[0]

In [None]:
# Wrap the topic-term matrix in a DataFrame: one row per topic,
# one column per vocabulary term.
vocabulary = vectorizer.get_feature_names_out()
components_df = pd.DataFrame(data=nmf_model.components_, columns=vocabulary)
components_df

In [None]:
# Print the ten highest-weighted terms of every topic (topics numbered from 1).
for topic_number, (_, term_weights) in enumerate(components_df.iterrows(), start=1):
    print(f'For topic {topic_number} the words with the highest value are:')
    print(term_weights.nlargest(10))
    print('\n')

In [None]:
def topic_table(n_top_words, feature_names, nmf):
    """Summarise every topic of a fitted model by its highest-weighted terms.

    Parameters
    ----------
    n_top_words : int
        Number of terms to list per topic. Must be non-negative; values larger
        than the vocabulary simply list every term.
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``nmf.components_``.
    nmf : object
        Fitted model exposing ``components_``: an iterable of 1-D numpy arrays,
        one weight vector per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic, indexed by topic position, with a single
        ``Top_Topic_Terms`` column holding the terms ordered by decreasing
        weight and joined by single spaces (no trailing whitespace).

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError(f"n_top_words must be non-negative, got {n_top_words}")

    topics = {}
    for topic_idx, topic_vec in enumerate(nmf.components_):
        # argsort is ascending, so reverse it to visit the largest weights first.
        top_term_ids = topic_vec.argsort()[::-1][:n_top_words]
        topics[topic_idx] = " ".join(feature_names[fid] for fid in top_term_ids)
    return pd.DataFrame({'Top_Topic_Terms': topics})

# Label topics with top 5 terms
topic_df = topic_table(5, vectorizer.get_feature_names_out(), nmf_model)

# Manually label topics. These names were written by hand for the 10 topics of
# the model fitted above, in topic order; revisit them whenever the vectorizer
# or NMF settings change.
topic_labels = ['Flavoured Drinks/Juices', 'Tea', 'Coffee', 'Price', 'Taste', 'Pet Food', 'Healthy', 'Chocolate/Sweet Snacks', 'Quality/Delivery', 'Satisfaction']
if len(topic_labels) != len(topic_df):
    raise ValueError(
        f"Expected one label per topic: got {len(topic_labels)} labels "
        f"for {len(topic_df)} topics."
    )
topic_df['Label'] = topic_labels

# Getting weights to classify our dataset. `nmf_features` already holds
# nmf_model.transform(X) for these same documents, so reuse it instead of
# re-vectorising and re-transforming the whole corpus.
document_weights = nmf_features

# Store most representative topic
df["Topic_idx"] = document_weights.argmax(axis=1)

# Joining the original dataset with labels. Drop any topic columns left by a
# previous run of this cell first, so re-running it does not create suffixed
# duplicates (`Label_x`, `Label_y`, ...).
df = pd.merge(
    df.drop(columns=list(topic_df.columns), errors="ignore"),
    topic_df,
    left_on='Topic_idx',
    right_index=True,
    how='left',
)

In [None]:
# Preview the reviews, which now carry the Topic_idx, Top_Topic_Terms and Label columns
df.head()

# Using our NMF Python Class

In [None]:
# Hand the DataFrame to the project's NMFModel wrapper and run its fit/transform step.
# Presumably this mirrors the manual TF-IDF + NMF walkthrough -- see
# src/models/topic_modelling/NMF.py to confirm.
# NOTE(review): `df` at this point also carries the Topic_idx / Top_Topic_Terms /
# Label columns added by the manual walkthrough -- confirm NMFModel ignores them.
model = NMFModel(df)
model.fit_transform()

In [None]:
# Get the top words for each topic
# (the return type is defined by NMFModel.get_topic_terms; the result is stored
# here but not displayed)
top_words = model.get_topic_terms()

In [None]:
# Get the labels for each document
# (presumably a pandas object, since `.head()` is called on it afterwards -- confirm)
labels = model.get_labels()

In [None]:
# Preview the first few entries of the labels returned by the model
labels.head()