## Topic Modeling NMF

NMF
Non-negative Matrix Factorization (NMF) is an unsupervised technique, so the model is trained without any topic labels. NMF decomposes (or factorizes) high-dimensional vectors into a lower-dimensional representation. These lower-dimensional vectors are non-negative, which also means that their coefficients are non-negative.

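Using the original matrix (A), NMF will give you two matrices (W and H). W holds the topics found, and H holds the coefficients (weights) for these topics. In other words, A is comments by words (the original), H is comments by topics, and W is topics by words. Note that scikit-learn names these the other way round: its W is the comments-by-topics matrix returned by `transform`, and its H (`components_`) is the topics-by-words matrix.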

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import warnings
# Silence library warnings emitted from here on.
warnings.filterwarnings('ignore')

# Load the preprocessed training set and force the lemmatised comment column
# to str so that every entry is a string.
merged_df = pd.read_excel("preprocessed_train.xlsx")
merged_df["comment_text_lemm"] = merged_df["comment_text_lemm"].astype(str)
clmn = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# True for rows in which every label column equals 0.
mask = merged_df[clmn].eq(0).all(axis=1)

# Rows carrying at least one non-zero label.
filtered_df = merged_df.loc[~mask]
# NOTE(review): filtered_df is only referenced by the disabled line below, so
# the cells that follow work on the full merged_df -- confirm this is intended.
#merged_df = filtered_df.head(5000) # part for testing
merged_df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import NMF

# Learn the TF-IDF vocabulary and weights from the lemmatised comments and
# encode those same comments in one call; X is the resulting matrix.
lemmatized_comments = merged_df['comment_text_lemm']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemmatized_comments)

In [None]:
# Make sure the lemmatised column holds strings before using the .str accessor.
merged_df['comment_text_lemm'] = merged_df['comment_text_lemm'].astype(str)

# Lower-cased copy of the text.
merged_df['cleaned_text'] = merged_df['comment_text_lemm'].str.lower()

# One list of word tokens per comment.
merged_df['tokens'] = merged_df['cleaned_text'].apply(word_tokenize)
merged_df

In [None]:
%%time
from gensim import corpora, models
# Seed NumPy's global RNG so that later steps drawing from it are repeatable.
np.random.seed(42)

# Tokenised comments: one list of tokens per document.
texts = merged_df['tokens'].values

# Build the vocabulary over the whole corpus first, then keep the 2000 tokens
# with the highest document frequency.  Passing `prune_at=2000` to the
# constructor only bounds memory while scanning; the gensim docs state that its
# result is not guaranteed to be correct and point to `filter_extremes` for
# proper filtering.  no_below=1 / no_above=1.0 switch off the document-frequency
# cut-offs so that `keep_n` is the only filter applied.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=1.0, keep_n=2000)

# Bag-of-words encoding of every document against the filtered vocabulary.
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
%%time
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

# Sweep the number of topics and record u_mass coherence for every setting.
# Each entry of coh_list is the list of per-topic scores returned by
# get_coherence_per_topic() for one model (first entry <-> 3 topics).
coh_list = []
topic_counts = range(3, 51)
for n_topics in topic_counts:
    # Fit a gensim NMF model with this many topics (fixed seed).
    nmf = Nmf(
        corpus=corpus,
        num_topics=n_topics,
        id2word=dictionary,
        random_state=42,
    )
    # Score the fitted model against the tokenised texts.
    cm = CoherenceModel(
        model=nmf,
        texts=texts,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence = cm.get_coherence_per_topic()
    coh_list.append(coherence)

In [None]:
%%time
from sklearn.decomposition import NMF

# The TF-IDF matrix built earlier is the input to the factorisation.
tfidf = X

n_topics = 7        # number of NMF components (topics)
n_top_words = 10    # how many top-weighted terms to print per topic

# Fit the model, then project the same documents onto the learned topics.
# NOTE(review): in scikit-learn, l1_ratio only takes effect together with a
# non-zero regularisation strength (alpha_W / alpha_H) -- confirm the intent.
nmf = NMF(n_components=n_topics, random_state=42, l1_ratio=.5)
nmf.fit(tfidf)
nmf_embedding = nmf.transform(tfidf)

feature_names = vectorizer.get_feature_names_out()
print("Topics found via NMF:")
for topic_idx, topic in enumerate(nmf.components_):
    print(f"\nTopic {topic_idx}:")
    # Indices of the n_top_words largest weights, largest first.
    top_term_ids = topic.argsort()[::-1][:n_top_words]
    print(" ".join(f"[{feature_names[i]}]" for i in top_term_ids))
print()

In [None]:
# Human-readable label for each NMF topic; position i labels topic i.
topics = [
    "Mental Abilities and Origin",           # topic 0
    "Complaints and Requests for Blocking",  # topic 1
    "References to Anatomy",                 # topic 2
    "Homophobia",                            # topic 3
    "Calls for Suicide",                     # topic 4
    "Belittling",                            # topic 5
    "Hatred",                                # topic 6
]

In [None]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider, Range1d
from bokeh.layouts import column
from bokeh.palettes import all_palettes

In [None]:
import umap
import numpy as np

# 2-D UMAP projection of the TF-IDF vectors (cosine metric, fixed seed).
umap_embr = umap.UMAP(
    n_neighbors=10,
    metric='cosine',
    min_dist=0.1,
    random_state=42,
)
# The sparse TF-IDF matrix is converted to a dense ndarray before fitting; the
# resulting coordinates are stored in columns 'x' and 'y'.
embedding = pd.DataFrame(
    umap_embr.fit_transform(tfidf.toarray()),
    columns=['x', 'y'],
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

%matplotlib inline

# Project the NMF topic-term vectors into the already fitted UMAP space.
centroids = umap_embr.transform(nmf.components_)

# Colour every document by its highest-weighted topic.
embedding['hue'] = nmf_embedding.argmax(axis=1)
palette = all_palettes['Category20'][20]
my_colors = [palette[topic_id] for topic_id in embedding['hue']]

# One legend swatch per topic, in palette order.
legend_list = [
    mpatches.Ellipse((0, 0), 1, 1, fc=swatch_color)
    for swatch_color in palette[:n_topics]
]

fig, ax = plt.subplots(figsize=(12, 13))
ax.scatter(embedding['x'], embedding['y'], c=my_colors, alpha=0.8)
# Topic vectors are drawn as black crosses on top of the documents.
ax.scatter(centroids[:, 0], centroids[:, 1], c='black', s=100, alpha=1, marker='x')
fig.legend(legend_list, topics, loc=(0.18, 0.89), ncol=3)
plt.subplots_adjust(top=0.82);