In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import NMF
from scipy.special import kl_div
import pandas as pd
import numpy as np
import math

with open("TREC/texts.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]  

df = pd.DataFrame(lines, columns=["text"])

In [10]:
print(df.describe)
print(df.info())
print(df.head())

<bound method NDFrame.describe of                                                    text
0     How did serfdom develop in and then leave Russ...
1      What films featured the character Popeye Doyle ?
2     How can I find a list of celebrities ' real na...
3     What fowl grabs the spotlight after the Chines...
4                       What is the full form of .com ?
...                                                 ...
5947             Who was the 22nd President of the US ?
5948             What is the money they use in Zambia ?
5949                          How many feet in a mile ?
5950                What is the birthstone of October ?
5951                                   What is e-coli ?

[5952 rows x 1 columns]>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5952 entries, 0 to 5951
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5952 non-null   object
dtypes: object(1)
memory usage: 46.6+ KB
None
        

### NMF (Non-negative matrix factorization)

Resources: https://medium.com/codex/what-is-non-negative-matrix-factorization-nmf-32663fb4d65,
https://medium.com/voice-tech-podcast/topic-modelling-using-nmf-2f510d962b6e

Theres a bit of math used in NMF explained below:

### Kullback-Leibler Divergence (elementwise):

kl_div(x, y)

if x > 0 and y > 0:
    x log (x/y) - x + y  

if x = 0 and y >= 0:
    y  

else:
    nfinity  



In [11]:
# Saw this implementation in Medium: https://medium.com/voice-tech-podcast/topic-modelling-using-nmf-2f510d962b6e
def kv_div(x, y):
    return np.sum(x[i] * math.log2(x[i] / y[i]) for i in range(len(x)))
    

There's also the Frobenius Norm or Euclidean Norm (if you need the formula, make a web search)

In [12]:
frobenius_norm = lambda x : np.linalg.norm(x)

Optimization is needed to improve model:

### Two types of optimizations present in Scikit learn:  
Coordinate Descent Solver  
Multiplicative update Solver

## Using tf-idf on the data:

In [13]:
vectorizer = TfidfVectorizer(max_features=1500, min_df=10, stop_words='english')
X = vectorizer.fit_transform(df['text'])
words = np.array(vectorizer.get_feature_names_out())

print(X)
print(words)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9774 stored elements and shape (5952, 407)>
  Coords	Values
  (0, 108)	1.0
  (1, 68)	1.0
  (2, 219)	0.6139327599674049
  (2, 303)	0.5772910008261286
  (2, 254)	0.5383508768488926
  (3, 74)	0.8223115885946162
  (3, 404)	0.5690374779072103
  (4, 148)	0.6790684458447013
  (4, 86)	0.7340749592909855
  (6, 108)	0.31654393499346684
  (6, 358)	0.5268021683515133
  (6, 28)	0.5244917740004745
  (6, 339)	0.5892264349508014
  (7, 269)	1.0
  (9, 268)	0.6708318344556677
  (9, 388)	0.7416094995891325
  (10, 47)	1.0
  (11, 369)	1.0
  (13, 200)	1.0
  (15, 345)	0.599547748026181
  (15, 44)	0.800338989326857
  (16, 108)	1.0
  (20, 102)	0.7523895114804765
  (20, 103)	0.6587184702239416
  (22, 131)	1.0
  :	:
  (5932, 334)	1.0
  (5933, 148)	1.0
  (5934, 201)	0.624782260772983
  (5934, 154)	0.7807990308801622
  (5936, 307)	0.6502980527329315
  (5936, 310)	0.7596791708423744
  (5938, 388)	0.6627856184300285
  (5938, 204)	0.5675510527146649
  (5938

I'll be using Scikit implementation of NMF:

In [14]:
nmf = NMF(n_components=10, solver="mu")
W = nmf.fit_transform(X)
H = nmf.components_

for i, topic in enumerate(H):
      print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in words[topic.argsort()[-3:]]])))

Topic 1: stand,mean,does
Topic 2: die,year,did
Topic 3: largest,origin,country
Topic 4: war,largest,world
Topic 5: cold,food,fear
Topic 6: united,states,president
Topic 7: airport,largest,city
Topic 8: new,year,state
Topic 9: group,island,called
Topic 10: live,invented,people


The original LabPi docs also teached BERTopic, but I couldn't use it for some reason