In [1]:
import pandas as pd

# Load the train and test splits of the stanfordnlp/imdb dataset (plain_text config)
# from their parquet files via the hf:// URL scheme.
train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet")


# Alternative dataset (Yelp review full) — uncomment to use it instead of IMDB:
# splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
# train_df = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
# test_df = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])

In [66]:


# Alternative dataset (AG News) — uncomment to use it instead of IMDB:
# splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
# train_df = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["train"])
# test_df = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["test"])

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
import numpy as np

n_topics = 2

# Bag-of-words counts feeding an LDA topic model with `n_topics` components.
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            stop_words='english',
            max_df=0.5,
            min_df=5,
            max_features=1000,
        ),
    ),
    (
        'lda',
        LatentDirichletAllocation(
            n_components=n_topics,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

# Quick smoke fit on 100 randomly sampled training reviews; kept as the final
# expression so the cell still displays the fitted pipeline.
pipeline.fit(train_df['text'].sample(n=100, random_state=42))


0,1,2
,steps,"[('vect', ...), ('lda', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,n_components,2
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [26]:
# Pull the two fitted stages back out of the pipeline by their step names.
vect, lda = (pipeline.named_steps[step] for step in ('vect', 'lda'))

components = lda.components_
feature_names = vect.get_feature_names_out()
print(components.shape)

(2, 304)


In [27]:
n_top_words = 10
# Walk the topic rows of `components` and print, for each one, the
# `n_top_words` feature names with the largest values, largest first.
for topic_number, term_weights in enumerate(components, start=1):
    ranked = term_weights.argsort()[::-1][:n_top_words]
    top_terms = ', '.join(feature_names[idx] for idx in ranked)
    print(f"Topic #{topic_number}: {top_terms}")

Topic #1: just, story, time, like, actors, plot, really, way, think, best
Topic #2: like, good, just, really, bad, people, love, best, did, films


In [28]:

def compute_perplexity(pipeline, texts):
    """Return the LDA perplexity of `texts` under an already-fitted pipeline.

    Parameters
    ----------
    pipeline
        Object exposing ``named_steps`` with a ``'vect'`` step (providing
        ``transform``) and an ``'lda'`` step (providing ``perplexity``).
    texts
        Collection of raw documents, passed unchanged to the vectorizer.

    Returns
    -------
    Whatever ``lda.perplexity`` returns for the vectorized documents.
    """
    steps = pipeline.named_steps
    topic_model, vectorizer = steps['lda'], steps['vect']
    # Only transform here: the vocabulary learned at fit time is reused as-is.
    doc_term_matrix = vectorizer.transform(texts)
    return topic_model.perplexity(doc_term_matrix)

# Score the currently fitted pipeline on 100 randomly sampled test reviews.
test_perplexity = compute_perplexity(
    pipeline,
    test_df['text'].sample(n=100, random_state=42),
)
print("Test Perplexity:", test_perplexity)

Test Perplexity: 319.8898003995262


In [34]:
# Sweep the number of LDA topics and keep the value with the lowest perplexity
# on the evaluation sample.
topic_range = list(range(2, 9))
perplexities = []

# 5000 training reviews used for fitting.
this_df = train_df['text'].sample(n=5000, random_state=42)
# NOTE: despite the "_train" suffix, this sample is drawn from test_df and is
# only used for evaluation below.
this_df_train = test_df['text'].sample(n=5000, random_state=42)

for n_topics in topic_range:
    # A fresh pipeline per candidate; this rebinds the notebook-level
    # `pipeline` and `n_topics` names on every iteration.
    pipeline = Pipeline(steps=[
        (
            'vect',
            CountVectorizer(
                stop_words='english',
                max_df=0.5,
                min_df=5,
                max_features=1000,
            ),
        ),
        (
            'lda',
            LatentDirichletAllocation(
                n_components=n_topics,
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]).fit(this_df)

    perplexity = compute_perplexity(pipeline, this_df_train)
    perplexities.append(perplexity)
    print(f"n_topics={n_topics}, test perplexity={perplexity}")

# np.argmin returns the first index of the minimum, so ties go to the smaller
# topic count.
best_n_topics = topic_range[int(np.argmin(perplexities))]
print("Best n_topics:", best_n_topics)

n_topics=2, test perplexity=679.0448292950306
n_topics=3, test perplexity=688.0917522575834
n_topics=4, test perplexity=700.957180498238
n_topics=5, test perplexity=708.6130819591726
n_topics=6, test perplexity=720.8062805101179
n_topics=7, test perplexity=727.9498067258128
n_topics=8, test perplexity=741.2119501524587
Best n_topics: 2


## Perplexity in Language Models

**Perplexity** is a measurement of how well a probabilistic model predicts a sample. In the context of language models, it quantifies how surprised the model is by the actual sequence of words. A lower perplexity indicates better predictive performance.

### Definition
Let a language model assign probabilities to a sequence of words $w_1, w_2, \dots, w_N$. The **perplexity (PP)** is defined as:

$$
PP(W) = \exp \left( -\frac{1}{N} \sum_{i=1}^{N} \log P(w_i) \right)
$$

Or equivalently:

$$
PP(W) = \left( \prod_{i=1}^{N} \frac{1}{P(w_i)} \right)^{\frac{1}{N}}
$$

### Interpretation
- Perplexity is the **inverse probability** of the test set, normalized by the number of words.
- It can be interpreted as the average number of word choices the model is "perplexed" by at each position.
- **Lower is better**: A perfect model would have a perplexity of 1.

### Example
If a model predicts each word in a sentence perfectly with probability = 1, then:
- $\log P(w_i) = 0$ for each word
- Perplexity $= \exp(0) = 1$ → No perplexity, the model is certain
