In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation

# Baseline text classifier: TF-IDF features (vocabulary capped at 1000 terms)
# feeding a logistic regression left at its default settings.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=1000))
clf_step = ('clf', LogisticRegression())
pipeline = Pipeline(steps=[tfidf_step, clf_step])

In [63]:
import pandas as pd

# Read the three IMDB parquet splits (train, test, unsupervised) from the
# Hugging Face hub, in that order, and bind each to its own DataFrame.
train_df, test_df, unsupervised_df = map(
    pd.read_parquet,
    (
        "hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet",
        "hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet",
        "hf://datasets/stanfordnlp/imdb/plain_text/unsupervised-00000-of-00001.parquet",
    ),
)


In [64]:
# Fit the baseline on the training split and score it on the test split.
# `fit` returns the pipeline itself, so the two calls can be chained.
accuracy = (
    pipeline
    .fit(train_df['text'], train_df['label'])
    .score(test_df['text'], test_df['label'])
)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8638


In [None]:
# Exploratory run: a small two-topic LDA on a 5000-review sample of the
# unsupervised split, used only to inspect perplexity and the fitted bound.
# NOTE(review): per the scikit-learn docs `learning_offset` only influences the
# online learning method; with the default batch method it is presumably a
# no-op here -- confirm before tuning it further.
topic_pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100, stop_words='english', min_df=5, max_df=0.8)),
    # random_state pins LDA's random initialisation; without it the numbers
    # printed below change on every run even though the sample is seeded.
    ('lda', LatentDirichletAllocation(n_components=2, learning_offset=150, n_jobs=-1, random_state=42)),
])

sample = unsupervised_df['text'].sample(5000, random_state=42)
topic_pipeline.fit(sample)

# Look the fitted steps up once instead of repeating the named_steps access.
lda_model = topic_pipeline.named_steps['lda']
sample_counts = topic_pipeline.named_steps['vect'].transform(sample)

# Perplexity recomputed on the training sample, followed by the value LDA
# stored in `bound_` at the end of fitting.
print(lda_model.perplexity(sample_counts))
print(lda_model.bound_)



54.909605079683814
54.909605079683814


np.float64(21.089949665885218)

In [65]:
# Topic model used for feature extraction: bag-of-words counts over a
# 1000-term vocabulary, reduced to 20 LDA topics. It is fitted on a seeded
# 5000-review sample of the unsupervised split.
# NOTE(review): no stop-word filtering is applied here, so very common words
# can dominate the topics -- consider stop_words='english' if that matters.
topic_pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=1000)),
    # random_state makes the fitted topics (and every downstream score that
    # uses them) reproducible across kernel restarts.
    ('lda', LatentDirichletAllocation(n_components=20, random_state=42)),
])
# Kept as the last expression so the notebook renders the fitted pipeline.
topic_pipeline.fit(unsupervised_df['text'].sample(5000, random_state=42))


0,1,2
,steps,"[('vect', ...), ('lda', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,n_components,20
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [66]:
# List the ten highest-weighted vocabulary terms of every fitted LDA topic.
lda_step = topic_pipeline.named_steps['lda']
vect_step = topic_pipeline.named_steps['vect']
topics = lda_step.components_
feature_names = vect_step.get_feature_names_out()
for idx, topic in enumerate(topics):
    # argsort is ascending, so stepping backwards from the end yields the
    # ten largest weights, biggest first.
    top_features = topic.argsort()[:-11:-1]
    print(f"Topic {idx + 1}:", ", ".join(feature_names[top_features]), sep="\n")

Topic 1:
poorly, dumb, acted, done, better, actress, little, could, than, story
Topic 2:
the, of, and, to, is, in, br, that, it, are
Topic 3:
budget, low, jack, movie, and, big, actors, the, in, fun
Topic 4:
book, was, the, in, dog, and, he, played, detective, story
Topic 5:
the, of, and, in, is, to, as, with, film, it
Topic 6:
the, it, was, and, to, this, that, in, of, film
Topic 7:
the, of, to, is, and, it, film, this, in, that
Topic 8:
very, good, movie, really, story, it, and, this, the, is
Topic 9:
the, in, of, was, series, were, that, and, but, there
Topic 10:
show, he, shows, all, just, this, the, and, what, know
Topic 11:
sam, humor, intelligent, tom, die, crew, interest, under, truth, cop
Topic 12:
the, to, and, that, it, of, is, they, in, this
Topic 13:
was, the, of, movie, and, to, this, were, in, as
Topic 14:
is, her, she, and, the, but, it, to, this, in
Topic 15:
her, she, to, and, in, was, the, mother, for, had
Topic 16:
the, to, and, his, he, of, br, in, is, with
Topic 1

In [67]:
# Run the labelled train and test reviews through the fitted topic pipeline
# to obtain their topic features, in that order.
topics_train, topics_test = (
    topic_pipeline.transform(split['text']) for split in (train_df, test_df)
)


In [68]:
# Standalone TF-IDF features (1000-term vocabulary): the vocabulary and IDF
# weights are learned from the training reviews only, then applied to test.
train_text, test_text = train_df['text'], test_df['text']
vect = TfidfVectorizer(max_features=1000)
train_vects = vect.fit_transform(train_text)
test_vects = vect.transform(test_text)


In [69]:

# Logistic regression trained on the topic features alone; `fit` returns the
# estimator, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(topics_train, train_df['label'])
test_accuracy = clf.score(topics_test, test_df['label'])
print(f"Test Accuracy on Topic Model: {test_accuracy:.4f}")

Test Accuracy on Topic Model: 0.7725


In [70]:
import numpy as np
# Build the combined feature matrices: topic features first, then the
# densified TF-IDF columns, joined side by side (column-wise).
train_data = np.concatenate([topics_train, train_vects.toarray()], axis=1)
test_data = np.concatenate([topics_test, test_vects.toarray()], axis=1)

In [71]:
# Logistic regression trained on the combined matrix (topic features stacked
# with TF-IDF columns) and scored on the matching test matrix.
clf = LogisticRegression(max_iter=1000)
clf.fit(train_data, train_df['label'])
test_accuracy = clf.score(test_data, test_df['label'])
# The label previously repeated the topic-only cell's wording, which made
# this combined-feature result indistinguishable from the topic-only one.
print(f"Test Accuracy on Topic + TF-IDF Features: {test_accuracy:.4f}")

Test Accuracy on Topic Model: 0.8616
