In [7]:
__author__='Xuan Nguyen'
__NetID__='XDN240000'

# NLP PROJECT — IMDB Movie Reviews

In [8]:
#Import important libraries
import pandas as pd
import re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel, LdaModel


In [9]:
# Download resources if missing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(stopwords.words('english'))
custom_stopwords = stopwords.union({"film", "films", "movie", "movies", "one", "make", "see", "get", "time",
    "watch", "watching", "seen", "good", "like", "really",
    "even", "dont", "would", "know", "think", "im", "people", "guy", "thing",
    "show", "story", "character", "characters", "life", "well", "also",
    "plot", "scene", "scenes", "acting","many","much","way","didnt","go","say","something","could","made"})
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt_tab to /Users/huy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/huy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/huy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## PART 1 — TEXT PREPROCESSING

In [10]:
# --- Basic text cleaning ---
# Initialize stopwords and lemmatizer ONCE


def base_clean(text):
    """Base cleaning function to avoid duplication"""
    text = re.sub(r'<.*?>', '', text.lower())
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

def clean_text(text):
    text = base_clean(text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

data = pd.read_csv("IMDB Dataset.csv")
df = data.sample(n=10000, random_state=42).reset_index(drop=True)

df['clean_review'] = df['review'].apply(clean_text)

# --- Feature extraction ---
count_vectorizer = CountVectorizer(max_df=0.95, min_df=5)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5)
count_data = count_vectorizer.fit_transform(df['clean_review'])
tfidf_data = tfidf_vectorizer.fit_transform(df['clean_review'])
print("Preview of cleaned data")
print(df['clean_review'].head())
print("Preprocessing complete")

Preview of cleaned data
0    liked summerslam due look arena curtain look o...
1    television show appeal quite different kind fa...
2    quickly get major chase ever increasing destru...
3    jane austen definitely approve onegwyneth palt...
4    expectation somewhat high went thought steve c...
Name: clean_review, dtype: object
Preprocessing complete


## PART 2 — SENTIMENT ANALYSIS

In [11]:
#Splitting data

X_train, X_test, y_train, y_test = train_test_split(
    tfidf_data, df['sentiment'], test_size=0.2, random_state=42
)

# Use Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label='positive')
rec = recall_score(y_test, y_pred, pos_label='positive')
f1 = f1_score(y_test, y_pred, pos_label='positive')

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy:  0.8740
Precision: 0.8523
Recall:    0.9051
F1-score:  0.8779

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.84      0.87       999
    positive       0.85      0.91      0.88      1001

    accuracy                           0.87      2000
   macro avg       0.88      0.87      0.87      2000
weighted avg       0.88      0.87      0.87      2000



## PART 3 — TOPIC MODELING (LDA + NMF)

In [12]:
#Extrating topics
N_TOPIC = 5

# --- LDA ---
lda = LatentDirichletAllocation(n_components=N_TOPIC, random_state=42, learning_method='batch')
lda.fit(count_data)


#
# --- NMF ---
nmf = NMF(n_components=N_TOPIC, random_state=42)
nmf.fit(tfidf_data)

def extract_topics(model, vectorizer, top_n=10):
    """Extract top words from LDA topics"""
    feature_names = vectorizer.get_feature_names_out()
    return [[feature_names[i] for i in topic.argsort()[:-top_n-1:-1]]
            for topic in model.components_]
# Extract LDA topics
lda_topics = extract_topics(lda, count_vectorizer)

# Extract NMF topics
nmf_topics = extract_topics(nmf, tfidf_vectorizer)

#create human-assigned labels
lda_labels = [
    "Overall Movie Judgments",
    "Praising Performances, Music, & Comedy",
    "Critiquing Male Roles & Horror",
    "War & Action Films",
    "Drama, Romance, & Adaptations"
]

nmf_labels = [
    "Analyzing Direction & Characters",
    "Strongly Negative Reviews (Waste of Money)",
    "Praising Comedy, Music, & Performances",
    "Discussing Book/Novel Adaptations",
    "TV Series Fan Discussion (Seasons, Episodes)"
]

# Create Pandas DataFrames

lda_df = pd.DataFrame({
    "Topic #": [f"Topic {i + 1}" for i in range(len(lda_topics))],
    "Human Label": lda_labels,
    "Top Words": [", ".join(words) for words in lda_topics]
})

nmf_df = pd.DataFrame({
    "Topic #": [f"Topic {i + 1}" for i in range(len(nmf_topics))],
    "Human Label": nmf_labels,
    "Top Words": [", ".join(words) for words in nmf_topics]
})

#print the DataFrames
#set display options to show full text in columns
pd.set_option('display.max_colwidth', None)

print("\n\n" + "="*30 + " LDA Topics " + "="*30)
print(lda_df.to_string(index=False))

print("\n\n" + "="*30 + " NMF Topics " + "="*30)
print(nmf_df.to_string(index=False))



Topic #                            Human Label                                                               Top Words
Topic 1                Overall Movie Judgments         bad, first, ever, great, look, better, actor, funny, never, lot
Topic 2 Praising Performances, Music, & Comedy great, best, song, music, comedy, year, love, first, still, performance
Topic 3         Critiquing Male Roles & Horror            man, great, role, he, play, horror, end, first, never, actor
Topic 4                     War & Action Films             man, world, two, war, take, first, american, end, come, new
Topic 5          Drama, Romance, & Adaptations   love, work, director, book, woman, performance, two, man, great, play


Topic #                                  Human Label                                                                          Top Words
Topic 1             Analyzing Direction & Characters                    man, woman, end, two, little, take, look, director, work, young
Topic 2   

## PART 4 — EVALUATION & VISUALIZATION

In [13]:
# 1. Compute Topic Coherence (C_v) 

print("Calculating Coherence Scores...")

# Create the tokenized list of lists needed for CoherenceModel
processed_texts = [text.split() for text in df['clean_review']]

# Create a Gensim Dictionary from our tokenized texts
dictionary = Dictionary(processed_texts)

corpus = [dictionary.doc2bow(text) for text in processed_texts]

id2word = dictionary

#define function to compute topic coherence ---
def topic_coherence(topics, texts, dictionary):
    """Compute coherence score"""
    cm = CoherenceModel(topics=topics,
                        texts=texts,
                        dictionary=dictionary,
                        coherence='c_v'
                        )
    return cm.get_coherence()

# Calculate LDA Coherence
coherence_lda = topic_coherence(lda_topics, processed_texts, dictionary)

# Calculate NMF Coherence
coherence_nmf = topic_coherence(nmf_topics, processed_texts, dictionary)

# Report and Compare
print("\n" + "="*30 + " Coherence Score Comparison " + "="*30)
print(f"LDA C_v = {coherence_lda:.4f}, NMF C_v = {coherence_nmf:.4f}")

if coherence_nmf > coherence_lda:
    print("Result: NMF produced slightly more coherent topics.")
elif coherence_lda > coherence_nmf:
    print("Result: LDA produced slightly more coherent topics.")
else:
    print("Result: Both models produced topics with identical coherence.")


Calculating Coherence Scores...

LDA C_v = 0.3749, NMF C_v = 0.4696
Result: NMF produced slightly more coherent topics.


In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# --- 2. Visualize Topics (pyLDAvis – LDA only) ---

print("\nPreparing LDA visualization for Jupyter/Colab...")

# Enable the visualization notebook mode
pyLDAvis.enable_notebook()

# Prepare the visualization data using the consolidated lda_model method

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=N_TOPIC,
    random_state=42,
    passes=10,
    alpha='auto'
)

vis = gensimvis.prepare(lda_model, corpus, id2word)
print("Displaying visualization...")

# Display the visualization
vis

#export visualization as an HTML screenshot
pyLDAvis.save_html(vis, "lda_visualization.html")


Preparing LDA visualization for Jupyter/Colab...
Displaying visualization...


In [15]:
# Interpretation of visualization 
#The LDA visualization shows 5 topics: the two largest topics (1 & 2) overlap heavily, 
#while topics 3 & 4 form a separate cluster and topic 5 is distinct.
#The themes relate to film and acting, but this overlap indicates lower coherence.

## PART 5 — PREPROCESSING VARIANTS STUDY

In [16]:
from itertools import combinations
import time
import psutil
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary


# --- Define Preprocessing Pipelines ---

#Pipeline 1: Lemmatization + Remove StopWords
def clean_lemmatize(text):
    text = base_clean(text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in custom_stopwords]
    return " ".join(tokens)

#Pipeline 2: Stemming + Keep StopWords
stemmer = PorterStemmer()
def clean_stem(text):
    text = base_clean(text)
    tokens = [stemmer.stem(w) for w in text.split()]
    return " ".join(tokens)

#Pipeline 3: Unigrams + Bigrams
def clean_bigram(text):
    text = base_clean(text)  # your existing normalization: lower, strip HTML, etc.
    # Basic tokenization (adjust to your base_clean)
    tokens = [w for w in text.split() if w and w not in custom_stopwords]

    # Build bigrams
    bigrams = ["_".join(pair) for pair in zip(tokens, tokens[1:])]

    # Return BOTH unigrams and bigrams
    return " ".join(tokens + bigrams)


pipelines = {
    "Lemmatization + Remove StopWords": clean_lemmatize,
    "Stemming + Keep StopWords": clean_stem,
    "Unigrams + Bigrams": clean_bigram
}

# --- Reusable evaluation functions ---


# Function to compute topic diversity 
def topic_diversity(topics, top_n=10):
    """Compute diversity score"""
    all_words = [w for topic in topics for w in topic[:top_n]]
    return len(set(all_words)) / len(all_words) if all_words else 0

# Function to compute topic stability
def jaccard_similarity(set1, set2):
    return len(set1 & set2) / len(set1 | set2) if (set1 | set2) else 0.0
 
def compute_stability(count_data, feature_names, seeds=[10, 42, 100], n_topics=N_TOPIC, top_k=10):
    """Compute topic stability across multiple runs"""
    runs = []
    for rs in seeds:
        lda = LatentDirichletAllocation(n_components=N_TOPIC, random_state=rs, max_iter=5)
        lda.fit(count_data)
        topic_sets = [set(feature_names[i] for i in topic.argsort()[:-top_k-1:-1])
                     for topic in lda.components_]
        runs.append(topic_sets)

    # Average Jaccard across all pairs
    scores = [sum(jaccard_similarity(runs[i][k], runs[j][k]) for k in range(n_topics)) / n_topics
             for i, j in combinations(range(len(runs)), 2)]
    return sum(scores) / len(scores) if scores else 0.0


# --- Storage for results ---
results_Analysis = []
results_coherence = []
results = []
for name, func in pipelines.items():
    print("\n" + "="*30 + " Pipeline " + "="*30)
    print(f"\nRunning pipeline: {name}")
    start_time = time.time()
    process = psutil.Process()

    # Apply preprocessing
    df_variant = df.copy()
    df_variant['clean_review'] = df_variant['review'].apply(func)
    print("\n Finish pre-processing")

    # --- Sentiment Analysis ---
    tfidf_vectorizer_variant = TfidfVectorizer(max_df=0.95, min_df=5)
    X = tfidf_vectorizer_variant.fit_transform(df_variant['clean_review'])
    X_train, X_test, y_train, y_test = train_test_split(X, df_variant['sentiment'], test_size=0.2, random_state=42)
    
    # Train a NEW model for each variant
    model_variant = LogisticRegression(max_iter=1000)
    model_variant.fit(X_train, y_train)
    y_pred = model_variant.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='positive')
    recall = recall_score(y_test, y_pred, pos_label='positive')
    f1 = f1_score(y_test, y_pred, pos_label='positive')

    X_train, X_test, y_train, y_test = train_test_split(
    tfidf_data, df['sentiment'], test_size=0.2, random_state=42
    )
    print("\n Finish sentiment analysis")
    results_Analysis.append({
        "Pipeline": name,
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1-score": f1,
    })

# Topic Modeling
    count_vectorizer_variant = CountVectorizer(max_df=0.95, min_df=5)
    if name == "Unigrams + Bigrams":
        count_vectorizer_variant = CountVectorizer(max_df=0.95, min_df=5, ngram_range=(1,2))
    tfidf_vectorizer_variant = TfidfVectorizer(max_df=0.95, min_df=5)
    if name == "Unigrams + Bigrams":
        tfidf_vectorizer_variant = TfidfVectorizer(max_df=0.95, min_df=5, ngram_range=(1,2))

    count_data_variant = count_vectorizer_variant.fit_transform(df_variant['clean_review'])
    tfidf_data_variant = tfidf_vectorizer_variant.fit_transform(df_variant['clean_review'])

    lda_variant = LatentDirichletAllocation(n_components=N_TOPIC, random_state=42)
    lda_variant.fit(count_data_variant)

    nmf_variant = NMF(n_components=N_TOPIC, random_state=42, max_iter=600)
    nmf_variant.fit(tfidf_data_variant)

    feature_names_variant = count_vectorizer_variant.get_feature_names_out()
    lda_topics_variant = extract_topics(lda_variant, count_vectorizer_variant)
    nmf_topics_variant = extract_topics(nmf_variant, tfidf_vectorizer_variant)

    # Create the tokenized list of lists needed for CoherenceModel
    processed_texts = [text.split() for text in df['clean_review']]
    # Create a Gensim Dictionary from our tokenized texts
    dictionary = Dictionary(processed_texts)

    lda_coherence = topic_coherence(lda_topics_variant, processed_texts, dictionary)
    nmf_coherence = topic_coherence(nmf_topics_variant, processed_texts, dictionary)
    print("\n Finish coherence model")
    # --- Topic Modeling ---
    results_coherence.append({
        "Pipeline": name,
        "lda_coherence": lda_coherence,
        "nmf_coherence": nmf_coherence,
    })

    ## ------ Coherence --------
    coherence = 0
    if nmf_coherence > lda_coherence:
        coherence = nmf_coherence
    elif lda_coherence > nmf_coherence:
        coherence = lda_coherence

    ## ------ Diversity --------
    diversity = topic_diversity(lda_topics_variant)

    ## ------ Stability --------
    stability = compute_stability(count_data_variant, feature_names_variant)
    
    ## ------ Run Time --------
    runtime = time.time() - start_time
    
    ## ------ Memory Usage --------
    memory_used = process.memory_info().rss / (1024*1024)

    print("\n Finish train Model")

    results.append({
        "Pipeline": name,
        "F1": f1,
        "Coherence": coherence,
        "Diversity": diversity,
        "Stability": stability,
        "Runtime (s)": runtime,
        "Memory (MB)": memory_used
    })

# --- Summarize results ---
results_df = pd.DataFrame(results)
results_Analysis_df = pd.DataFrame(results_Analysis)
results_coherence_df = pd.DataFrame(results_coherence)
print ("\nSentiment Analysis table")
print (results_Analysis_df)
print ("\n Coherence Table")
print (results_coherence_df)



Running pipeline: Lemmatization + Remove StopWords

 Finish pre-processing

 Finish sentiment analysis

 Finish coherence model

 Finish train Model


Running pipeline: Stemming + Keep StopWords

 Finish pre-processing

 Finish sentiment analysis

 Finish coherence model

 Finish train Model


Running pipeline: Unigrams + Bigrams

 Finish pre-processing

 Finish sentiment analysis

 Finish coherence model

 Finish train Model

Sentiment Analysis table
                           Pipeline  Accuracy    Recall  Precision  F1-score
0  Lemmatization + Remove StopWords     0.874  0.905095   0.852305  0.877907
1         Stemming + Keep StopWords     0.871  0.900100   0.850803  0.874757
2                Unigrams + Bigrams     0.877  0.908092   0.855127  0.880814

 Coherence Table
                           Pipeline  lda_coherence  nmf_coherence
0  Lemmatization + Remove StopWords       0.389678       0.469617
1         Stemming + Keep StopWords       0.177750       0.242114
2                U

## PART 6 — PREPROCESSING QUALITY INDEX (PQI)

In [17]:

# Import Min-max Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

#Create a new dataframe for normalized values
df_norm = pd.DataFrame()

# F1, Coherence, etc., are 'good': higher values are better.
good_cols = ['F1', 'Coherence', 'Diversity', 'Stability']

# Scale the 'good' columns (0=worst, 1=best)
df_norm[good_cols] = scaler.fit_transform(results_df[good_cols])

# Runtime and Memory are 'costs': lower values are better.
bad_cols = ['Runtime (s)', 'Memory (MB)']

# Scale the 'bad' columns (0=worst, 1=best)
df_norm[bad_cols] = 1 - scaler.fit_transform(results_df[bad_cols])

# PQI formula
results_df["PQI"] = (
    0.30 * df_norm["F1"] +

    0.25 * df_norm["Coherence"] +
    0.15 * df_norm["Diversity"] +
    0.15 * df_norm["Stability"] -
    0.10 * df_norm["Runtime (s)"] -  
    0.05 * df_norm["Memory (MB)"]  
)

best_pipeline = results_df.loc[results_df["PQI"].idxmax(), "Pipeline"]

print("\nFinal Results with PQI:")
print(results_df.round(4))
print(f"\nBest Pipeline based on PQI: {best_pipeline}")


Final Results with PQI:
                           Pipeline      F1  Coherence  Diversity  Stability  \
0  Lemmatization + Remove StopWords  0.8779     0.4696       0.70     0.2213   
1         Stemming + Keep StopWords  0.8748     0.2421       0.40     0.5612   
2                Unigrams + Bigrams  0.8808     0.5339       0.52     0.4086   

   Runtime (s)  Memory (MB)     PQI  
0      76.4598     475.3594  0.3509  
1     106.8254     557.0938  0.1110  
2      76.6147     846.1875  0.5932  

Best Pipeline based on PQI: Unigrams + Bigrams
