In [122]:
import pandas as pd

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sentence_transformers import SentenceTransformer

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re

#### Cleaning Reviews

In [133]:
data = pd.read_csv('balanced_data.csv')

In [135]:
tops = data[data['category'] == 'Shirts/Tops']

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].isin(["na", "n a", "n/a", "none", ""])] #remove bad values


reviews = tops['text'].tolist()
ratings = tops['rating'].astype(int).tolist()

reviews = [r for r in reviews if r.lower() not in bad_values]


In [136]:
reviews = [str(i) for i in reviews]

In [126]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"\d+", "", text)             # remove numbers
    text = text.encode("ascii", "ignore").decode()  # remove emojis
    return text

reviews = [clean_text(r) for r in reviews]

#### Topic Modeling with BERTopic

In [127]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.8,           
    ngram_range=(1,2)
)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [128]:
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [129]:
topics, probs = topic_model.fit_transform(reviews)

2025-11-23 16:31:19,527 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1001 [00:00<?, ?it/s]

2025-11-23 16:34:17,335 - BERTopic - Embedding - Completed ✓
2025-11-23 16:34:17,338 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 16:34:26,666 - BERTopic - Dimensionality - Completed ✓
2025-11-23 16:34:26,681 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 16:39:35,991 - BERTopic - Cluster - Completed ✓
2025-11-23 16:39:36,016 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 16:39:38,736 - BERTopic - Representation - Completed ✓


In [130]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-11-23 16:39:41,782 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 16:39:42,004 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 16:39:44,027 - BERTopic - Representation - Completed ✓
2025-11-23 16:39:44,036 - BERTopic - Topic reduction - Reduced number of topics from 331 to 30


In [131]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/1001 [00:00<?, ?it/s]

2025-11-23 16:42:36,285 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-23 16:42:36,397 - BERTopic - Dimensionality - Completed ✓
2025-11-23 16:42:36,397 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-23 16:42:37,517 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-23 16:45:07,157 - BERTopic - Probabilities - Completed ✓
2025-11-23 16:45:07,157 - BERTopic - Cluster - Completed ✓


In [132]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

ValueError: Length of values (32032) does not match length of index (34082)

#### Add Sentiment Score with VADER

In [None]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
tops['sentiment'] = tops['text'].apply(lambda x: sia.polarity_scores(str(x))["compound"])

#### Creating DataFrame with Probability Distribution for Modeling

In [None]:
df = pd.DataFrame()
df['rating'] = tops['rating']
df['topic_id'] = topics
df['sentiment'] = tops['sentiment']

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [None]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,9,0.6918,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,1,0.8357,1.407189e-01,8.188058e-02,2.226152e-02,1.450290e-02,1.747066e-02,9.261023e-03,4.040478e-03,...,3.242651e-03,1.977243e-03,7.956023e-04,8.253871e-04,3.012792e-04,5.731639e-04,2.557884e-04,4.783740e-04,6.327596e-04,1.348677e-03
2,3.0,-1,0.0129,5.938534e-01,5.307139e-02,9.089929e-129,5.536364e-129,1.629798e-02,1.386644e-129,4.936046e-130,...,5.785402e-130,3.532178e-130,1.339967e-130,1.383894e-130,2.658912e-131,5.621593e-131,2.418055e-131,3.977161e-131,1.159283e-130,1.425331e-130
3,1.0,16,0.4215,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,1.0,5,-0.8079,1.589250e-09,6.833651e-10,2.607126e-10,2.866307e-10,1.906143e-10,9.216001e-01,8.747127e-11,...,2.242550e-10,9.370864e-11,2.350942e-11,1.147031e-11,5.396873e-12,1.180296e-11,3.979662e-12,8.465927e-12,2.193662e-11,1.530323e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34077,3.0,0,0.0000,9.320462e-01,1.771362e-02,6.868606e-03,5.126198e-03,5.285674e-03,3.002853e-03,1.734204e-03,...,1.165273e-03,7.011363e-04,2.889388e-04,3.775342e-04,1.332665e-04,2.167131e-04,1.260888e-04,1.758986e-04,2.284043e-04,2.967454e-04
34078,1.0,1,-0.4588,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
34079,1.0,14,0.0000,9.033152e-02,3.944086e-02,1.609102e-02,1.304207e-02,1.068483e-02,2.149892e-02,5.759583e-03,...,6.247752e-03,3.470283e-03,9.628476e-04,6.650818e-04,4.172307e-04,7.998477e-04,3.188832e-04,6.576810e-04,1.122466e-03,8.797068e-04
34080,1.0,0,-0.4215,6.755078e-01,9.693865e-03,3.593326e-03,2.485474e-03,2.695232e-03,1.664804e-03,1.136906e-03,...,6.262553e-04,3.779877e-04,1.508980e-04,2.013908e-04,9.219208e-05,1.429136e-04,9.088718e-05,1.325760e-04,1.257210e-04,1.701701e-04


In [None]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [None]:
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,9,0.6918,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,1,0.8357,1.407189e-01,8.188058e-02,2.226152e-02,1.450290e-02,1.747066e-02,9.261023e-03,4.040478e-03,...,3.242651e-03,1.977243e-03,7.956023e-04,8.253871e-04,3.012792e-04,5.731639e-04,2.557884e-04,4.783740e-04,6.327596e-04,1.348677e-03
3,1.0,16,0.4215,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,1.0,5,-0.8079,1.589250e-09,6.833651e-10,2.607126e-10,2.866307e-10,1.906143e-10,9.216001e-01,8.747127e-11,...,2.242550e-10,9.370864e-11,2.350942e-11,1.147031e-11,5.396873e-12,1.180296e-11,3.979662e-12,8.465927e-12,2.193662e-11,1.530323e-11
6,3.0,0,0.6369,9.524698e-01,3.504301e-16,3.650877e-17,2.437965e-17,6.270424e-17,1.292472e-17,6.621524e-18,...,5.188340e-18,3.170885e-18,1.203477e-18,1.355813e-18,4.641004e-19,8.230099e-19,4.265775e-19,6.832141e-19,1.038787e-18,1.314411e-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34077,3.0,0,0.0000,9.320462e-01,1.771362e-02,6.868606e-03,5.126198e-03,5.285674e-03,3.002853e-03,1.734204e-03,...,1.165273e-03,7.011363e-04,2.889388e-04,3.775342e-04,1.332665e-04,2.167131e-04,1.260888e-04,1.758986e-04,2.284043e-04,2.967454e-04
34078,1.0,1,-0.4588,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
34079,1.0,14,0.0000,9.033152e-02,3.944086e-02,1.609102e-02,1.304207e-02,1.068483e-02,2.149892e-02,5.759583e-03,...,6.247752e-03,3.470283e-03,9.628476e-04,6.650818e-04,4.172307e-04,7.998477e-04,3.188832e-04,6.576810e-04,1.122466e-03,8.797068e-04
34080,1.0,0,-0.4215,6.755078e-01,9.693865e-03,3.593326e-03,2.485474e-03,2.695232e-03,1.664804e-03,1.136906e-03,...,6.262553e-04,3.779877e-04,1.508980e-04,2.013908e-04,9.219208e-05,1.429136e-04,9.088718e-05,1.325760e-04,1.257210e-04,1.701701e-04


#### Training and Testing the Model

In [None]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.2935697432787452
R-squared: 0.32440365479650346


#### Results

In [None]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [None]:
coef_df

Unnamed: 0,feature,coefficient
26,topic_prob_25,1.572211
0,sentiment,1.528284
9,topic_prob_8,1.196692
13,topic_prob_12,1.0974
24,topic_prob_23,1.04144
7,topic_prob_6,0.66182
8,topic_prob_7,0.583057
11,topic_prob_10,0.514875
20,topic_prob_19,0.384635
27,topic_prob_26,0.145939


In [None]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    else:
        topic = 'sentiment'
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    

25_time___ affects rating by 1.572210727679721
sentiment affects rating by 1.5282843678662026
8_love_got_great_time affects rating by 1.1966917695964663
12_product_item_great_good affects rating by 1.0973995892049186
23_product_way__ affects rating by 1.0414396666116854
6_ok_good_perfect_great affects rating by 0.6618196191638699
7_color_material_ok_small affects rating by 0.5830567354080283
10_cute_great_color_looks affects rating by 0.5148747172072277
19_expected_wasnt_bad_ok affects rating by 0.3846348435117261
26_bad_looks_little_quality affects rating by 0.14593904821042042
21_day_right_product_work affects rating by -0.03587161518180883
16_great_good_pretty_perfect affects rating by -0.06434630975246253
3_cute_small_little_way affects rating by -0.09645232219952304
9_work_day_dont_great affects rating by -0.12811173560014133
15_looked_dont_did_didnt affects rating by -0.13636007019084562
5_quality_bad_good_product affects rating by -0.16529747865495006
0_fit_small_im_love affects

In [None]:
topic_model.get_topic_info(25)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,25,21,25_time___,"[time, , , , , , , , , ]","[nan, na at this time, nan]"
