In [1]:
import pandas as pd
import numpy as np

from bertopic import BERTopic

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re




#### Cleaning Reviews

In [2]:
data = pd.read_csv('balanced_data.csv')

In [3]:
tops = data[data['category'] == 'Shirts/Tops']

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].isin(["na", "n a", "n/a", "none", ""])] #remove bad values


In [4]:
from langdetect import detect, DetectorFactory

# fix randomness for reproducibility
DetectorFactory.seed = 0

def is_english(text):
    """Return True if the text is English, False otherwise."""
    try:
        return detect(text) == 'en'
    except:
        return False

tops['is_english'] = tops['text'].apply(is_english)

tops = tops[tops['is_english']] #remove non-english reviews
tops.shape

(31058, 8)

In [5]:
reviews = tops['text'].tolist()
reviews = [str(i) for i in reviews]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"\d+", "", text)             # remove numbers
    text = text.encode("ascii", "ignore").decode()  # remove emojis
    return text

reviews = [clean_text(r) for r in reviews]

#### Topic Modeling with BERTopic

In [6]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.80,           
    ngram_range=(1,2)
)

In [7]:
topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [8]:
topics, probs = topic_model.fit_transform(reviews)

2025-12-03 14:09:27,330 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/971 [00:00<?, ?it/s]

2025-12-03 14:27:15,942 - BERTopic - Embedding - Completed ✓
2025-12-03 14:27:15,942 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-03 14:27:39,273 - BERTopic - Dimensionality - Completed ✓
2025-12-03 14:27:39,273 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-03 14:35:34,878 - BERTopic - Cluster - Completed ✓
2025-12-03 14:35:35,005 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-03 14:35:41,703 - BERTopic - Representation - Completed ✓


In [9]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-12-03 14:35:47,862 - BERTopic - Topic reduction - Reducing number of topics
2025-12-03 14:35:48,294 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-03 14:35:52,195 - BERTopic - Representation - Completed ✓
2025-12-03 14:35:52,211 - BERTopic - Topic reduction - Reduced number of topics from 285 to 30


In [None]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/971 [00:00<?, ?it/s]

In [None]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11397,-1_size_fit_small_wear,"[size, fit, small, wear, ordered, im, color, g...",[i purchased these in the same size i ordered ...
1,0,10088,0_wear_cute_love_fit,"[wear, cute, love, fit, im, size, ordered, gre...",[i recieved this item after a long day instead...
2,1,3184,1_small_size_large_ordered,"[small, size, large, ordered, way, fit, big, s...",[very small ordered regular size and it was wa...
3,2,1418,2_fit_quality_good_great,"[fit, quality, good, great, little, disappoint...",[soft fabric good fit great customer service ...
4,3,833,3_cute_small_fit_little,"[cute, small, fit, little, really, great, size...","[its cute but it keeps coming apart, cute but ..."
5,4,675,4_product_work_long_dont,"[product, work, long, dont, use, day, way, lit...",[unbelievable you would think that a company t...
6,5,604,5_color_received_ordered_disappointed,"[color, received, ordered, disappointed, look,...",[the color called latte is not even close i w...
7,6,487,6_wear_looking_fit_really,"[wear, looking, fit, really, im, great, going,...",[i ordered compression shirts all of them the...
8,7,372,7_quality_product_price_good,"[quality, product, price, good, great, worth, ...","[good product and great price, arrived promptl..."
9,8,303,8_quality_item_money_look,"[quality, item, money, look, sure, poorly, bet...",[not same item in picture poor quality do not ...


#### Add Sentiment Score with VADER

In [None]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
tops['sentiment'] = tops['text'].apply(lambda x: sia.polarity_scores(str(x))["compound"])

#### Creating DataFrame with Probability Distribution for Modeling

In [None]:
df = pd.DataFrame()
df['rating'] = tops['rating']
df['topic_id'] = topics
df['sentiment'] = tops['sentiment']

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [None]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,3.0,-1,0.5927,1.843874e-03,7.554332e-04,6.561636e-04,2.283162e-04,1.936477e-04,3.629805e-01,1.390304e-05,...,4.925427e-05,3.827573e-05,4.986757e-05,1.124973e-05,2.282810e-05,9.169973e-06,1.562255e-05,2.669370e-05,9.143313e-06,1.089304e-05
1,4.0,0,0.7814,6.580047e-01,1.711531e-02,1.096351e-02,4.928883e-03,4.177793e-03,4.733760e-03,1.300713e-03,...,7.845620e-04,7.116773e-04,1.079291e-03,3.559952e-04,4.176804e-04,5.199186e-04,3.468462e-04,3.821460e-04,3.388010e-04,3.394231e-04
2,4.0,0,0.5994,1.444800e-01,4.433897e-02,3.272275e-02,1.350556e-02,1.149493e-02,1.262053e-02,3.632340e-03,...,2.254966e-03,1.934102e-03,2.680035e-03,1.047766e-03,1.266431e-03,2.500307e-03,1.007887e-03,1.122510e-03,9.983642e-04,9.956545e-04
3,3.0,1,0.9574,8.879109e-19,9.567266e-01,2.588722e-19,1.768635e-19,8.031729e-20,1.039940e-19,2.819138e-20,...,1.704281e-20,2.331703e-20,1.765493e-20,7.823178e-21,9.046817e-21,8.311599e-21,8.417153e-21,8.392151e-21,7.840417e-21,7.630708e-21
4,5.0,0,0.5346,1.088721e-01,2.618823e-02,1.925668e-02,6.929436e-03,8.633063e-03,9.383899e-03,2.957779e-03,...,1.238681e-03,1.606059e-03,1.402855e-03,5.569428e-04,6.389679e-04,6.693013e-04,5.467197e-04,7.812978e-04,5.267567e-04,5.442721e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31053,3.0,5,0.0000,4.174331e-05,1.449380e-05,1.153704e-05,4.191070e-06,4.066451e-06,8.620023e-01,9.227834e-07,...,8.503311e-07,7.584706e-07,8.814185e-07,3.468753e-07,3.989037e-07,2.970358e-07,3.519682e-07,4.830704e-07,3.143573e-07,3.409149e-07
31054,4.0,2,0.8172,1.143017e-01,9.797981e-02,9.769343e-02,2.535855e-02,1.176670e-02,1.702679e-02,1.849545e-03,...,3.716574e-03,2.428499e-03,3.546589e-03,9.543578e-04,2.010010e-03,9.165034e-04,1.266602e-03,1.326253e-03,9.728741e-04,9.111125e-04
31055,5.0,3,0.5106,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
31056,3.0,2,0.0000,1.130326e-01,1.045018e-01,3.741590e-01,3.487885e-02,1.517259e-02,1.652969e-02,2.664462e-04,...,6.102569e-03,2.393360e-03,6.208422e-03,3.405339e-04,3.531095e-03,3.617075e-04,7.241145e-04,1.445746e-03,2.705701e-04,3.245158e-04


In [None]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [None]:
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
1,4.0,0,0.7814,6.580047e-01,0.017115,1.096351e-02,4.928883e-03,4.177793e-03,4.733760e-03,1.300713e-03,...,7.845620e-04,7.116773e-04,1.079291e-03,3.559952e-04,4.176804e-04,5.199186e-04,3.468462e-04,3.821460e-04,3.388010e-04,3.394231e-04
2,4.0,0,0.5994,1.444800e-01,0.044339,3.272275e-02,1.350556e-02,1.149493e-02,1.262053e-02,3.632340e-03,...,2.254966e-03,1.934102e-03,2.680035e-03,1.047766e-03,1.266431e-03,2.500307e-03,1.007887e-03,1.122510e-03,9.983642e-04,9.956545e-04
3,3.0,1,0.9574,8.879109e-19,0.956727,2.588722e-19,1.768635e-19,8.031729e-20,1.039940e-19,2.819138e-20,...,1.704281e-20,2.331703e-20,1.765493e-20,7.823178e-21,9.046817e-21,8.311599e-21,8.417153e-21,8.392151e-21,7.840417e-21,7.630708e-21
4,5.0,0,0.5346,1.088721e-01,0.026188,1.925668e-02,6.929436e-03,8.633063e-03,9.383899e-03,2.957779e-03,...,1.238681e-03,1.606059e-03,1.402855e-03,5.569428e-04,6.389679e-04,6.693013e-04,5.467197e-04,7.812978e-04,5.267567e-04,5.442721e-04
5,4.0,0,0.9067,1.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31050,1.0,1,-0.5256,5.465951e-02,0.039477,1.573322e-02,6.901579e-03,4.946890e-03,6.299302e-03,1.767216e-03,...,1.038365e-03,1.314522e-03,1.059079e-03,4.827816e-04,5.482938e-04,5.055822e-04,5.176976e-04,5.144345e-04,4.826471e-04,4.727237e-04
31053,3.0,5,0.0000,4.174331e-05,0.000014,1.153704e-05,4.191070e-06,4.066451e-06,8.620023e-01,9.227834e-07,...,8.503311e-07,7.584706e-07,8.814185e-07,3.468753e-07,3.989037e-07,2.970358e-07,3.519682e-07,4.830704e-07,3.143573e-07,3.409149e-07
31054,4.0,2,0.8172,1.143017e-01,0.097980,9.769343e-02,2.535855e-02,1.176670e-02,1.702679e-02,1.849545e-03,...,3.716574e-03,2.428499e-03,3.546589e-03,9.543578e-04,2.010010e-03,9.165034e-04,1.266602e-03,1.326253e-03,9.728741e-04,9.111125e-04
31055,5.0,3,0.5106,0.000000e+00,0.000000,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


#### Linear Regression

In [None]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.3207828528187096
R-squared: 0.2993688501202626


In [None]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [None]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    else:
        topic = 'sentiment'
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    

9_loved_love_purchased_day affects rating by 1.470498090712023
sentiment affects rating by 1.4483168411001353
27_love_need_received_looking affects rating by 1.4310371050629915
24_work_wear_perfect_make affects rating by 1.232619733287796
16_wanted_looking_did_make affects rating by 1.0131553252539331
15_perfect_great_good_use affects rating by 0.7881350346272384
3_cute_small_fit_little affects rating by 0.6964749709492056
7_quality_product_price_good affects rating by 0.6719617395643437
25_pay_worth_use_purchased affects rating by 0.5751980022680041
11_loved_look_use_great affects rating by 0.5228422288360353
12_use_time_money_good affects rating by 0.31202322649318326
14_dont_use_good_fit affects rating by 0.25885599114849467
2_fit_quality_good_great affects rating by 0.18749074852615139
10_day_look_pretty_wearing affects rating by 0.1835249268733291
0_wear_cute_love_fit affects rating by 0.10826663737963861
17_buy_purchase_order_item affects rating by 0.04699615027343916
28_price_bu

In [None]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        rep = topic_model.get_representative_docs(i-1)
        print(f'{topic}: {rep}')

9_loved_love_purchased_day: ['got as a christmas gift for my friend and she loved it', 'gift for my son and he loved it', 'i bought this for my  yr old grandson and he loved it']
27_love_need_received_looking: ['love it got a lot of compliments', 'love it i get lots of compliments', 'love it have gotten so many compliments']
24_work_wear_perfect_make: ['thin and see through didnt look or feel like a linencotton shirt dress opened and returned maybe it would work as a beach cover up but not for everyday wear', 'perfect beach wear for your little  great sunprotection fits like it should', 'more like a tunic or long shirt which i can wear with jeans and boots or jeans and sandals this may alos work as a beach coverup besides being short it fits well and is flattering on my curvy figure']
16_wanted_looking_did_make: ['just what i wanted for my trip', 'it was just what he wanted', 'just what i wanted for my trip']
15_perfect_great_good_use: ['i got these to wear under scrubs now that its ge

#### Random Forest

In [None]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.367861104719162
R-squared: 0.2743954113048289


In [None]:
importances = rf.feature_importances_

sentiment = ['sentiment'] #first feature

n_topics = topic_model.get_topic_info().shape[0]-1

probs_names = [
    topic_model.get_topic_info().loc[i+1, 'Name']
    for i in range(n_topics)
]

topic_names = sentiment + probs_names

for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4657
0_wear_cute_love_fit: 0.0486
1_small_size_large_ordered: 0.0370
6_wear_looking_fit_really: 0.0338
20_small_way_product_quality: 0.0252
2_fit_quality_good_great: 0.0248
3_cute_small_fit_little: 0.0244
24_work_wear_perfect_make: 0.0241
8_quality_item_money_look: 0.0229
9_loved_love_purchased_day: 0.0227
4_product_work_long_dont: 0.0226
5_color_received_ordered_disappointed: 0.0211
15_perfect_great_good_use: 0.0196
26_used_come_time_quality: 0.0177
13_poorly_quality_product_buy: 0.0173
7_quality_product_price_good: 0.0154
14_dont_use_good_fit: 0.0153
21_doesnt_day_wish_pay: 0.0141
12_use_time_money_good: 0.0133
27_love_need_received_looking: 0.0131
16_wanted_looking_did_make: 0.0120
11_loved_look_use_great: 0.0119
10_day_look_pretty_wearing: 0.0115
23_disappointed_poorly_come_long: 0.0112
17_buy_purchase_order_item: 0.0097
25_pay_worth_use_purchased: 0.0095
19_day_use_poorly_used: 0.0092
18_use_dont_used_pretty: 0.0089
28_price_buy_money_want: 0.0087
22_time_work_day_doe

#### XGBoost

In [None]:
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6
)

xgb_model.fit(X_train, y_train)

# Make predictions on test set
xgb_preds = xgb_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, xgb_preds)
r2 = r2_score(y_test, xgb_preds)

print("XGBoost Model Performance:")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

XGBoost Model Performance:
MSE: 1.2578
R² Score: 0.3328


In [None]:
importances = rf.feature_importances_

sentiment = ['sentiment'] #first feature

n_topics = topic_model.get_topic_info().shape[0]-1

probs_names = [
    topic_model.get_topic_info().loc[i+1, 'Name']
    for i in range(n_topics)
]

topic_names = sentiment + probs_names

for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4657
0_wear_cute_love_fit: 0.0486
1_small_size_large_ordered: 0.0370
6_wear_looking_fit_really: 0.0338
20_small_way_product_quality: 0.0252
2_fit_quality_good_great: 0.0248
3_cute_small_fit_little: 0.0244
24_work_wear_perfect_make: 0.0241
8_quality_item_money_look: 0.0229
9_loved_love_purchased_day: 0.0227
4_product_work_long_dont: 0.0226
5_color_received_ordered_disappointed: 0.0211
15_perfect_great_good_use: 0.0196
26_used_come_time_quality: 0.0177
13_poorly_quality_product_buy: 0.0173
7_quality_product_price_good: 0.0154
14_dont_use_good_fit: 0.0153
21_doesnt_day_wish_pay: 0.0141
12_use_time_money_good: 0.0133
27_love_need_received_looking: 0.0131
16_wanted_looking_did_make: 0.0120
11_loved_look_use_great: 0.0119
10_day_look_pretty_wearing: 0.0115
23_disappointed_poorly_come_long: 0.0112
17_buy_purchase_order_item: 0.0097
25_pay_worth_use_purchased: 0.0095
19_day_use_poorly_used: 0.0092
18_use_dont_used_pretty: 0.0089
28_price_buy_money_want: 0.0087
22_time_work_day_doe