In [1]:
import pandas as pd
import numpy as np

from bertopic import BERTopic

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re




#### Cleaning Reviews

In [2]:
data = pd.read_csv('balanced_data.csv')

In [3]:
tops = data[data['category'] == 'Shirts/Tops']

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].isin(["na", "n a", "n/a", "none", ""])] #remove bad values


In [4]:
from langdetect import detect, DetectorFactory

# fix randomness for reproducibility
DetectorFactory.seed = 0

def is_english(text):
    """Return True if the text is English, False otherwise."""
    try:
        return detect(text) == 'en'
    except:
        return False

tops['is_english'] = tops['text'].apply(is_english)

tops = tops[tops['is_english']] #remove non-english reviews
tops.shape

(31058, 8)

In [5]:
reviews = tops['text'].tolist()
reviews = [str(i) for i in reviews]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"\d+", "", text)             # remove numbers
    text = text.encode("ascii", "ignore").decode()  # remove emojis
    return text

reviews = [clean_text(r) for r in reviews]

#### Topic Modeling with BERTopic

In [6]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.8,           
    ngram_range=(1,2)
)

In [7]:
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [8]:
topics, probs = topic_model.fit_transform(reviews)

2025-12-04 08:28:49,342 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/971 [00:00<?, ?it/s]

2025-12-04 08:33:18,994 - BERTopic - Embedding - Completed ✓
2025-12-04 08:33:18,999 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-04 08:35:36,777 - BERTopic - Dimensionality - Completed ✓
2025-12-04 08:35:36,811 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-04 08:41:27,893 - BERTopic - Cluster - Completed ✓
2025-12-04 08:41:27,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-04 08:41:28,958 - BERTopic - Representation - Completed ✓


In [9]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-12-04 08:41:29,835 - BERTopic - Topic reduction - Reducing number of topics
2025-12-04 08:41:29,926 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-04 08:41:30,606 - BERTopic - Representation - Completed ✓
2025-12-04 08:41:30,607 - BERTopic - Topic reduction - Reduced number of topics from 283 to 30


In [10]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/971 [00:00<?, ?it/s]

2025-12-04 08:43:24,624 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-12-04 08:43:24,709 - BERTopic - Dimensionality - Completed ✓
2025-12-04 08:43:24,710 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-12-04 08:43:25,634 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-12-04 08:45:17,198 - BERTopic - Probabilities - Completed ✓
2025-12-04 08:45:17,199 - BERTopic - Cluster - Completed ✓


In [11]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11622,-1_fit_size_material_wear,"[fit, size, material, wear, im, color, good, l...",[what a great price love the color and the fi...
1,0,11643,0_size_large_wear_fit,"[size, large, wear, fit, fits, im, love, way, ...",[i wanted an extra small but had to opt for th...
2,1,2208,1_material_looks_good_fit,"[material, looks, good, fit, super, returned, ...","[not comfortable material, strange material an..."
3,2,1302,2_fit_fits_little_way,"[fit, fits, little, way, big, good, bit, retur...",[a little bit to big i bought this to my broth...
4,3,867,3_super_fits_fit_really,"[super, fits, fit, really, little, size, good,...",[theyre super cute but they fit a little too b...
5,4,475,4_looking_wear_fit_material,"[looking, wear, fit, material, really, wanted,...",[i really liked these tank tops material was ...
6,5,321,5_time_good_product_price,"[time, good, product, price, item, worth, look...",[have to say stars on shipping got mines real...
7,6,306,6_love_purchased_day_lot,"[love, purchased, day, lot, perfect, really, k...","[love patty boutik, absolutely love it, its be..."
8,7,298,7_look_use_price_love,"[look, use, price, love, good, looked, perfect...",[i ordered these as a gift for a coworker the...
9,8,261,8_price_good_product_cheaply,"[price, good, product, cheaply, looks, looking...","[good enough for the price paid, good product ..."


#### Add Sentiment Score with VADER

In [13]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
tops['sentiment'] = tops['text'].apply(lambda x: sia.polarity_scores(str(x))["compound"])

#### Creating DataFrame with Probability Distribution for Modeling

In [15]:
df = pd.DataFrame()
df['rating'] = tops['rating']
df['topic_id'] = topics
df['sentiment'] = tops['sentiment']

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [16]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,3.0,-1,0.5927,7.773130e-01,7.062517e-08,1.345035e-08,1.079807e-08,6.844942e-10,1.961066e-08,3.570976e-09,...,2.739232e-09,2.443315e-09,8.451914e-09,4.187065e-10,6.347734e-10,6.620677e-10,8.422881e-10,3.113996e-10,9.167766e-10,4.741955e-09
1,4.0,0,0.7814,9.587255e-01,2.711244e-20,1.212588e-20,9.058408e-21,1.864479e-21,2.833172e-21,2.826350e-21,...,2.440603e-21,1.707881e-21,2.669931e-21,5.348403e-22,6.197649e-22,5.756200e-22,6.043361e-22,4.636424e-22,6.279549e-22,6.212254e-22
2,4.0,0,0.5994,2.242834e-01,5.906110e-02,2.425017e-02,1.787909e-02,4.281905e-03,5.964453e-03,6.058525e-03,...,5.370145e-03,3.558222e-03,5.300756e-03,1.147053e-03,1.209654e-03,1.182116e-03,1.229843e-03,1.117667e-03,1.280412e-03,1.252036e-03
3,3.0,2,0.9574,1.112827e-01,3.648608e-02,4.470062e-01,1.454807e-02,2.121285e-03,3.965025e-03,4.068088e-03,...,3.884907e-03,2.623986e-03,3.482833e-03,8.299745e-04,1.531313e-03,7.945222e-04,8.731782e-04,6.414196e-04,8.893959e-04,8.393647e-04
4,5.0,0,0.5346,1.717632e-01,4.130069e-02,1.472984e-02,1.024662e-02,3.742423e-03,5.632805e-03,3.711378e-03,...,3.362864e-03,2.264453e-03,3.801870e-03,7.399096e-04,7.906111e-04,8.145052e-04,7.836451e-04,9.939508e-04,8.083472e-04,1.234185e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31053,3.0,0,0.0000,9.499836e-01,1.490692e-16,5.054359e-17,3.726337e-17,6.473972e-18,2.253405e-17,1.355614e-17,...,1.178630e-17,8.499420e-18,1.553904e-17,2.451439e-18,2.716320e-18,2.825817e-18,2.939091e-18,2.764074e-18,3.136341e-18,4.936932e-18
31054,4.0,-1,0.8172,5.516973e-99,6.064420e-03,1.687731e-03,6.773542e-04,3.369602e-106,4.945615e-83,6.386403e-04,...,1.684747e-04,5.387364e-04,1.190289e-102,5.340556e-106,3.630339e-105,3.278041e-105,1.501899e-104,1.669310e-106,7.903077e-103,1.172575e-104
31055,5.0,3,0.5106,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
31056,3.0,1,0.0000,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [17]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [18]:
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
1,4.0,0,0.7814,0.958726,2.711244e-20,1.212588e-20,9.058408e-21,1.864479e-21,2.833172e-21,2.826350e-21,...,2.440603e-21,1.707881e-21,2.669931e-21,5.348403e-22,6.197649e-22,5.756200e-22,6.043361e-22,4.636424e-22,6.279549e-22,6.212254e-22
2,4.0,0,0.5994,0.224283,5.906110e-02,2.425017e-02,1.787909e-02,4.281905e-03,5.964453e-03,6.058525e-03,...,5.370145e-03,3.558222e-03,5.300756e-03,1.147053e-03,1.209654e-03,1.182116e-03,1.229843e-03,1.117667e-03,1.280412e-03,1.252036e-03
3,3.0,2,0.9574,0.111283,3.648608e-02,4.470062e-01,1.454807e-02,2.121285e-03,3.965025e-03,4.068088e-03,...,3.884907e-03,2.623986e-03,3.482833e-03,8.299745e-04,1.531313e-03,7.945222e-04,8.731782e-04,6.414196e-04,8.893959e-04,8.393647e-04
4,5.0,0,0.5346,0.171763,4.130069e-02,1.472984e-02,1.024662e-02,3.742423e-03,5.632805e-03,3.711378e-03,...,3.362864e-03,2.264453e-03,3.801870e-03,7.399096e-04,7.906111e-04,8.145052e-04,7.836451e-04,9.939508e-04,8.083472e-04,1.234185e-03
5,4.0,0,0.9067,0.998990,2.751339e-04,1.202752e-04,8.436676e-05,2.395105e-05,3.211371e-05,2.874129e-05,...,2.541400e-05,1.722612e-05,2.631837e-05,5.346949e-06,6.225978e-06,5.757369e-06,6.013686e-06,5.204486e-06,6.202864e-06,6.869385e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31049,4.0,0,0.6360,0.855807,2.518409e-05,8.745178e-06,5.939013e-06,1.822188e-06,5.417120e-06,2.259996e-06,...,2.084893e-06,1.397320e-06,2.298664e-06,4.371322e-07,4.735645e-07,4.897277e-07,4.768212e-07,7.032884e-07,4.907369e-07,8.116953e-07
31050,1.0,2,-0.5256,0.032314,1.113358e-02,9.763641e-03,3.710069e-03,8.399113e-04,1.226438e-03,1.260197e-03,...,1.214484e-03,8.090135e-04,1.059058e-03,2.549725e-04,4.141993e-04,2.449660e-04,2.672781e-04,2.247262e-04,2.713254e-04,2.579017e-04
31053,3.0,0,0.0000,0.949984,1.490692e-16,5.054359e-17,3.726337e-17,6.473972e-18,2.253405e-17,1.355614e-17,...,1.178630e-17,8.499420e-18,1.553904e-17,2.451439e-18,2.716320e-18,2.825817e-18,2.939091e-18,2.764074e-18,3.136341e-18,4.936932e-18
31055,5.0,3,0.5106,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


#### Linear Regression

In [19]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.3111685113381264
R-squared: 0.28601708029036454


In [22]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [23]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    else:
        topic = 'sentiment'
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    

sentiment affects rating by 1.498029434769455
6_love_purchased_day_lot affects rating by 1.4171809280234988
19_wanted_looking_works_wasnt affects rating by 0.9778905567220095
17_perfect_work_use_good affects rating by 0.8475386014596661
3_super_fits_fit_really affects rating by 0.7097644713092686
8_price_good_product_cheaply affects rating by 0.6805978715446188
7_look_use_price_love affects rating by 0.5988314811750851
25_cheaply_guess_worth_purchased affects rating by 0.591729816969268
28_know_wouldnt_sure_going affects rating by 0.34292025844802215
5_time_good_product_price affects rating by 0.2397565133638945
11_different_short_material_fits affects rating by 0.23319604607060643
2_fit_fits_little_way affects rating by 0.22856838623829182
13_material_bit_little_wearing affects rating by 0.21020323027547508
1_material_looks_good_fit affects rating by 0.18772743389764587
10_use_time_good_think affects rating by 0.13621367465882125
0_size_large_wear_fit affects rating by 0.0750995706117

In [24]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        rep = topic_model.get_representative_docs(i-1)
        print(f'{topic}: {rep}')

6_love_purchased_day_lot: ['love patty boutik', 'absolutely love it', 'its beautiful and love it ']
19_wanted_looking_works_wasnt: ['just what i wanted for my trip', 'just what i wanted for my trip', 'exactly what i wanted']
17_perfect_work_use_good: ['these are so comfy and fit perfect high quality im tall and these actually rest on the top of my foot its hard to find a good pair of sweats but these are perfect ill be buying more i highly recommend', 'i love these thin wicking material very stretchy so much higher quality than expected im a c  inch waist  inch hips size s   curvyathletic the small fits perfectly  the tag says us size small if i weighed even  pounds more or was up to a d cup id still get a small bc theyre not really compression  theyre just skin tight and clingy move with you perfect for hot yoga without a bra very very similar to the  degree brand but run slightly smaller perhaps or similar probably more similar these are an amazing bargain and i may even buy more sin

#### Random Forest

In [25]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.365651888474468
R-squared: 0.2563487345765506


In [26]:
importances = rf.feature_importances_

sentiment = ['sentiment'] #first feature

n_topics = topic_model.get_topic_info().shape[0]-1

probs_names = [
    topic_model.get_topic_info().loc[i+1, 'Name']
    for i in range(n_topics)
]

topic_names = sentiment + probs_names

for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4755
0_size_large_wear_fit: 0.0519
4_looking_wear_fit_material: 0.0304
5_time_good_product_price: 0.0292
2_fit_fits_little_way: 0.0268
1_material_looks_good_fit: 0.0236
6_love_purchased_day_lot: 0.0225
3_super_fits_fit_really: 0.0217
28_know_wouldnt_sure_going: 0.0195
11_different_short_material_fits: 0.0193
26_looks like_looks_cheaply_does: 0.0188
18_bad_return_used_package: 0.0188
14_came_color_better_look: 0.0187
17_perfect_work_use_good: 0.0184
13_material_bit_little_wearing: 0.0169
10_use_time_good_think: 0.0169
23_way_poorly_product_large: 0.0162
12_day_product_work_ive: 0.0137
8_price_good_product_cheaply: 0.0136
22_use_piece_used_pretty: 0.0133
9_looked_pretty_better_looking: 0.0133
21_use_used_day_time: 0.0131
7_look_use_price_love: 0.0129
15_piece_use_fit_doesnt: 0.0127
19_wanted_looking_works_wasnt: 0.0124
16_looks_looks like_better_disappointed: 0.0122
20_returned_sent_wouldnt_product: 0.0103
24_work_know_works_time: 0.0102
27_piece_package_item_came: 0.0086
25

#### XGBoost

In [27]:
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6
)

xgb_model.fit(X_train, y_train)

# Make predictions on test set
xgb_preds = xgb_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, xgb_preds)
r2 = r2_score(y_test, xgb_preds)

print("XGBoost Model Performance:")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

XGBoost Model Performance:
MSE: 1.2640
R² Score: 0.3117


In [28]:
importances = rf.feature_importances_

sentiment = ['sentiment'] #first feature

n_topics = topic_model.get_topic_info().shape[0]-1

probs_names = [
    topic_model.get_topic_info().loc[i+1, 'Name']
    for i in range(n_topics)
]

topic_names = sentiment + probs_names

for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4755
0_size_large_wear_fit: 0.0519
4_looking_wear_fit_material: 0.0304
5_time_good_product_price: 0.0292
2_fit_fits_little_way: 0.0268
1_material_looks_good_fit: 0.0236
6_love_purchased_day_lot: 0.0225
3_super_fits_fit_really: 0.0217
28_know_wouldnt_sure_going: 0.0195
11_different_short_material_fits: 0.0193
26_looks like_looks_cheaply_does: 0.0188
18_bad_return_used_package: 0.0188
14_came_color_better_look: 0.0187
17_perfect_work_use_good: 0.0184
13_material_bit_little_wearing: 0.0169
10_use_time_good_think: 0.0169
23_way_poorly_product_large: 0.0162
12_day_product_work_ive: 0.0137
8_price_good_product_cheaply: 0.0136
22_use_piece_used_pretty: 0.0133
9_looked_pretty_better_looking: 0.0133
21_use_used_day_time: 0.0131
7_look_use_price_love: 0.0129
15_piece_use_fit_doesnt: 0.0127
19_wanted_looking_works_wasnt: 0.0124
16_looks_looks like_better_disappointed: 0.0122
20_returned_sent_wouldnt_product: 0.0103
24_work_know_works_time: 0.0102
27_piece_package_item_came: 0.0086
25