In [32]:
import pandas as pd

from bertopic import BERTopic

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sentence_transformers import SentenceTransformer

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re

#### Cleaning Reviews

In [33]:
data = pd.read_csv('balanced_data.csv')

In [34]:
tops = data[data['category'] == 'Shirts/Tops']

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].isin(["na", "n a", "n/a", "none", ""])] #remove bad values


In [35]:
from langdetect import detect, DetectorFactory

# fix randomness for reproducibility
DetectorFactory.seed = 0

def is_english(text):
    """Return True if the text is English, False otherwise."""
    try:
        return detect(text) == 'en'
    except:
        return False

tops['is_english'] = tops['text'].apply(is_english)

tops = tops[tops['is_english']] #remove non-english reviews
tops.shape

(30907, 8)

In [36]:
reviews = tops['text'].tolist()
reviews = [str(i) for i in reviews]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"\d+", "", text)             # remove numbers
    text = text.encode("ascii", "ignore").decode()  # remove emojis
    return text

reviews = [clean_text(r) for r in reviews]

#### Topic Modeling with BERTopic

In [37]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.8,           
    ngram_range=(1,2)
)

In [38]:
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [39]:
topics, probs = topic_model.fit_transform(reviews)

2025-11-23 23:39:44,736 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 23:44:56,300 - BERTopic - Embedding - Completed ✓
2025-11-23 23:44:56,300 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 23:45:13,497 - BERTopic - Dimensionality - Completed ✓
2025-11-23 23:45:13,504 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 23:52:00,662 - BERTopic - Cluster - Completed ✓
2025-11-23 23:52:00,703 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 23:52:02,702 - BERTopic - Representation - Completed ✓


In [40]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-11-23 23:52:04,516 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 23:52:04,660 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 23:52:05,969 - BERTopic - Representation - Completed ✓
2025-11-23 23:52:05,998 - BERTopic - Topic reduction - Reduced number of topics from 299 to 30


In [41]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 23:54:12,227 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-23 23:54:12,359 - BERTopic - Dimensionality - Completed ✓
2025-11-23 23:54:12,359 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-23 23:54:14,149 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-23 23:58:11,301 - BERTopic - Probabilities - Completed ✓
2025-11-23 23:58:11,310 - BERTopic - Cluster - Completed ✓


In [43]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

#### Add Sentiment Score with VADER

In [44]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [45]:
tops['sentiment'] = tops['text'].apply(lambda x: sia.polarity_scores(str(x))["compound"])

In [46]:
tops.shape

(30907, 11)

#### Creating DataFrame with Probability Distribution for Modeling

In [47]:
df = pd.DataFrame()
df['rating'] = tops['rating']
df['topic_id'] = topics
df['sentiment'] = tops['sentiment']

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [48]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,8,0.6918,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,3,0.8357,0.113984,6.467506e-02,8.347650e-02,5.294157e-02,1.968119e-02,2.120139e-02,2.572082e-02,...,4.353139e-03,3.125831e-03,1.191621e-03,1.459706e-03,8.332692e-04,9.984247e-04,7.329726e-04,1.153996e-03,2.181929e-03,1.219449e-03
2,3.0,-1,0.0129,0.805020,2.013661e-06,6.582338e-09,5.155350e-10,3.494502e-10,8.818708e-10,2.858562e-10,...,5.907253e-11,4.878661e-11,1.653745e-11,2.926498e-11,1.044749e-11,1.031631e-11,7.195607e-12,1.517155e-11,3.918538e-11,1.543409e-11
3,1.0,0,0.4215,0.209143,5.601512e-02,4.810488e-02,2.970834e-02,2.199772e-02,1.686403e-02,1.546442e-02,...,5.183287e-03,4.032245e-03,1.711953e-03,1.815012e-03,1.353154e-03,1.486515e-03,1.099363e-03,1.404170e-03,2.231416e-03,1.408521e-03
4,1.0,-1,-0.8079,0.000015,6.929839e-06,6.110864e-06,3.541038e-06,3.856594e-06,2.293256e-06,1.771516e-06,...,9.248949e-06,3.783159e-07,2.719872e-07,2.037991e-07,1.745043e-07,1.999559e-07,1.667780e-07,9.166146e-06,1.902402e-07,2.268971e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,3.0,0,0.0000,1.000000,1.592156e-14,1.230758e-14,7.491886e-15,5.560434e-15,4.238402e-15,3.624251e-15,...,1.355844e-15,9.904251e-16,4.016844e-16,4.216634e-16,3.468094e-16,3.800522e-16,2.930836e-16,3.574162e-16,5.391210e-16,3.559043e-16
30903,1.0,2,-0.4588,0.000284,1.310679e-04,8.446099e-01,7.152705e-05,3.897395e-05,4.424108e-05,3.962043e-05,...,9.530853e-06,7.555607e-06,2.620942e-06,4.114272e-06,2.114514e-06,2.247559e-06,1.864749e-06,2.496291e-06,4.388135e-06,2.640735e-06
30904,1.0,7,0.0000,0.085809,3.992891e-02,3.477086e-02,1.939252e-02,1.564730e-02,1.250312e-02,9.863981e-03,...,7.333703e-03,2.105879e-03,1.304028e-03,1.119877e-03,1.180466e-03,1.397501e-03,1.408159e-03,2.029665e-03,1.054056e-03,2.568623e-03
30905,1.0,0,-0.4215,0.438110,3.187120e-02,2.891664e-02,1.519680e-02,1.084456e-02,9.266790e-03,7.612027e-03,...,2.768034e-03,2.237065e-03,8.037939e-04,9.831679e-04,7.353669e-04,7.791168e-04,6.961211e-04,7.290450e-04,1.110968e-03,7.391793e-04


In [49]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [50]:
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,8,0.6918,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,3,0.8357,0.113984,6.467506e-02,8.347650e-02,5.294157e-02,1.968119e-02,2.120139e-02,2.572082e-02,...,4.353139e-03,3.125831e-03,1.191621e-03,1.459706e-03,8.332692e-04,9.984247e-04,7.329726e-04,1.153996e-03,2.181929e-03,1.219449e-03
3,1.0,0,0.4215,0.209143,5.601512e-02,4.810488e-02,2.970834e-02,2.199772e-02,1.686403e-02,1.546442e-02,...,5.183287e-03,4.032245e-03,1.711953e-03,1.815012e-03,1.353154e-03,1.486515e-03,1.099363e-03,1.404170e-03,2.231416e-03,1.408521e-03
5,3.0,0,0.6369,0.898635,7.124923e-07,2.390733e-07,7.295117e-08,4.969725e-08,7.536106e-08,3.660777e-08,...,1.072418e-08,7.884656e-09,3.014684e-09,4.391573e-09,2.255533e-09,2.236285e-09,1.748678e-09,2.756073e-09,4.908241e-09,2.803968e-09
6,4.0,1,0.0516,0.119912,7.481167e-02,3.148522e-02,1.893187e-02,1.360554e-02,1.199654e-02,8.302279e-03,...,3.142439e-03,2.010733e-03,8.930101e-04,1.019955e-03,7.531042e-04,7.756186e-04,6.799786e-04,8.054770e-04,1.074368e-03,8.147843e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,3.0,0,0.0000,1.000000,1.592156e-14,1.230758e-14,7.491886e-15,5.560434e-15,4.238402e-15,3.624251e-15,...,1.355844e-15,9.904251e-16,4.016844e-16,4.216634e-16,3.468094e-16,3.800522e-16,2.930836e-16,3.574162e-16,5.391210e-16,3.559043e-16
30903,1.0,2,-0.4588,0.000284,1.310679e-04,8.446099e-01,7.152705e-05,3.897395e-05,4.424108e-05,3.962043e-05,...,9.530853e-06,7.555607e-06,2.620942e-06,4.114272e-06,2.114514e-06,2.247559e-06,1.864749e-06,2.496291e-06,4.388135e-06,2.640735e-06
30904,1.0,7,0.0000,0.085809,3.992891e-02,3.477086e-02,1.939252e-02,1.564730e-02,1.250312e-02,9.863981e-03,...,7.333703e-03,2.105879e-03,1.304028e-03,1.119877e-03,1.180466e-03,1.397501e-03,1.408159e-03,2.029665e-03,1.054056e-03,2.568623e-03
30905,1.0,0,-0.4215,0.438110,3.187120e-02,2.891664e-02,1.519680e-02,1.084456e-02,9.266790e-03,7.612027e-03,...,2.768034e-03,2.237065e-03,8.037939e-04,9.831679e-04,7.353669e-04,7.791168e-04,6.961211e-04,7.290450e-04,1.110968e-03,7.391793e-04


In [51]:
# topic_model.get_topic_info()

#### Linear Regression

In [52]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.2908376089545026
R-squared: 0.31015978269920974


In [55]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [56]:
# coef_df

In [57]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    else:
        topic = 'sentiment'
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    

26_perfect_way_bad_ affects rating by 1.926369467339854
24_beautiful_received_time_doesnt affects rating by 1.7308022200927269
12_loved_old_bought_purchased affects rating by 1.5108096096725516
28_work_didnt_day_want affects rating by 1.4802396260784128
sentiment affects rating by 1.4311284035487088
11_described_quality_time_took affects rating by 1.3390194616925142
27_perfect_used_wouldnt_beautiful affects rating by 1.3031654959841856
6_comfortable_fits_cute_perfect affects rating by 1.0724399121691706
19_expected_wanted_described_wasnt affects rating by 0.571259906831186
3_fit_fits_expected_didnt affects rating by 0.4709108079843513
9_beautiful_look_cute_dont affects rating by 0.11565029185629849
15_big_comfortable_old_pretty affects rating by 0.10531658351763765
8_look_time_use_dont affects rating by 0.08439048563479386
0_wear_cute_fit_size affects rating by 0.05602217043507216
13_look_picture_better_cute affects rating by 0.017328943815247293
4_old_cute_small_size affects rating by

In [63]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        rep = topic_model.get_representative_docs(i-1)
        print(f'{topic}: {rep}')

26_perfect_way_bad_: ['perfect  in love', 'perfect    thanks', 'perfect and trendy']
24_beautiful_received_time_doesnt: ['i received lots of complement', 'beautiful  i get lots of compliments on this', 'beautiful get good complements on it']
12_loved_old_bought_purchased: ['my daughter loved it', 'gift for a friend he loved it', 'daughter loved it lil short for my taste personally']
28_work_didnt_day_want: ['i didnt see how it helps at all', 'ware it to work every day', 'just didnt work for me']
11_described_quality_time_took: ['product was as described and delivered on time', 'this product was exactly as described only down fall it took  weeks to receive did not come in the time frame given', 'good quality fast shipment excellent price product was exactly what was described son is very pleased']
27_perfect_used_wouldnt_beautiful: ['it was a bit seethrough so im probably just going to wear it as a beach coverup but other than that its at least somewhat like the picture', 'its okay  it 

#### Random Forest

In [59]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.2908376089545026
R-squared: 0.31015978269920974


In [60]:
importances = rf.feature_importances_

sentiment = ['sentiment'] #first feature

n_topics = topic_model.get_topic_info().shape[0]-1

probs_names = [
    topic_model.get_topic_info().loc[i+1, 'Name']
    for i in range(n_topics)
]

topic_names = sentiment + probs_names

for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4710
0_wear_cute_fit_size: 0.0480
1_small_size_ordered_large: 0.0428
14_looking_definitely_fit_wear: 0.0283
2_material_cheap_expected_quality: 0.0255
6_comfortable_fits_cute_perfect: 0.0254
3_fit_fits_expected_didnt: 0.0254
10_cheap_quality_money_bad: 0.0236
5_picture_ordered_fit_received: 0.0218
12_loved_old_bought_purchased: 0.0203
4_old_cute_small_size: 0.0190
11_described_quality_time_took: 0.0182
15_big_comfortable_old_pretty: 0.0175
16_cheaply_bad_cheap_came: 0.0162
27_perfect_used_wouldnt_beautiful: 0.0154
17_material_cheap_super_returned: 0.0153
22_came_wasnt_cheaply_wouldnt: 0.0146
20_used_comfortable_dont_way: 0.0145
9_beautiful_look_cute_dont: 0.0145
25_worth_purchase_ok_better: 0.0142
8_look_time_use_dont: 0.0137
13_look_picture_better_cute: 0.0130
24_beautiful_received_time_doesnt: 0.0126
7_like picture_picture_quality_cheap: 0.0119
23_time_return_use_old: 0.0108
18_returned_return_came_need: 0.0102
21_recommend_use_day_old: 0.0093
26_perfect_way_bad_: 0.0091
