In [1]:
import pandas as pd

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sentence_transformers import SentenceTransformer

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re




#### Cleaning Reviews

In [2]:
# Load the balanced review dataset and keep only the 'Shirts/Tops' category.
data = pd.read_csv('balanced_data.csv')

# Placeholder strings that carry no review content (compared case-insensitively).
bad_values = {"na", "n a", "n/a", "none", ""}

# assign() returns a new frame, so the stripped text is not written into a
# slice of `data` (which would raise SettingWithCopyWarning).
tops = data[data['category'] == 'Shirts/Tops'].assign(
    text=lambda frame: frame['text'].str.strip()
)
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
# Use the shared `bad_values` set instead of a duplicated literal list; the
# final .copy() makes `tops` an independent frame for later column assignments.
tops = tops[~tops['text'].str.lower().isin(bad_values)].copy()  # remove bad values

from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# fix randomness for reproducibility
DetectorFactory.seed = 0

def is_english(text):
    """Return True if the text is detected as English, False otherwise.

    Parameters
    ----------
    text : str
        Review text to classify. Non-string values (e.g. NaN) are treated
        as not English.

    Returns
    -------
    bool
        True when langdetect reports 'en'; False for any other language,
        for non-string input, or when langdetect cannot detect a language.
    """
    if not isinstance(text, str):
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised by langdetect when detection fails; catching only this
        # (rather than a bare `except:`) lets KeyboardInterrupt and genuine
        # programming errors surface instead of being silently dropped.
        return False

# Flag every review by detected language (is_english returns False when
# detection fails, so undetectable reviews are dropped as well).
english_mask = tops['text'].apply(is_english)

# assign() + .copy() keep `tops` an independent frame: writing a column into a
# boolean-filtered slice triggers SettingWithCopyWarning, and later cells add
# more columns to `tops`.
tops = tops.assign(is_english=english_mask)
tops = tops[tops['is_english']].copy()  # remove non-english reviews
tops.shape

(30907, 8)

In [3]:
# One string per row of `tops`, in row order. Convert to str *before* any
# string method is called, so a non-string value cannot raise on .lower().
reviews = tops['text'].astype(str).tolist()

# Placeholder values must be removed from `tops` itself, not from this list:
# topic assignments are written back to `tops` by position, so dropping items
# here would silently misalign reviews and rows. Verify instead of filtering.
assert all(r.lower() not in bad_values for r in reviews), \
    "placeholder reviews still present in `tops`; filter them from `tops` first"
assert len(reviews) == len(tops)

def clean_text(text):
    """Normalise a review string for topic modelling.

    Steps, applied in order: lowercase, strip links (``http...``/``www...``),
    strip punctuation (anything that is not a word character or whitespace),
    strip digits, then drop every non-ASCII character (emojis, accents).
    Whitespace is left as-is, so removed tokens may leave repeated spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercase, ASCII-only text.
    """
    cleaned = text.lower()
    # Order matters: links must go before punctuation is stripped.
    for pattern in (r"http\S+|www\S+", r"[^\w\s]", r"\d+"):
        cleaned = re.sub(pattern, "", cleaned)
    ascii_bytes = cleaned.encode("ascii", "ignore")
    return ascii_bytes.decode()

# Normalise every review with clean_text; order and length are unchanged.
reviews = list(map(clean_text, reviews))

len(reviews)

30907

#### Topic Modeling with BERTopic

In [4]:
# Bag-of-words settings handed to BERTopic for building topic keywords:
# English stop words removed, unigrams and bigrams, and terms kept only when
# they occur in at least 20 documents and in at most 80% of them.
vectorizer_model = CountVectorizer(
    ngram_range=(1,2),
    stop_words="english",
    max_df=0.8,
    min_df=20,
)

# Sentence-embedding model used to embed each review.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# calculate_probabilities=True makes the model return a per-topic probability
# row for every document (used later as model features).
# NOTE(review): no random seed is configured for the topic model here, so
# topic assignments may differ between runs — confirm whether that matters.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

In [5]:
# Fit the topic model on the cleaned reviews. `topics`/`probs` from this call
# describe the initial (unreduced) topic set and are overwritten further down.
topics, probs = topic_model.fit_transform(reviews)

2025-11-23 19:18:59,871 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 19:20:56,869 - BERTopic - Embedding - Completed ✓
2025-11-23 19:20:56,870 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 19:21:21,958 - BERTopic - Dimensionality - Completed ✓
2025-11-23 19:21:21,973 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 19:23:39,827 - BERTopic - Cluster - Completed ✓
2025-11-23 19:23:39,854 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 19:23:40,816 - BERTopic - Representation - Completed ✓


In [6]:
# Merge the fitted topics down to nr_topics=30 (the run log below reports the
# reduction from 306 to 30).
topic_model = topic_model.reduce_topics(reviews, nr_topics=30) #consolidate topics

2025-11-23 19:23:41,941 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 19:23:42,046 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 19:23:42,770 - BERTopic - Representation - Completed ✓
2025-11-23 19:23:42,770 - BERTopic - Topic reduction - Reduced number of topics from 306 to 30


In [7]:
# Re-run inference on the same reviews so `topics`/`probs` refer to the reduced
# topic set; the values returned by fit_transform predate the reduction.
# NOTE(review): this pass is slow (see timestamps in the log below); BERTopic
# also stores fitted results on the model (`topics_`, `probabilities_`), which
# might make it unnecessary — confirm they are equivalent for this use.
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 19:25:38,694 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-23 19:25:38,781 - BERTopic - Dimensionality - Completed ✓
2025-11-23 19:25:38,781 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-23 19:25:39,764 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-23 19:30:07,275 - BERTopic - Probabilities - Completed ✓
2025-11-23 19:30:07,276 - BERTopic - Cluster - Completed ✓


In [8]:
# `topics`/`probs` are matched to `tops` purely by position (reviews were built
# from `tops` row by row), so the lengths must agree.
assert len(topics) == len(tops), "topic assignments are not aligned with `tops`"

# assign() builds a new frame rather than writing columns into a filtered
# slice (SettingWithCopyWarning), and makes this cell safe to re-run.
tops = tops.assign(topic=topics, topic_probs=probs.tolist())

In [9]:
# Sanity check: row count unchanged, two columns added (topic, topic_probs).
tops.shape

(30907, 10)

#### Sentiment Score with VADER

In [10]:
# VADER needs its lexicon available locally before the analyzer is built.
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Score the raw (uncleaned) review text and keep only VADER's "compound" value.
tops['sentiment'] = [
    sia.polarity_scores(str(review))["compound"]
    for review in tops['text']
]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Creating DataFrame with Probability Distribution for Modeling

In [11]:
# Target and scalar features. The two Series keep `tops`' index; `topics` is
# matched to the rows by position.
df = pd.DataFrame({
    'rating': tops['rating'],
    'topic_id': topics,
    'sentiment': tops['sentiment'],
})

# One column per topic probability returned by the topic model.
prob_columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
probs_df = pd.DataFrame(probs, columns=prob_columns)

# Both frames are re-indexed 0..n-1 so they are joined row by row.
topic_prob = pd.concat(
    [df.reset_index(drop=True), probs_df.reset_index(drop=True)],
    axis=1,
)

#remove topic = -1 because those are outliers and should be ignored
is_outlier = topic_prob['topic_id'] == -1
model_df = topic_prob[~is_outlier]

model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,0,0.6918,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,3,0.8357,0.235300,4.650442e-02,3.650306e-02,1.679372e-01,3.575584e-02,1.530364e-02,1.066822e-02,...,9.842679e-04,4.895131e-03,6.082960e-04,1.136030e-03,7.398211e-04,7.129811e-04,9.065141e-04,1.247030e-03,1.651904e-03,1.636247e-03
3,1.0,14,0.4215,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6,4.0,0,0.0516,0.170790,2.533316e-02,2.670997e-02,2.041132e-02,1.492635e-02,9.030376e-03,5.922454e-03,...,7.375602e-04,1.819878e-03,5.828516e-04,9.445351e-04,7.226641e-04,5.980123e-04,8.023744e-04,7.284924e-04,1.302170e-03,8.173486e-04
8,5.0,1,0.9150,0.107149,5.792131e-01,1.014467e-02,2.184771e-02,1.816267e-02,3.983197e-03,2.764112e-03,...,2.705237e-04,9.995637e-04,1.826030e-04,3.732847e-04,2.463259e-04,2.100690e-04,2.982318e-04,3.295871e-04,4.930494e-04,3.642787e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,3.0,0,0.0000,1.000000,1.967487e-09,1.636042e-09,1.496203e-09,9.910888e-10,7.593174e-10,5.015099e-10,...,7.352875e-11,1.480350e-10,5.904868e-11,1.019727e-10,1.633424e-10,5.516477e-11,1.168074e-10,6.114784e-11,6.419881e-11,6.623546e-11
30903,1.0,3,-0.4588,0.357991,9.006978e-02,5.774870e-02,2.393977e-01,5.271012e-02,2.614224e-02,2.009556e-02,...,2.113248e-03,6.951781e-03,1.741326e-03,2.604612e-03,2.016828e-03,1.895402e-03,2.395403e-03,2.342274e-03,2.427522e-03,2.705861e-03
30904,1.0,6,0.0000,0.101207,1.946482e-02,2.019649e-02,2.342093e-02,1.224150e-02,1.061801e-02,3.238983e-02,...,9.881662e-04,2.225914e-03,9.391871e-04,7.973846e-04,7.320764e-04,1.275978e-03,7.828115e-04,2.477447e-03,7.502202e-04,1.412698e-03
30905,1.0,1,-0.4215,0.000013,8.629145e-01,2.139970e-06,2.242695e-06,1.355255e-06,9.754264e-07,6.730335e-07,...,9.185833e-08,2.051024e-07,8.538584e-08,1.387654e-07,1.213709e-07,7.774757e-08,1.397541e-07,8.109060e-08,8.092358e-08,8.843149e-08


#### Training and Testing the Model

In [12]:
# Features: sentiment score plus every topic probability column, i.e. all
# columns of model_df except the target and the hard topic label.
X = model_df.drop(columns=["rating", "topic_id"])
y = model_df["rating"]

# 80/20 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Random forest regressor predicting the star rating from sentiment and topic
# probabilities; unlimited depth, fixed seed.
model = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("MSE:", mse)
print(f"R-squared: {r2}")

MSE: 1.3133649149456061
R-squared: 0.29328045999982744


#### Results

In [19]:
# Importance of each input feature, in the column order the model was fit on:
# sentiment first, then one column per topic probability.
importances = model.feature_importances_

sentiment = ['sentiment'] #first feature

# Fetch the topic table once instead of once per topic, and select the named
# topics by their id rather than assuming the outlier topic (-1) is row 0.
topic_info = topic_model.get_topic_info()
named_topics = topic_info[topic_info['Topic'] != -1].sort_values('Topic')

n_topics = len(named_topics)
probs_names = named_topics['Name'].tolist()

topic_names = sentiment + probs_names

# zip() silently truncates on a length mismatch, which would mislabel
# importances; fail loudly instead.
assert len(topic_names) == len(importances), (
    f"{len(topic_names)} feature names for {len(importances)} importances"
)

# Print features from most to least important.
for name, imp in sorted(zip(topic_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {imp:.4f}")

sentiment: 0.4802
0_ordered_size_fit_wear: 0.0497
12_looking_definitely_comfortable_fit: 0.0289
3_material_expected_looks_returned: 0.0273
2_old_big_way_expected: 0.0268
1_time_bad_day_work: 0.0263
4_picture_looks_received_disappointed: 0.0236
11_disappointed_use_going_received: 0.0221
27_product_bit_know_look: 0.0219
7_product_item_perfect_time: 0.0217
23_received_time_doesnt_did: 0.0185
10_old_bought_perfect_works: 0.0177
9_bad_material_looks_expected: 0.0175
5_cute_size_big_looks: 0.0174
22_day_fine_look_received: 0.0167
14_big_use_old_bit: 0.0155
18_comfortable_better_returned_going: 0.0150
15_material_returned_right_wear: 0.0141
16_return_returned_time_day: 0.0137
8_cute_day_time_look: 0.0136
25_going_works_perfect_use: 0.0133
20_picture_doesnt_wasnt_look: 0.0129
6_picture_product_bad_worth: 0.0126
24_worth_ok_better_does: 0.0116
13_bought_look_use_day: 0.0113
21_time_return_use_works: 0.0111
28_comfortable_bad_ok_fine: 0.0102
26_works_product_work_didnt: 0.0101
19_recommend_day_u