In [142]:
import pandas as pd

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sentence_transformers import SentenceTransformer

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re

#### Cleaning Reviews

In [143]:
data = pd.read_csv('balanced_data.csv')

In [144]:
tops = data[data['category'] == 'Shirts/Tops']

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].isin(["na", "n a", "n/a", "none", ""])] #remove bad values


reviews = tops['text'].tolist()
ratings = tops['rating'].astype(int).tolist()

reviews = [r for r in reviews if r.lower() not in bad_values]


In [145]:
reviews = [str(i) for i in reviews]

In [146]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"\d+", "", text)             # remove numbers
    text = text.encode("ascii", "ignore").decode()  # remove emojis
    return text

reviews = [clean_text(r) for r in reviews]

#### Topic Modeling with BERTopic

In [147]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.8,           
    ngram_range=(1,2)
)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [148]:
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [149]:
topics, probs = topic_model.fit_transform(reviews)

2025-11-23 16:55:05,703 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1001 [00:00<?, ?it/s]

2025-11-23 16:57:17,531 - BERTopic - Embedding - Completed ✓
2025-11-23 16:57:17,531 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 16:57:25,316 - BERTopic - Dimensionality - Completed ✓
2025-11-23 16:57:25,318 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 17:02:20,259 - BERTopic - Cluster - Completed ✓
2025-11-23 17:02:20,275 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 17:02:22,011 - BERTopic - Representation - Completed ✓


In [150]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-11-23 17:02:24,040 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 17:02:24,230 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 17:02:25,694 - BERTopic - Representation - Completed ✓
2025-11-23 17:02:25,711 - BERTopic - Topic reduction - Reduced number of topics from 339 to 30


In [151]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/1001 [00:00<?, ?it/s]

2025-11-23 17:06:02,660 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-23 17:06:02,761 - BERTopic - Dimensionality - Completed ✓
2025-11-23 17:06:02,761 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-23 17:06:03,814 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-23 17:09:04,558 - BERTopic - Probabilities - Completed ✓
2025-11-23 17:09:04,560 - BERTopic - Cluster - Completed ✓


In [152]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

#### Add Sentiment Score with VADER

In [153]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [154]:
tops['sentiment'] = tops['text'].apply(lambda x: sia.polarity_scores(str(x))["compound"])

#### Creating DataFrame with Probability Distribution for Modeling

In [155]:
df = pd.DataFrame()
df['rating'] = tops['rating']
df['topic_id'] = topics
df['sentiment'] = tops['sentiment']

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [156]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,13,0.6918,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,0,0.8357,0.219522,8.820456e-02,1.631699e-02,2.846477e-02,2.374721e-02,3.428578e-02,2.206994e-02,...,1.514134e-03,4.047023e-03,2.435023e-03,2.274813e-03,1.144617e-03,1.630100e-03,2.489775e-03,1.211559e-03,8.425617e-04,1.521806e-03
2,3.0,-1,0.0129,0.766671,7.748200e-02,6.868952e-36,5.542804e-36,4.523793e-36,7.432973e-36,1.435023e-36,...,1.164579e-37,1.624183e-37,1.635477e-37,8.534656e-38,4.614032e-38,4.432972e-37,2.128705e-37,6.084603e-38,2.574324e-38,1.713614e-37
3,1.0,0,0.4215,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,1.0,-1,-0.8079,0.025905,7.599528e-03,3.384295e-03,3.979079e-03,6.630725e-03,3.843393e-03,3.293735e-01,...,2.950890e-04,4.769956e-03,3.464901e-04,3.552896e-03,3.079801e-04,2.175044e-04,2.524947e-04,2.121483e-04,2.289494e-04,2.015352e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32027,3.0,0,0.0000,0.565993,2.404273e-02,1.003683e-02,1.254019e-02,1.242842e-02,1.510536e-02,1.097786e-02,...,1.246602e-03,2.357587e-03,1.513076e-03,1.235595e-03,7.459692e-04,7.331522e-04,8.114442e-04,9.312664e-04,5.955903e-04,1.033921e-03
32028,1.0,1,-0.4588,0.186919,1.743065e-01,2.013385e-02,2.393291e-02,2.028393e-02,2.784264e-02,2.064893e-02,...,1.993731e-03,4.129078e-03,2.895171e-03,2.252465e-03,1.153598e-03,1.951451e-03,2.009479e-03,1.363685e-03,1.030186e-03,1.433284e-03
32029,1.0,6,0.0000,0.121842,3.671261e-02,1.327187e-02,2.012866e-02,1.878882e-02,1.822838e-02,5.765613e-02,...,1.446622e-03,7.177866e-03,1.823610e-03,4.298955e-03,1.262442e-03,1.027328e-03,1.190231e-03,9.972922e-04,1.168718e-03,9.473825e-04
32030,1.0,0,-0.4215,0.913962,6.236952e-10,2.622935e-10,3.106081e-10,2.781644e-10,3.507338e-10,2.724615e-10,...,3.203356e-11,5.733413e-11,4.800904e-11,3.030382e-11,1.737200e-11,2.009258e-11,2.023411e-11,2.324032e-11,1.644001e-11,2.132148e-11


In [157]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [158]:
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,13,0.6918,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,0,0.8357,0.219522,8.820456e-02,1.631699e-02,2.846477e-02,2.374721e-02,3.428578e-02,2.206994e-02,...,1.514134e-03,4.047023e-03,2.435023e-03,2.274813e-03,1.144617e-03,1.630100e-03,2.489775e-03,1.211559e-03,8.425617e-04,1.521806e-03
3,1.0,0,0.4215,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
5,3.0,0,0.6369,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6,4.0,0,0.0516,0.215168,3.803359e-02,1.423603e-02,2.185822e-02,1.742302e-02,2.030849e-02,1.406477e-02,...,1.779990e-03,3.038106e-03,1.851148e-03,1.609744e-03,8.770611e-04,1.125303e-03,1.111224e-03,1.045313e-03,7.499693e-04,1.093758e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32027,3.0,0,0.0000,0.565993,2.404273e-02,1.003683e-02,1.254019e-02,1.242842e-02,1.510536e-02,1.097786e-02,...,1.246602e-03,2.357587e-03,1.513076e-03,1.235595e-03,7.459692e-04,7.331522e-04,8.114442e-04,9.312664e-04,5.955903e-04,1.033921e-03
32028,1.0,1,-0.4588,0.186919,1.743065e-01,2.013385e-02,2.393291e-02,2.028393e-02,2.784264e-02,2.064893e-02,...,1.993731e-03,4.129078e-03,2.895171e-03,2.252465e-03,1.153598e-03,1.951451e-03,2.009479e-03,1.363685e-03,1.030186e-03,1.433284e-03
32029,1.0,6,0.0000,0.121842,3.671261e-02,1.327187e-02,2.012866e-02,1.878882e-02,1.822838e-02,5.765613e-02,...,1.446622e-03,7.177866e-03,1.823610e-03,4.298955e-03,1.262442e-03,1.027328e-03,1.190231e-03,9.972922e-04,1.168718e-03,9.473825e-04
32030,1.0,0,-0.4215,0.913962,6.236952e-10,2.622935e-10,3.106081e-10,2.781644e-10,3.507338e-10,2.724615e-10,...,3.203356e-11,5.733413e-11,4.800904e-11,3.030382e-11,1.737200e-11,2.009258e-11,2.023411e-11,2.324032e-11,1.644001e-11,2.132148e-11


In [None]:
# topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,13349,-1_wear_fabric_ordered_large,"[wear, fabric, ordered, large, cute, fits, col...",[the shirt does not look like the picture as f...
1,0,9691,0_ordered_large_fits_wear,"[ordered, large, fits, wear, way, cute, little...",[this is a very cute comfy top originally i or...
2,1,1714,1_cheap_fabric_expected_horrible,"[cheap, fabric, expected, horrible, super, mon...","[fabric is very shiny and seems kinda cheap, c..."
3,2,1214,2_short_tight_weird_work,"[short, tight, weird, work, right, little, wor...",[the seams on the sleeves are not very straigh...
4,3,910,3_smaller_way_expected_return,"[smaller, way, expected, return, little, large...","[way off min its size so much smaller, nothing..."
5,4,848,4_cute_daughter_short_little,"[cute, daughter, short, little, super, way, ex...","[cute a little short, i liked it a little shor..."
6,5,688,5_comfortable_wear_work_fits,"[comfortable, wear, work, fits, cute, works, d...","[comfortable and easy care, comfortable light ..."
7,6,525,6_picture_cheap_looks like_cheaply,"[picture, cheap, looks like, cheaply, item, ba...","[nothing like picture at all, nothing like on ..."
8,7,475,7_color_picture_shown_different,"[color, picture, shown, different, ordered, ex...",[i like the celebrating color military allusio...
9,8,354,8_color_horrible_real_super,"[color, horrible, real, super, large, , , , , ]",[videoidddbbcfcaadca muy mala calidad devuelta...


#### Training and Testing the Model

In [159]:
X = model_df.iloc[:, 2:] #topic probs and sentiment score
y = model_df["rating"]

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [161]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.33296650356958
R-squared: 0.27769041886010015


#### Results

In [162]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [163]:
coef_df

Unnamed: 0,feature,coefficient
0,sentiment,1.521557
11,topic_prob_10,1.341228
29,topic_prob_28,1.25075
6,topic_prob_5,0.850711
9,topic_prob_8,0.658017
21,topic_prob_20,0.478219
7,topic_prob_6,0.394473
19,topic_prob_18,0.22457
13,topic_prob_12,0.197528
14,topic_prob_13,0.195215


In [164]:
for i in coef_df.index:
    if i != 0: #account for sentiment being 0 but not a topic
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    else:
        topic = 'sentiment'
        coef = coef_df.loc[i,'coefficient']
        print(f'{topic} affects rating by {coef}')
    

sentiment affects rating by 1.5215567811980721
10_loved_daughter_happy_did affects rating by 1.3412284969680863
28_close_used_wouldnt_use affects rating by 1.250749654400749
5_comfortable_wear_work_fits affects rating by 0.8507107131884667
8_color_horrible_real_super affects rating by 0.6580167784279691
20_expected_exactly_wanted_wasnt affects rating by 0.47821868043659144
6_picture_cheap_looks like_cheaply affects rating by 0.3944732214656627
18_money_waste_worth_buy affects rating by 0.2245702740551532
12_item_took_exactly_buy affects rating by 0.19752815612944144
13_daughter_use_loved_work affects rating by 0.19521516361103008
23_recommend_use_day_right affects rating by 0.08741633728911764
4_cute_daughter_short_little affects rating by 0.06884198357234385
9_cute_daughter_day_thought affects rating by 0.05305655279955291
7_color_picture_shown_different affects rating by 0.03829219863642765
25_picture_doesnt_fabric_wasnt affects rating by 0.019353917300889673
17_comfortable_little_us

In [165]:
topic_model.get_topic_info(25)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,25,23,25_picture_doesnt_fabric_wasnt,"[picture, doesnt, fabric, wasnt, looked, looki...",[the fabric is a pretty print but thats the on...
