In [3]:
import pandas as pd

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sentence_transformers import SentenceTransformer

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import re

#### Cleaning Reviews

In [4]:
# Load the review dataset; later cells read its `category`, `text` and `rating` columns.
data = pd.read_csv('balanced_data.csv')

In [5]:
# Keep only the Shirts/Tops reviews. Take an explicit copy so the column
# assignment below modifies `tops` itself rather than a slice of `data`
# (this is what triggers pandas' SettingWithCopyWarning otherwise).
tops = data[data['category'] == 'Shirts/Tops'].copy()

tops['text'] = tops['text'].str.strip()
tops = tops[tops['text'].str.split().str.len() >= 3]  # remove very short reviews (fewer than 3 words)

# Placeholder strings that carry no review content. Every entry has fewer than
# three words, so the length filter above already drops them; the check is kept
# as an explicit guard and is matched case-insensitively ("N/A", "None", ...).
bad_values = {"na", "n a", "n/a", "none", ""}
tops = tops[~tops['text'].str.lower().isin(bad_values)]  # remove bad values


In [6]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# fix randomness for reproducibility
DetectorFactory.seed = 0

def is_english(text):
    """Return True if the text is detected as English, False otherwise.

    Non-string or blank input is treated as "not English". Only langdetect's
    own detection failure is caught, so KeyboardInterrupt and genuine
    programming errors are no longer silently converted into False.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised by langdetect when it cannot determine a language for the text.
        return False

tops['is_english'] = tops['text'].apply(is_english)

# Copy after filtering so later cells can add columns to `tops` without
# writing into a slice of the previous frame.
tops = tops[tops['is_english']].copy() #remove non-english reviews
tops.shape

(30907, 8)

In [7]:
# Plain Python list of review strings, in the same row order as `tops`.
reviews = list(map(str, tops['text'].tolist()))

def clean_text(text):
    """Normalize one review for topic modeling.

    Lower-cases the text, then strips (in this order) links, punctuation and
    digits, and finally drops every non-ASCII character (e.g. emojis).
    Whitespace is left untouched, so removed tokens can leave double spaces.
    """
    cleaned = text.lower()
    # Order matters: links must go before punctuation removal breaks them up.
    for pattern in (r"http\S+|www\S+", r"[^\w\s]", r"\d+"):
        cleaned = re.sub(pattern, "", cleaned)
    ascii_bytes = cleaned.encode("ascii", "ignore")
    return ascii_bytes.decode()

# Clean every review; the list order (and therefore alignment with `tops`) is unchanged.
reviews = list(map(clean_text, reviews))

#### Topic Modeling with BERTopic

In [9]:
# Bag-of-words settings passed to BERTopic below as `vectorizer_model`.
vectorizer_model = CountVectorizer(
    ngram_range=(1,2),     # single words and two-word phrases
    min_df=20,             # drop terms found in fewer than 20 documents
    max_df=0.8,            # drop terms found in more than 80% of documents
    stop_words="english"   # drop common English stop words
)

In [10]:
# NOTE(review): no random seed is fixed for the topic model in this notebook, so
# topic ids and counts may differ between runs — confirm before relying on specific ids.
topic_model = BERTopic(
    verbose=True,                        # log each pipeline stage
    calculate_probabilities=True,        # return a per-topic probability for each document
    vectorizer_model=vectorizer_model,   # CountVectorizer configured above
    embedding_model="all-MiniLM-L6-v2"   # sentence-embedding model, referenced by name
)

In [11]:
# Fit the topic model on the cleaned reviews; returns one topic id per review
# plus the document-by-topic probabilities requested via calculate_probabilities.
topics, probs = topic_model.fit_transform(reviews)

2025-11-23 18:51:10,747 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 18:53:23,106 - BERTopic - Embedding - Completed ✓
2025-11-23 18:53:23,106 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 18:53:45,746 - BERTopic - Dimensionality - Completed ✓
2025-11-23 18:53:45,748 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 18:56:01,640 - BERTopic - Cluster - Completed ✓
2025-11-23 18:56:01,670 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 18:56:02,570 - BERTopic - Representation - Completed ✓


In [12]:
# Merge the fitted topics down to nr_topics=30 so the regression below has a
# manageable number of topic features.
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-11-23 18:56:03,523 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 18:56:03,616 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 18:56:04,303 - BERTopic - Representation - Completed ✓
2025-11-23 18:56:04,317 - BERTopic - Topic reduction - Reduced number of topics from 305 to 30


In [13]:
# Re-assign every review under the reduced topic set; `topics` and `probs`
# from the initial fit are overwritten here.
# NOTE(review): this re-predicts the training documents — confirm whether the
# assignments already stored on the reduced model would be preferable.
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/966 [00:00<?, ?it/s]

2025-11-23 18:58:43,962 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-23 18:58:44,069 - BERTopic - Dimensionality - Completed ✓
2025-11-23 18:58:44,069 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-23 18:58:45,093 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-23 19:00:45,157 - BERTopic - Probabilities - Completed ✓
2025-11-23 19:00:45,157 - BERTopic - Cluster - Completed ✓


In [14]:
# Attach results positionally: `reviews` was built from `tops['text']` in row
# order, so the i-th topic/probability row belongs to the i-th row of `tops`.
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()  # one list of per-topic probabilities per review

#### Add Sentiment Score with VADER

In [15]:
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

# Analyzer instance reused for every review in the next cell.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yzhen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
def _compound_sentiment(review):
    """Return VADER's compound polarity score for a single review."""
    return sia.polarity_scores(str(review))["compound"]

# Scored on the original review text in `tops`, not on the cleaned `reviews` list.
tops['sentiment'] = tops['text'].apply(_compound_sentiment)

In [17]:
# Sanity check: same number of rows as after language filtering, with the
# `topic`, `topic_probs` and `sentiment` columns added.
tops.shape

(30907, 11)

#### Creating DataFrame with Probability Distribution for Modeling

In [18]:
# Target and scalar features; the index comes from `tops`, while `topics` is
# matched to the rows by position.
df = pd.DataFrame({
    'rating': tops['rating'],
    'topic_id': topics,
    'sentiment': tops['sentiment'],
})

# One column per topic: topic_prob_0 ... topic_prob_{k-1}.
n_topic_cols = probs.shape[1]
prob_columns = [f"topic_prob_{i}" for i in range(n_topic_cols)]
probs_df = pd.DataFrame(probs, columns=prob_columns)

In [19]:
# Put both frames on a fresh 0..n-1 index so they join side by side by row
# position instead of by the original `tops` index labels.
topic_prob = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, probs_df)],
    axis=1,
)

topic_prob

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,7,0.6918,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,1,0.8357,1.057566e-01,1.063411e-01,2.818887e-02,7.807468e-02,2.670484e-02,2.120229e-02,8.598746e-03,...,1.203544e-03,1.386529e-03,6.700466e-04,1.027934e-03,7.425539e-04,6.581720e-04,1.252856e-03,8.938212e-04,1.323511e-03,1.277081e-03
2,3.0,-1,0.0129,7.381566e-01,4.934022e-02,8.665510e-175,2.693237e-159,9.143343e-174,4.676196e-170,4.488390e-173,...,7.333563e-177,2.292597e-176,2.148623e-177,4.989666e-177,2.344941e-177,1.843418e-177,8.161632e-177,4.976148e-177,1.366724e-176,1.347963e-176
3,1.0,3,0.4215,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,1.0,-1,-0.8079,6.970956e-70,5.020753e-70,1.844051e-70,3.277011e-70,1.511947e-70,1.302569e-70,5.784408e-71,...,1.107979e-71,1.051290e-71,5.713942e-72,6.357858e-72,4.803504e-72,6.607422e-72,8.059758e-72,5.263226e-72,1.133957e-69,1.311341e-69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,3.0,0,0.0000,7.833401e-01,5.070764e-03,1.595211e-03,2.839447e-03,1.347359e-03,1.291427e-03,6.912988e-04,...,8.886801e-05,1.069459e-04,4.209400e-05,1.003608e-04,6.615979e-05,3.682015e-05,1.251257e-04,7.675772e-05,8.826599e-05,8.957924e-05
30903,1.0,3,-0.4588,1.151140e-07,7.971078e-08,2.492201e-08,9.042637e-01,2.505366e-08,2.613334e-08,8.668479e-09,...,1.101235e-09,1.772292e-09,7.652076e-10,1.142487e-09,8.430621e-10,7.529744e-10,1.270704e-09,1.001103e-09,1.208774e-09,1.164333e-09
30904,1.0,9,0.0000,9.708633e-02,6.498663e-02,2.749905e-02,4.327016e-02,1.866393e-02,1.618007e-02,8.743206e-03,...,1.639317e-03,1.308601e-03,1.306173e-03,1.178842e-03,1.134533e-03,1.581003e-03,1.253291e-03,1.026023e-03,3.346382e-03,2.790602e-03
30905,1.0,0,-0.4215,4.853068e-01,3.419295e-02,1.309742e-02,2.020771e-02,9.150791e-03,9.640193e-03,4.930322e-03,...,6.789294e-04,8.598088e-04,6.021821e-04,9.427402e-04,8.047399e-04,5.391876e-04,9.030668e-04,7.277411e-04,5.905072e-04,5.926220e-04


In [20]:
# Reviews assigned to topic -1 are outliers and are left out of the model.
is_outlier = topic_prob['topic_id'] == -1
model_df = topic_prob[~is_outlier]

In [21]:
# Inspect the modeling frame after outlier removal (original row labels are kept).
model_df

Unnamed: 0,rating,topic_id,sentiment,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,7,0.6918,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,5.0,1,0.8357,1.057566e-01,1.063411e-01,2.818887e-02,0.078075,2.670484e-02,2.120229e-02,8.598746e-03,...,1.203544e-03,1.386529e-03,6.700466e-04,1.027934e-03,7.425539e-04,6.581720e-04,1.252856e-03,8.938212e-04,1.323511e-03,1.277081e-03
3,1.0,3,0.4215,0.000000e+00,0.000000e+00,0.000000e+00,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
7,4.0,4,0.7430,9.065665e-03,4.229857e-03,1.319976e-03,0.002429,9.756767e-01,1.774872e-03,4.953634e-04,...,5.715596e-05,8.433575e-05,4.106757e-05,5.894902e-05,4.423431e-05,3.949952e-05,6.767767e-05,6.648091e-05,6.115859e-05,5.982028e-05
8,5.0,5,0.9150,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,1.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,3.0,0,0.0000,7.833401e-01,5.070764e-03,1.595211e-03,0.002839,1.347359e-03,1.291427e-03,6.912988e-04,...,8.886801e-05,1.069459e-04,4.209400e-05,1.003608e-04,6.615979e-05,3.682015e-05,1.251257e-04,7.675772e-05,8.826599e-05,8.957924e-05
30903,1.0,3,-0.4588,1.151140e-07,7.971078e-08,2.492201e-08,0.904264,2.505366e-08,2.613334e-08,8.668479e-09,...,1.101235e-09,1.772292e-09,7.652076e-10,1.142487e-09,8.430621e-10,7.529744e-10,1.270704e-09,1.001103e-09,1.208774e-09,1.164333e-09
30904,1.0,9,0.0000,9.708633e-02,6.498663e-02,2.749905e-02,0.043270,1.866393e-02,1.618007e-02,8.743206e-03,...,1.639317e-03,1.308601e-03,1.306173e-03,1.178842e-03,1.134533e-03,1.581003e-03,1.253291e-03,1.026023e-03,3.346382e-03,2.790602e-03
30905,1.0,0,-0.4215,4.853068e-01,3.419295e-02,1.309742e-02,0.020208,9.150791e-03,9.640193e-03,4.930322e-03,...,6.789294e-04,8.598088e-04,6.021821e-04,9.427402e-04,8.047399e-04,5.391876e-04,9.030668e-04,7.277411e-04,5.905072e-04,5.926220e-04


In [22]:
# topic_model.get_topic_info()

#### Training and Testing the Model

In [23]:
# Features: sentiment score followed by the topic_prob_* columns. The
# non-feature columns are dropped by name rather than sliced by position, so
# the target (`rating`) or the hard topic label (`topic_id`) cannot leak into X
# if the column order of `model_df` ever changes. Column order is otherwise
# preserved, so `sentiment` stays first.
X = model_df.drop(columns=["rating", "topic_id"])
y = model_df["rating"]  # target: the review's star rating

In [24]:
# Hold out 20% of the rows for evaluation; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Ordinary least-squares fit of rating on sentiment + topic probabilities.
model = LinearRegression().fit(X_train, y_train)

# Score on the held-out split.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("MSE:", mse)
print(f"R-squared: {r2}")

MSE: 1.2714050294417456
R-squared: 0.29920888686149605


#### Results

In [26]:
# Pair each feature with its fitted coefficient. The row label is the feature's
# position in X, which is kept when sorting from largest to smallest coefficient.
coef_df = pd.DataFrame({"feature": X.columns, "coefficient": model.coef_})
coef_df = coef_df.sort_values(by="coefficient", ascending=False)

In [27]:
# Features ordered from the largest to the smallest coefficient.
coef_df

Unnamed: 0,feature,coefficient
24,topic_prob_23,1.874777
29,topic_prob_28,1.718915
0,sentiment,1.486579
9,topic_prob_8,1.310877
11,topic_prob_10,1.098162
28,topic_prob_27,0.852106
13,topic_prob_12,0.825674
26,topic_prob_25,0.807347
18,topic_prob_17,0.636
27,topic_prob_26,0.585976


In [28]:
# Row label 0 is the sentiment feature; row label k (k >= 1) is topic_prob_{k-1},
# so the matching BERTopic topic id is k - 1.
for i in coef_df.index:
    if i == 0:
        topic = 'sentiment'
    else:
        topic = topic_model.get_topic_info(i-1)['Name'].to_string(index=False)
    coef = coef_df.loc[i,'coefficient']
    print(f'{topic} affects rating by {coef}')
    

23_got_received_price_time affects rating by 1.8747765721753786
28_perfect_way_bad_look affects rating by 1.7189154186791202
sentiment affects rating by 1.4865790596006214
8_loved_cute_bought_old affects rating by 1.3108772278790048
10_comfortable_fits_cute_perfect affects rating by 1.098161650755869
27_work_product_works_day affects rating by 0.8521055306881787
12_product_package_time_came affects rating by 0.8256735438265511
25_perfect_price_works_little affects rating by 0.8073467781102405
17_expected_wanted_wasnt_ok affects rating by 0.6359998904440662
26_perfect_best_work_doesnt affects rating by 0.585975837762018
16_use_comfortable_came_different affects rating by 0.23137278604337805
9_picture_price_cheap_cheaply affects rating by 0.21151961893724638
7_work_day_works_use affects rating by 0.21058413413333713
0_wear_fit_size_im affects rating by 0.15896034467421863
11_look_cute_came_time affects rating by 0.1263818868572901
3_material_fabric_cheap_expected affects rating by 0.1229

In [29]:
# Look at topic 28 (one of the largest coefficients): its size, keywords and sample documents.
topic_model.get_topic_info(28)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,28,10,28_perfect_way_bad_look,"[perfect, way, bad, look, im, , , , , ]","[perfect thanks, perfect in love, its perf..."


In [30]:
# Example reviews that BERTopic considers representative of topic 8.
topic_model.get_representative_docs(8)

['bought for granddaughter she loved it',
 'loved it as did all my coworkers cute as can be exactly as pictured',
 'loved it so cute on']