In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk




#### Preprocessing

In [3]:
data = pd.read_csv('balanced_data.csv')

In [4]:
tops = data[data['category'] == 'Shirts/Tops']
reviews = tops['text'].tolist()
ratings = tops['rating'].astype(int).tolist()

In [5]:
reviews = [str(i) for i in reviews]

In [6]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=20,            
    max_df=0.8,           
    ngram_range=(1,2)
)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [8]:
topics, probs = topic_model.fit_transform(reviews)

2025-11-22 22:34:17,905 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1066 [00:00<?, ?it/s]

2025-11-22 22:37:00,941 - BERTopic - Embedding - Completed ✓
2025-11-22 22:37:00,944 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-22 22:37:42,936 - BERTopic - Dimensionality - Completed ✓
2025-11-22 22:37:42,944 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-22 22:45:57,160 - BERTopic - Cluster - Completed ✓
2025-11-22 22:45:57,184 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-22 22:45:59,132 - BERTopic - Representation - Completed ✓


In [9]:
topic_model = topic_model.reduce_topics(reviews, nr_topics=30)

2025-11-22 22:46:01,966 - BERTopic - Topic reduction - Reducing number of topics
2025-11-22 22:46:02,115 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-22 22:46:03,578 - BERTopic - Representation - Completed ✓
2025-11-22 22:46:03,584 - BERTopic - Topic reduction - Reduced number of topics from 390 to 30


In [10]:
topics, probs = topic_model.transform(reviews)

Batches:   0%|          | 0/1066 [00:00<?, ?it/s]

2025-11-22 22:48:48,639 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-22 22:48:48,806 - BERTopic - Dimensionality - Completed ✓
2025-11-22 22:48:48,811 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-22 22:48:50,877 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-22 22:57:29,236 - BERTopic - Probabilities - Completed ✓
2025-11-22 22:57:29,238 - BERTopic - Cluster - Completed ✓


In [11]:
tops['topic'] = topics
tops['topic_probs'] = probs.tolist()

In [12]:
df = pd.DataFrame()
df["rating"] = tops['rating']
df["topic_id"] = topics

probs_df = pd.DataFrame(
    probs,
    columns = [f"topic_prob_{i}" for i in range(probs.shape[1])]
)

In [13]:
topic_prob = pd.concat([df.reset_index(drop=True), probs_df.reset_index(drop=True)], axis=1)

topic_prob

Unnamed: 0,rating,topic_id,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,topic_prob_7,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,0,9.930713e-01,3.017873e-124,1.687763e-124,7.321998e-125,1.440849e-124,1.186254e-124,5.643862e-125,3.411020e-125,...,6.338331e-125,8.705333e-126,9.541142e-126,1.358718e-125,1.138660e-125,3.953098e-126,4.489924e-126,1.751435e-126,2.337052e-126,1.825661e-126
1,5.0,0,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,3.0,-1,2.885117e-01,7.923890e-03,1.567541e-02,1.133107e-02,2.506389e-02,7.714047e-03,5.726407e-183,1.759640e-183,...,2.313210e-144,7.624947e-194,2.636815e-191,2.150892e-189,1.757447e-183,2.394460e-193,7.056265e-194,6.419931e-195,8.276699e-195,6.820398e-195
3,1.0,18,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,1.0,-1,4.532876e-130,1.328785e-130,4.990267e-131,2.350403e-131,6.679890e-131,4.090653e-131,4.808722e-131,1.543736e-131,...,3.967152e-131,1.347591e-132,3.014100e-132,5.563532e-132,1.008384e-131,1.336946e-132,1.225052e-132,2.896110e-133,3.522023e-133,2.986191e-133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34077,3.0,0,9.937478e-01,1.369804e-03,7.348985e-04,3.624150e-04,4.842877e-04,4.297777e-04,2.373040e-04,1.693863e-04,...,2.193281e-04,6.001394e-05,5.398820e-05,5.972120e-05,4.846788e-05,1.887808e-05,2.181566e-05,1.048751e-05,1.676463e-05,1.083456e-05
34078,1.0,1,3.618749e-06,8.880941e-01,4.732855e-07,1.921472e-07,4.120303e-07,2.973118e-07,1.743350e-07,1.200237e-07,...,1.608404e-07,3.535079e-08,3.692484e-08,4.484517e-08,3.643732e-08,1.356552e-08,1.693694e-08,7.477117e-09,1.004428e-08,7.874520e-09
34079,1.0,-1,2.359132e-41,8.082418e-42,2.008582e-42,1.071951e-42,3.765132e-36,8.977801e-43,1.670610e-02,2.847201e-41,...,1.186591e-41,4.438374e-46,8.420833e-45,3.200348e-44,2.851032e-41,1.101347e-45,6.175247e-46,7.131541e-47,8.994934e-47,7.381537e-47
34080,1.0,14,1.802746e-01,4.321421e-02,2.532377e-02,1.063474e-02,1.622936e-02,1.503787e-02,8.725053e-03,6.995211e-03,...,7.489775e-03,2.532012e-03,2.233248e-03,2.459086e-03,1.675492e-03,7.427515e-04,1.025048e-03,5.424657e-04,7.606237e-04,5.684998e-04


In [14]:
#remove topic = -1 because those are outliers and should be ignored
model_df = topic_prob[topic_prob['topic_id'] != -1]

In [15]:
model_df

Unnamed: 0,rating,topic_id,topic_prob_0,topic_prob_1,topic_prob_2,topic_prob_3,topic_prob_4,topic_prob_5,topic_prob_6,topic_prob_7,...,topic_prob_19,topic_prob_20,topic_prob_21,topic_prob_22,topic_prob_23,topic_prob_24,topic_prob_25,topic_prob_26,topic_prob_27,topic_prob_28
0,4.0,0,0.993071,3.017873e-124,1.687763e-124,7.321998e-125,1.440849e-124,1.186254e-124,5.643862e-125,3.411020e-125,...,6.338331e-125,8.705333e-126,9.541142e-126,1.358718e-125,1.138660e-125,3.953098e-126,4.489924e-126,1.751435e-126,2.337052e-126,1.825661e-126
1,5.0,0,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,1.0,18,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6,3.0,0,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
7,4.0,0,0.405350,4.216009e-02,3.538023e-02,1.249355e-02,2.232365e-02,1.642447e-02,8.206165e-03,5.868102e-03,...,7.769368e-03,2.392243e-03,2.049810e-03,2.071537e-03,1.691949e-03,6.471634e-04,6.931386e-04,3.440668e-04,4.759012e-04,3.604700e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34076,3.0,1,0.191845,2.337818e-01,2.827114e-02,1.105256e-02,1.993560e-02,1.791423e-02,1.066678e-02,7.338152e-03,...,9.268903e-03,2.005343e-03,1.884561e-03,2.758076e-03,2.271906e-03,8.381670e-04,1.006462e-03,4.600773e-04,6.574498e-04,4.766779e-04
34077,3.0,0,0.993748,1.369804e-03,7.348985e-04,3.624150e-04,4.842877e-04,4.297777e-04,2.373040e-04,1.693863e-04,...,2.193281e-04,6.001394e-05,5.398820e-05,5.972120e-05,4.846788e-05,1.887808e-05,2.181566e-05,1.048751e-05,1.676463e-05,1.083456e-05
34078,1.0,1,0.000004,8.880941e-01,4.732855e-07,1.921472e-07,4.120303e-07,2.973118e-07,1.743350e-07,1.200237e-07,...,1.608404e-07,3.535079e-08,3.692484e-08,4.484517e-08,3.643732e-08,1.356552e-08,1.693694e-08,7.477117e-09,1.004428e-08,7.874520e-09
34080,1.0,14,0.180275,4.321421e-02,2.532377e-02,1.063474e-02,1.622936e-02,1.503787e-02,8.725053e-03,6.995211e-03,...,7.489775e-03,2.532012e-03,2.233248e-03,2.459086e-03,1.675492e-03,7.427515e-04,1.025048e-03,5.424657e-04,7.606237e-04,5.684998e-04


#### Training

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [17]:
X = model_df.iloc[:, 2:] #topic probs
y = model_df["rating"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)

print("MSE:", mse)

r2 = r2_score(y_test, preds)
print(f"R-squared: {r2}")

MSE: 1.787786096884356
R-squared: 0.06290779415333725


In [20]:
coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)

In [21]:
coef_df

Unnamed: 0,feature,coefficient
26,topic_prob_26,1.940812
6,topic_prob_6,1.842508
24,topic_prob_24,1.311299
28,topic_prob_28,1.244016
7,topic_prob_7,1.108692
19,topic_prob_19,0.804883
5,topic_prob_5,0.761095
18,topic_prob_18,0.65472
17,topic_prob_17,0.553142
3,topic_prob_3,0.276792


In [23]:
# topic_model.get_topic_info()