In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic

In [2]:
# Read the review CSV into a DataFrame; the relative path is resolved against
# the notebook's working directory.
DATA_PATH = 'balanced_data.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Keep only the rows whose category is 'Shirts/Tops', then pull the review
# text and the integer-cast ratings out as plain Python lists.
shirts_mask = data['category'].eq('Shirts/Tops')
tops = data[shirts_mask]
reviews = tops['text'].tolist()
ratings = tops['rating'].astype(int).tolist()


In [4]:
# Force every entry to a string before the list is handed to the topic model.
reviews = list(map(str, reviews))

In [5]:
# Inspect the column names available on the filtered frame.
tops.columns

Index(['Unnamed: 0', 'rating', 'parent_asin', 'text', 'title',
       'average_rating', 'category'],
      dtype='object')

In [6]:
#topic_model = BERTopic(language="english", verbose=True)
#topics, probs = topic_model.fit_transform(reviews)

In [7]:
#topic_model.get_topic_info()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

In [9]:
# Sentence-embedding backbone used by BERTopic to encode the reviews.
# The BERTopic instance itself is built once, further down, together with the
# custom vectorizer; the extra model that used to be constructed here was
# overwritten before it was ever fitted, so it has been dropped.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

In [10]:
# Token-counting settings passed to BERTopic: drop English stop words and
# count both unigrams and bigrams.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [11]:

# Assemble the topic model from the embedding backbone and the custom vectorizer.
# NOTE(review): no random seed is configured for the model here, so topic ids
# may differ between runs — confirm whether reproducibility matters.
bertopic_kwargs = {
    "embedding_model": embedding_model,
    "vectorizer_model": vectorizer_model,
}
topic_model = BERTopic(**bertopic_kwargs)

In [12]:
import os

# The Hugging Face tokenizers library prints a "process just got forked"
# warning each time the process forks during fitting unless this variable is
# set explicitly (the warning text itself recommends doing so).
# setdefault keeps any value that was already exported in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Fit BERTopic on the review texts. `topics` holds one topic id per review
# (-1 marks outliers); `probs` holds the values BERTopic returns alongside them.
topics, probs = topic_model.fit_transform(reviews)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got f

In [13]:
# Attach the per-review topic id and probability to the frame.
# `tops` was produced by boolean-filtering `data`, so assigning columns on it
# in place triggers pandas' SettingWithCopyWarning. `assign` returns a new
# frame instead, which also makes this cell safe to re-run.
tops = tops.assign(topic=topics, topic_probs=probs)

In [14]:
# Drop the reviews BERTopic flagged as outliers (topic == -1). The explicit
# .copy() makes tops_cleaned an independent frame, so adding a column below
# neither raises SettingWithCopyWarning nor risks writing to a temporary slice.
tops_cleaned = tops[tops['topic'] != -1].copy()

# Calculate sentiment polarity for each review
from textblob import TextBlob
import nltk

# Download required NLTK data (only needed first time): each resource is
# looked up locally and fetched only when the lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Polarity ranges from -1 (negative) to 1 (positive)
sentiment_scores = [
    TextBlob(str(review)).sentiment.polarity
    for review in tops_cleaned['text']
]

tops_cleaned['sentiment_polarity'] = sentiment_scores

print(f"Sentiment scores calculated. Range: {min(sentiment_scores):.3f} to {max(sentiment_scores):.3f}")
print(f"Mean sentiment: {np.mean(sentiment_scores):.3f}")

Sentiment scores calculated. Range: -1.000 to 1.000
Mean sentiment: 0.154


In [15]:
# Preview the first rows, which now carry the topic, topic_probs and
# sentiment_polarity columns added above.
tops_cleaned.head()

Unnamed: 0.1,Unnamed: 0,rating,parent_asin,text,title,average_rating,category,topic,topic_probs,sentiment_polarity
0,0,2.0,B08NHJ5S5H,Not long enough. I like shirt that cover or be...,Dokotoo Women's Ladies Spring Basic Ribbed Str...,3.6,Shirts/Tops,18,1.0,0.0125
3,3,3.0,B071F91SCM,"Very pretty, but the post is super thick and t...",925 Sterling Silver Cubic Zirconia Purple Butt...,4.7,Shirts/Tops,16,0.205686,0.119444
9,9,4.0,B08JGGTCB9,I got it for my bf for his birthday was disapp...,Jordan Paris Saint-Germain Long-Sleeve T-Shirt...,4.0,Shirts/Tops,345,1.0,-0.125
17,17,3.0,B01IP4GEA2,Small and itchy.,Romwe Women's Striped Crewneck Short Sleeve Lo...,3.3,Shirts/Tops,241,1.0,-0.25
61,61,4.0,B01EN4KVRK,Cute little dress but does run on the smaller ...,Tenworld Women Crew Neck Short Sleeve Striped ...,3.4,Shirts/Tops,0,1.0,0.177083


In [16]:
# Print the representative documents recorded for topic 2.
topic_2_info = topic_model.get_topic_info(2)
print(topic_2_info['Representative_Docs'])

0    [I love this Blouse!, Really like this blouse....
Name: Representative_Docs, dtype: object


In [17]:
# List the representative documents BERTopic kept for topic 45.
topic_model.get_representative_docs(45)

['Buy a size larger.', 'Order 1 size up..', 'Order a size or two larger']

In [18]:
# Summary row (count, name, representation, representative docs) for topic 45.
topic_model.get_topic_info(45)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,45,91,45_size order_order size_order sizes_buy size,"[size order, order size, order sizes, buy size...","[Buy a size larger., Order 1 size up.., Order ..."


In [19]:
# Overview of every topic, including the -1 outlier bucket.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,15512,-1_shirt_size_like_material,"[shirt, size, like, material, fit, small, larg...",[Really cute - way too small. I ordered a siz...
1,0,1737,0_dress_love dress_dresses_cute dress,"[dress, love dress, dresses, cute dress, pocke...","[very cute dress, Love dress, The whole dress]"
2,1,414,1_la_muy_es_el,"[la, muy, es, el, que, calidad, pero, tela, en...","[La tela es muy delgada, [[VIDEOID:9bc533c84ab..."
3,2,358,2_blouse_nice blouse_blouses_cute blouse,"[blouse, nice blouse, blouses, cute blouse, be...","[I love this Blouse!, Really like this blouse...."
4,3,318,3_suit_bathing_swimsuit_bathing suit,"[suit, bathing, swimsuit, bathing suit, bottom...",[I loved this bathing suit! I was worried if m...
...,...,...,...,...,...
373,372,10,372_ran small_small ran_ran_smo ran,"[ran small, small ran, ran, smo ran, smo, way ...","[Ran to small, Ran too small, Ran small.]"
374,373,10,373_shirt price_shirt nice_dark fine_likes dis...,"[shirt price, shirt nice, dark fine, likes dis...",[This shirt is perfect!!!! It is a beautiful c...
375,374,10,374_new semester_start brand_semester best_bes...,"[new semester, start brand, semester best, bes...","[Everyone likes it. But not me..., a good star..."
376,375,10,375_gray_grey_gray dark_color grey,"[gray, grey, gray dark, color grey, ordered gr...","[I ordered black, I got grey. Sweats are comfo..."


In [20]:
# Fetch the per-topic summary and report it with the outlier bucket included.
topic_info = topic_model.get_topic_info()
print("Topic info before removing outliers:")
print(topic_info)
print(f"\nTotal topics including outliers: {len(topic_info)}")

# Keep only the real topics: the outlier bucket is the row with Topic == -1.
is_real_topic = topic_info['Topic'].ne(-1)
topic_info_cleaned = topic_info[is_real_topic].copy()
print(f"\nTopic info after removing outliers (topic = -1):")
print(topic_info_cleaned)
print(f"\nTotal topics after removing outliers: {len(topic_info_cleaned)}")

Topic info before removing outliers:
     Topic  Count                                               Name  \
0       -1  15512                        -1_shirt_size_like_material   
1        0   1737              0_dress_love dress_dresses_cute dress   
2        1    414                                     1_la_muy_es_el   
3        2    358           2_blouse_nice blouse_blouses_cute blouse   
4        3    318               3_suit_bathing_swimsuit_bathing suit   
..     ...    ...                                                ...   
373    372     10                372_ran small_small ran_ran_smo ran   
374    373     10  373_shirt price_shirt nice_dark fine_likes dis...   
375    374     10  374_new semester_start brand_semester best_bes...   
376    375     10                 375_gray_grey_gray dark_color grey   
377    376     10  376_quality small_rating larger_poor large_tex...   

                                        Representation  \
0    [shirt, size, like, material, fit, 

In [21]:
# Display cleaned topic info (without topic = -1); as the last expression in
# the cell it is rendered as a table rather than printed as plain text.
topic_info_cleaned


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,1737,0_dress_love dress_dresses_cute dress,"[dress, love dress, dresses, cute dress, pocke...","[very cute dress, Love dress, The whole dress]"
2,1,414,1_la_muy_es_el,"[la, muy, es, el, que, calidad, pero, tela, en...","[La tela es muy delgada, [[VIDEOID:9bc533c84ab..."
3,2,358,2_blouse_nice blouse_blouses_cute blouse,"[blouse, nice blouse, blouses, cute blouse, be...","[I love this Blouse!, Really like this blouse...."
4,3,318,3_suit_bathing_swimsuit_bathing suit,"[suit, bathing, swimsuit, bathing suit, bottom...",[I loved this bathing suit! I was worried if m...
5,4,292,4_2x_3x_2xl_3xl,"[2x, 3x, 2xl, 3xl, ordered 3x, ordered 2x, 1x,...","[2x fit like a Large, Shirt was to big for a 3..."
...,...,...,...,...,...
373,372,10,372_ran small_small ran_ran_smo ran,"[ran small, small ran, ran, smo ran, smo, way ...","[Ran to small, Ran too small, Ran small.]"
374,373,10,373_shirt price_shirt nice_dark fine_likes dis...,"[shirt price, shirt nice, dark fine, likes dis...",[This shirt is perfect!!!! It is a beautiful c...
375,374,10,374_new semester_start brand_semester best_bes...,"[new semester, start brand, semester best, bes...","[Everyone likes it. But not me..., a good star..."
376,375,10,375_gray_grey_gray dark_color grey,"[gray, grey, gray dark, color grey, ordered gr...","[I ordered black, I got grey. Sweats are comfo..."


#### Training the Model

##### Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Feature matrix: each review's stored topic probability, stacked into a
# column, placed next to its sentiment polarity.
# NOTE(review): topic_probs holds a single value per review (see the
# "1 features" line printed below), so the topic id itself is not part of the
# features — confirm this is intended.
topic_features = np.vstack(tops_cleaned['topic_probs'].to_numpy())
sentiment_features = tops_cleaned['sentiment_polarity'].to_numpy().reshape(-1, 1)
combined_features = np.concatenate([topic_features, sentiment_features], axis=1)

print(f"Feature shape: {combined_features.shape}")
print(f"  - Topic probabilities: {topic_features.shape[1]} features")
print(f"  - Sentiment polarity: 1 feature")

# Hold out 20% of the reviews for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    tops_cleaned['rating'],
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares baseline predicting the star rating.
model = LinearRegression().fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
r2_model = r2_score(y_test, preds)

print("MSE:", mse)
print("R-squared:", r2_model)

Feature shape: (18421, 2)
  - Topic probabilities: 1 features
  - Sentiment polarity: 1 feature
MSE: 1.4167180904905652
R-squared: 0.2828498183505883


##### XGBoost

In [23]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Gradient-boosted trees fitted on the X_train / y_train split that is already
# in scope from the preceding model cell.
xgb_params = dict(n_estimators=300, learning_rate=0.05, max_depth=6)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score the held-out set.
xgb_preds = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_preds)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_preds)

print("XGBoost Model Performance:")
print(f"MSE: {xgb_mse:.4f}")
print(f"RMSE: {xgb_rmse:.4f}")
print(f"R² Score: {xgb_r2:.4f}")


XGBoost Model Performance:
MSE: 1.3828
RMSE: 1.1759
R² Score: 0.3000


##### Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Rebuild the 80/20 split with the same inputs and seed used for the linear
# regression, so the models are evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    tops_cleaned['rating'],
    test_size=0.2,
    random_state=42,
)

# 300 trees with no depth limit; random_state pins the forest's randomness.
rf = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42)
rf.fit(X_train, y_train)

# NOTE: `preds` and `mse` reuse (and overwrite) the names set by the
# linear-regression cell.
preds = rf.predict(X_test)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("MSE:", mse)
print(f"R-squared: {r2}")

MSE: 1.6064945085386402
R-squared: 0.18678399298315396
