# UMAP visualization

In [None]:
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import seaborn as sns

# Optional for interactive plots (only import if you'll use it):
# import plotly.express as px


In [3]:
# Load your labeled dataset
df_work = pd.read_csv("amazon_reviews_with_final_labels.csv")
df_work.head()

Unnamed: 0,asin,brand_name,parent_asin,user_id,date,title,text,images,helpful_vote,verified_purchase,doc,cluster,umap_x,umap_y,label,final_label
0,B0007XB2Y8,Carhartt,B0C1CW6ZCJ,AEXGQF5KRMQPIIGABYU4NGU2QWIA,2022-08-04 06:35:58.242,Too tight. Not loose fit. Only sleeves is loose.,This thing is too slim and does not fit loosel...,[],1,True,Too tight. Not loose fit. Only sleeves is loos...,7,0.390308,4.165219,Clothing / Fit & Quality,Clothing / Fit & Quality
1,B012SPPR3Y,Sloggers,B09NSZ12NZ,AG3ZJ6A7PU7ZDMUKDZVOHMFTOOJA,2021-06-21 04:17:06.209,Even the larger size is too narrow & painfully...,Too small. I usually wear 8.5 W so I ordered a...,[],0,True,Even the larger size is too narrow & painfully...,36,4.67241,-2.192302,Shoes / Fit & Width Issues,Shoes / Fit & Width Issues
2,B01LXCNS2G,Sbicca,B08JSYWH7D,AFFVAJTCQEAG5PQDCKTDYC3YUYWQ,2019-07-18 21:38:27.088,Not well made,Sole started coming apart after occasional wea...,[],1,True,Not well made. Sole started coming apart after...,23,2.509393,-2.44066,Shoes / Durability & Quality,Shoes / Durability & Quality
3,B00A9MQCVK,Hue,B00S7UIJE4,AGGCLHRQSTWKWKMJL4VZEUJAVPSA,2020-06-25 21:06:48.647,One size does not fit all.,Too small!,[],2,True,One size does not fit all.. Too small!,28,3.710247,1.569907,Fit & Sizing Issues,Fit & Sizing Issues
4,B08NTV9GLM,NuuSol,B0C5QM8RWX,AHNTYMBCE5ZEEOGECYSF6FJQPPCA,2022-09-07 12:18:17.813,Shrunk and got hard!,This was my second pair. Absolutely loved his ...,[],0,True,Shrunk and got hard!. This was my second pair....,-1,2.075371,-2.103879,Other / Mixed / Noise,Other / Mixed / Noise


In [None]:
# load precomputed embeddings.
data = np.load("amazon_embeddings_3k.npz", allow_pickle=True)
embeddings = data['embeddings']
user_ids = data['user_id']

In [None]:
# Run UMAP for 2D projection
reducer = umap.UMAP(
    n_components=2, 
    random_state=42,
    n_jobs=1  # Explicitly set to avoid warning when random_state is used
)
umap_embeddings = reducer.fit_transform(embeddings)

df_work['umap_x'] = umap_embeddings[:, 0]
df_work['umap_y'] = umap_embeddings[:, 1]

  warn(


In [None]:
# Plot using Matplotlib/Seaborn

plt.figure(figsize=(12, 9))
sns.scatterplot(
    x='umap_x',
    y='umap_y',
    hue='label',
    palette="tab20",
    data=df_work,
    legend='full',
    s=50,
    alpha=0.8
)
plt.title("UMAP Visualization of Amazon Review Clusters", fontsize=16)
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Optional: Interactive Plot with Plotly
# This allows you to hover over points to see the actual review text, which is great for presentations or deeper exploration.
# Uncomment the import in cell 1 first if you want to use this!

# import plotly.express as px
# fig = px.scatter(
#     df_work,
#     x='umap_x',
#     y='umap_y',
#     color='label',
#     hover_data=['doc', 'cluster'],
#     title="Interactive UMAP of Amazon Review Clusters"
# )
# fig.show()



In [None]:
# Optional: Cluster Summary Table
# This quickly shows the size of each labeled cluster alongside the visualization.

df_work.groupby('label').size().sort_values(ascending=False)


label
Other / Mixed / Noise                        918
Fit & Sizing Issues                          379
Positive / Satisfaction / Quality            215
Comfort & Wearability                        159
Work / Durability / Fit                      155
Quality & Durability                         103
Miscellaneous / Other                         95
Fit / Authenticity / Defects                  86
Socks / Fit & Quality                         76
Shoes / Durability & Quality                  75
Shoes / Fit & Width Issues                    73
Clothing / Fit & Quality                      68
Positive Satisfaction                         64
Shoes / Comfort & Satisfaction                58
Sweat Protection / Undershirt Performance     56
Socks / Fit & Comfort                         56
Shoe Trees / Sizing & Quality                 54
Shoes / Run Small / Size Accuracy             52
Socks / Value, Warmth & Wool Quality          42
Comfort & Cushioning (Positive)               42
Socks / Positi