# UMAP visualization

In [None]:
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import seaborn as sns

# Optional for interactive plots (only import if you'll use it):
# import plotly.express as px


In [None]:
# Read the labeled reviews CSV into df_work and preview the first rows.
labeled_csv_path = "amazon_reviews_with_final_labels.csv"
df_work = pd.read_csv(labeled_csv_path)
df_work.head()

In [None]:
# Load the precomputed embeddings and their user IDs from the .npz archive.
# The context manager closes the archive's file handle once both arrays are read;
# indexing the archive loads each array into memory, so they stay usable afterwards.
# NOTE(review): allow_pickle=True can execute arbitrary code when the archive holds
# pickled object arrays, so only load files from a trusted source. It is kept here
# because the stored arrays may be object-dtype — TODO confirm and drop it if not.
with np.load("amazon_embeddings_3k.npz", allow_pickle=True) as data:
    embeddings = data['embeddings']
    user_ids = data['user_id']

In [None]:
# Run UMAP for 2D projection and store the coordinates on df_work.

# Fail before the (slow) UMAP fit if the embeddings cannot be assigned row-for-row
# to df_work; otherwise the mismatch only surfaces at the column assignment below.
# NOTE(review): this checks the row counts only. The two files presumably also need
# to be in the same row order — verify against how they were produced.
if len(embeddings) != len(df_work):
    raise ValueError(
        f"Row count mismatch: {len(embeddings)} embeddings vs "
        f"{len(df_work)} rows in df_work"
    )

reducer = umap.UMAP(
    n_components=2,
    random_state=42,
    n_jobs=1,  # Explicitly set to avoid warning when random_state is used
)
umap_embeddings = reducer.fit_transform(embeddings)

# Column 0 / column 1 of the projection become the plot's x / y coordinates.
df_work['umap_x'] = umap_embeddings[:, 0]
df_work['umap_y'] = umap_embeddings[:, 1]

In [None]:
# Static scatter plot of the 2D UMAP coordinates, coloured by label
# (Matplotlib/Seaborn).

fig, ax = plt.subplots(figsize=(12, 9))
sns.scatterplot(
    data=df_work,
    x='umap_x',
    y='umap_y',
    hue='label',
    palette="tab20",
    legend='full',
    s=50,
    alpha=0.8,
    ax=ax,
)
ax.set_title("UMAP Visualization of Amazon Review Clusters", fontsize=16)
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
# Anchor the legend's upper-left corner just past the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Optional: Interactive Plot with Plotly
# This lets you hover over points to see the actual review text, which is useful for presentations or deeper exploration.
# To use it, uncomment the lines below (the plotly import is included here, so uncommenting the one in cell 1 is optional).

# import plotly.express as px
# fig = px.scatter(
#     df_work,
#     x='umap_x',
#     y='umap_y',
#     color='label',
#     hover_data=['doc', 'cluster'],
#     title="Interactive UMAP of Amazon Review Clusters"
# )
# fig.show()



In [None]:
# Optional: Cluster Summary Table
# Number of rows per label, largest first, for reading alongside the visualization.

label_counts = df_work.groupby('label').size()
label_counts.sort_values(ascending=False)
