# Lab Session 4 - Task C: Visualization

1. Setup and Imports 

In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from tqdm.notebook import tqdm

try:
    _ = ppmi_matrix
    _ = svd_matrix
    _ = word2vec_embeddings
    _ = id_to_word
    print("Found all necessary variables from previous tasks. Ready to proceed.")
except NameError as e:
    print(f"ERROR: Could not find a required variable: {e}")
    print("Please make sure you have run the notebooks for Task A and Task B in this session first.")

ERROR: Could not find a required variable: name 'ppmi_matrix' is not defined
Please make sure you have run the notebooks for Task A and Task B in this session first.


In [7]:
# === 2. Prepare Data for Visualization ===
# Plotting all 10,000 words would be unreadable.
# We will select the 750 most frequent words from our vocabulary for a clearer plot.

NUM_WORDS_TO_PLOT = 750

# Assuming id_to_word is ordered by frequency (which it is from our setup)
plot_indices = list(range(NUM_WORDS_TO_PLOT))
plot_words = [id_to_word[i] for i in plot_indices]

# Get the corresponding vectors from each matrix
# Note: PPMI is a sparse matrix, so we convert it to a dense array first
ppmi_vectors_plot = ppmi_matrix[plot_indices, :].toarray()
svd_vectors_plot = svd_matrix[plot_indices, :]
word2vec_vectors_plot = word2vec_embeddings[plot_indices, :]

print(f"Selected {len(plot_words)} most frequent words for visualization.")

NameError: name 'id_to_word' is not defined

In [4]:
# === 3. Dimensionality Reduction ===
# We will now run both PCA and t-SNE on our subset of vectors from each of the three models.
# t-SNE can take a minute or two to run.

def reduce_dimensions(vectors, random_state=42):
    """Reduces dimensions of vectors using both PCA and t-SNE."""
    # PCA reduction
    pca = PCA(n_components=2, random_state=random_state)
    vectors_pca = pca.fit_transform(vectors)
    
    # t-SNE reduction
    tsne = TSNE(n_components=2, perplexity=15, random_state=random_state, n_iter=1000)
    vectors_tsne = tsne.fit_transform(vectors)
    
    return vectors_pca, vectors_tsne


print("Starting dimensionality reduction for all three models...")

# This might take a few minutes, especially the t-SNE parts.
with tqdm(total=3, desc="Reducing Dimensions") as pbar:
    ppmi_pca, ppmi_tsne = reduce_dimensions(ppmi_vectors_plot)
    pbar.update(1)
    svd_pca, svd_tsne = reduce_dimensions(svd_vectors_plot)
    pbar.update(1)
    word2vec_pca, word2vec_tsne = reduce_dimensions(word2vec_vectors_plot)
    pbar.update(1)

print("Dimensionality reduction complete.")

Starting dimensionality reduction for all three models...


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [5]:
# === 4. Create Interactive Plots ===
# We will now create a DataFrame to hold all our results
# and then generate interactive plots using Plotly.
# Hover over the points in the plot to see which word they represent.

def create_plots(df, model_name):
    """Generates PCA and t-SNE plots for a given model."""
    # PCA Plot
    fig_pca = px.scatter(df, x='pca_x', y='pca_y', text='word',
                         title=f'PCA Visualization of {model_name} Embeddings')
    fig_pca.update_traces(textposition='top center', mode='markers+text', textfont_size=8, marker=dict(size=5))
    fig_pca.update_layout(height=800, width=800, showlegend=False)
    fig_pca.show()
    
    # t-SNE Plot
    fig_tsne = px.scatter(df, x='tsne_x', y='tsne_y', text='word',
                          title=f't-SNE Visualization of {model_name} Embeddings')
    fig_tsne.update_traces(textposition='top center', mode='markers+text', textfont_size=8, marker=dict(size=5))
    fig_tsne.update_layout(height=800, width=800, showlegend=False)
    fig_tsne.show()


# Create DataFrames and plot for each model
models_data = {
    'PPMI (VSM)': (ppmi_pca, ppmi_tsne),
    'Truncated SVD': (svd_pca, svd_tsne),
    'Word2Vec': (word2vec_pca, word2vec_tsne)
}

for model_name, (pca_data, tsne_data) in models_data.items():
    df = pd.DataFrame({
        'word': plot_words,
        'pca_x': pca_data[:, 0],
        'pca_y': pca_data[:, 1],
        'tsne_x': tsne_data[:, 0],
        'tsne_y': tsne_data[:, 1]
    })
    create_plots(df, model_name)


NameError: name 'ppmi_pca' is not defined

 === 5. How to Interpret the Plots ===

#As you explore the interactive plots, here are some things to look for to compare the models:

#* Semantic Clustering: Look for meaningful groups of words.
  Do you see clusters for countries, names, technologies, or sports terms?
  A good model should place semantically similar words close to each other.

* Model Comparison: How do these clusters compare across the three models (PPMI, SVD, Word2Vec)?
  Often, SVD and Word2Vec will show more defined and coherent clusters than the raw PPMI matrix.

* PCA vs. t-SNE:
 - PCA is a linear projection that tries to preserve the global variance in the data.
    You might see broad themes or directions in the PCA plot.
  - t-SNE is a non-linear technique that focuses on preserving the local neighborhood of each point.
   It is excellent at revealing fine-grained clusters but can sometimes create a false sense of separation between clusters.
 By comparing these visualizations, you can gain a more intuitive understanding of the conceptual differences
#between these powerful word representation techniques.
