In [4]:
import os
import pandas as pd
import numpy as np
import string
import re
import pickle
from umap import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Download the NLTK data packages this notebook's NLTK imports may rely on.
# The recorded cell output shows each package reported as already up-to-date
# on a re-run.
# NOTE(review): none of the cells visible here call the NLTK tokenizer,
# lemmatizer or tagger — confirm these downloads are still needed.
nltk.download('wordnet')
nltk.download('punkt_tab')
# Last expression of the cell: its return value is what the notebook displays
# as the cell result (`True` in the recorded output).
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [5]:
# Mount Google Drive (required every time when running in Colab).
# The import is guarded so the notebook also runs outside Colab, where the
# `google.colab` package does not exist; the path-configuration cell already
# provides a local fallback for PROJECT_ROOT in that case.
try:
    from google.colab import drive
except ImportError:
    # Local / non-Colab run: there is no Drive to mount.
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Define and check the paths
# PROJECT_ROOT assumes the shared Milestone II folder is in your root google drive
PROJECT_ROOT = '/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues' # Nathan's Drive

# Resolve the fallback BEFORE deriving the sub-directories. Deriving them first
# would leave DATA_DIR / NOTEBOOK_DIR / OUTPUT_DIR pointing at the missing Drive
# folder even after PROJECT_ROOT has been switched to the local path.
if not os.path.exists(PROJECT_ROOT):
    PROJECT_ROOT = os.path.abspath("..")  # fallback for local runs

# All project directories hang off the (possibly re-pointed) PROJECT_ROOT.
DATA_DIR = f"{PROJECT_ROOT}/data"
NOTEBOOK_DIR = f"{PROJECT_ROOT}/notebooks"
OUTPUT_DIR = f"{PROJECT_ROOT}/outputs"

In [7]:
"""
| Model       | Training flavor                   |
| ----------- | --------------------------------- |
| MiniLM      | distilled sentence similarity     |
| MPNet       | masked + permuted LM              |
| E5          | retrieval / contrastive           |
| BGE-M3      | multi-task embedding              |

| Axis                        | Models                      |
| --------------------------- | --------------------------- |
| Small vs Large              | MiniLM vs E5-large          |
| General vs Retrieval        | MPNet vs E5                 |
| Monolingual vs Multilingual | MPNet vs Multilingual-MPNet |
| General vs Specialized      | MPNet vs BGE-M3             |
"""

# Every model's embeddings live in DATA_DIR under the same naming scheme:
#   <model key>_indicator_embeddings.parquet
_MODEL_KEYS = (
    "MiniLM",
    "MiniLM-L12",
    "MPNet",
    "E5-base",
    "E5-large",
    "BGE-M3",
    "Multilingual-MPNet",
)

# Model key -> path of its embeddings parquet file (insertion order = tuple order).
MODEL_FILES = {
    key: f"{DATA_DIR}/{key}_indicator_embeddings.parquet"
    for key in _MODEL_KEYS
}

# Model analysed by this run of the notebook; change the key to switch models.
MODEL_NAME = "Multilingual-MPNet"
PARQUET_PATH = MODEL_FILES[MODEL_NAME]

In [8]:
# ==========================
# UMAP PLOTTING HELPER FUNCTION
# ==========================

def run_umap_and_plot(
    df,
    X,
    output_prefix,
    title,
    model_name,
    output_dir,
    hue_col="wordplay_type",
    n_neighbors=20,
    min_dist=0.1,
    random_state=42,
):
    """
    Project an embedding matrix to 2-D with UMAP, store the coordinates on
    ``df``, and save a scatterplot of them as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per embedding. Mutated in place: the columns
        ``{output_prefix}_umap_x`` and ``{output_prefix}_umap_y`` are added
        (or overwritten).
    X : array-like
        Embedding matrix passed to ``UMAP.fit_transform``; must have exactly
        one row per row of ``df``.
    output_prefix : str
        Prefix for the new coordinate columns and part of the PNG file name.
    title : str
        Plot title suffix; the full title is ``"{model_name} {title}"``.
    model_name : str
        Model label used in the title and the file name.
    output_dir : str
        Directory for the PNG; created if it does not exist.
    hue_col : str or None, default "wordplay_type"
        Column of ``df`` used to colour the points.
    n_neighbors, min_dist, random_state
        Forwarded to ``UMAP``.

    Returns
    -------
    None
        Results are delivered through the new ``df`` columns and the saved
        file ``{output_dir}/{model_name}_{output_prefix}.png``.

    Raises
    ------
    ValueError
        If ``X`` and ``df`` have different row counts, or ``hue_col`` is not
        a column of ``df``.
    """
    # ----- Validate inputs before the expensive UMAP fit -----
    n_rows = np.shape(X)[0]
    if n_rows != len(df):
        raise ValueError(
            f"X has {n_rows} rows but df has {len(df)}; they must match one-to-one."
        )
    if hue_col is not None and hue_col not in df.columns:
        raise ValueError(f"hue_col {hue_col!r} is not a column of df.")

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # ----- Run UMAP -----
    reducer = UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
        init="random",
    )

    X_umap = reducer.fit_transform(X)

    # ----- Add columns to df -----
    x_col = f"{output_prefix}_umap_x"
    y_col = f"{output_prefix}_umap_y"

    df[x_col] = X_umap[:, 0]
    df[y_col] = X_umap[:, 1]

    # ----- Plot -----
    # Explicit figure/axes so that exactly this figure is saved and closed,
    # regardless of what other figures happen to be open.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.scatterplot(
            data=df,
            x=x_col,
            y=y_col,
            hue=hue_col,
            palette="tab10",
            s=40,
            ax=ax,
        )

        ax.set_title(f"{model_name} {title}")
        # Legend sits outside the axes, to the right of the plot area.
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
        fig.tight_layout()

        # ----- Save -----
        filename = f"{model_name}_{output_prefix}.png"
        path = os.path.join(output_dir, filename)

        # bbox_inches="tight" keeps the outside legend inside the saved image.
        fig.savefig(path, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    print(f"Saved: {filename}")

In [9]:
# ==========================
# 0) LOAD DATA
# ==========================
df = pd.read_parquet(PARQUET_PATH)

# The two embedding columns share a naming scheme and differ only in suffix.
ctx_col, no_ctx_col = (
    f"emb_{MODEL_NAME}_{suffix}" for suffix in ("with_context", "no_context")
)

# Stack the per-row vectors of each column into one matrix per variant.
X_ctx = np.vstack(df[ctx_col].values)
X_no = np.vstack(df[no_ctx_col].values)
print("Loaded shapes:", X_ctx.shape, X_no.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues/data/Multilingual-MPNet_indicator_embeddings.parquet'

In [None]:
%%time
# ==========================
# 1) RAW UMAPS
# ==========================

# Same UMAP + plot routine for both embedding variants, on the raw
# (pre-PCA) matrices: (matrix, column/file prefix, plot title).
for embeddings, prefix, plot_title in (
    (X_ctx, "umap_with_context_prepca", "with context, UMAP Before PCA"),
    (X_no, "umap_without_context_prepca", "without context, UMAP Before PCA"),
):
    run_umap_and_plot(
        df=df,
        X=embeddings,
        output_prefix=prefix,
        title=plot_title,
        model_name=MODEL_NAME,
        output_dir=OUTPUT_DIR,
    )

In [None]:
%%time
# ==========================
# 2) PCA 0.90
# ==========================
def _fit_pca_90(X):
    """Fit a fresh PCA(n_components=0.90) on X; return (fitted model, projected X)."""
    model = PCA(n_components=0.90)
    return model, model.fit_transform(X)


# With-context embeddings: project and report how many components were kept.
pca_90, X_ctx_pca = _fit_pca_90(X_ctx)
k_mle_ctx = int(pca_90.n_components_)
print(f"{MODEL_NAME} 90% PCA optimal k, with context = {k_mle_ctx}")

# No-context embeddings: independent fit; `pca_90` now refers to this model.
pca_90, X_no_pca = _fit_pca_90(X_no)
k_mle_no = int(pca_90.n_components_)
print(f"{MODEL_NAME} 90% PCA optimal k, without context = {k_mle_no}")

In [None]:
%%time
# ==========================
# 3) PCA UMAPS
# ==========================

# Repeat the UMAP + plot routine on the PCA-reduced matrices:
# (matrix, column/file prefix, plot title).
for reduced, prefix, plot_title in (
    (X_ctx_pca, "umap_with_context_postpca", "with context, UMAP After PCA"),
    (X_no_pca, "umap_without_context_postpca", "without context, UMAP After PCA"),
):
    run_umap_and_plot(
        df=df,
        X=reduced,
        output_prefix=prefix,
        title=plot_title,
        model_name=MODEL_NAME,
        output_dir=OUTPUT_DIR,
    )

In [None]:
%%time
# ==========================
# 4) SAVE TO PARQUET
# ==========================
# Show which columns (including the UMAP coordinates added above) get written.
print(df.columns)

# Output file is named after the model and stored next to the input data.
out_name = f"{MODEL_NAME}_pca_umap.parquet"
df_path = os.path.join(DATA_DIR, out_name)

# index=False: the DataFrame index is not written to the file.
df.to_parquet(df_path, index=False)
print("Saved dataframe to:", df_path)