In [1]:
import os
import pandas as pd
import numpy as np
import string
import re
import umap
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import adjusted_rand_score, silhouette_score

nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [2]:
# Mount Google Drive (required every time)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Define and check the paths
# PROJECT_ROOT assumes the shared Milestone II folder is in your root google drive
PROJECT_ROOT = "/content/drive/MyDrive/Milestone II - NLP Cryptic Crossword Clues" # Sahana's Root Filepath
DATA_DIR = f"{PROJECT_ROOT}/data"
NOTEBOOK_DIR = f"{PROJECT_ROOT}/notebooks"

if not os.path.exists(PROJECT_ROOT):
    PROJECT_ROOT = os.path.abspath("..")  # fallback for local runs

In [5]:
# Read each CSV file into a DataFrame
df_clues = pd.read_csv(f'{DATA_DIR}/clues_raw.csv')
df_indicators = pd.read_csv(f'{DATA_DIR}/indicators_raw.csv')
df_ind_by_clue = pd.read_csv(f'{DATA_DIR}/indicators_by_clue_raw.csv')
df_ind_consolidated = pd.read_csv(f'{DATA_DIR}/indicators_consolidated_raw.csv')
df_charades = pd.read_csv(f'{DATA_DIR}/charades_raw.csv')
df_charades_by_clue = pd.read_csv(f'{DATA_DIR}/charades_by_clue_raw.csv')
df_ver_indicators = pd.read_csv(f'{DATA_DIR}/verified_indicators.csv')

*   A total of 12621 unique indicator values were extracted from `df_ver_indicators`.
*   The `df_indicators` DataFrame was successfully filtered, and only contains the rows where the 'indicator' column's value matches one of the unique indicators found in `df_ver_indicators`.


In [6]:
# Clean df indicators only if the indicator words appear in clue text.
unique_ver_indicators = set(df_ver_indicators.iloc[:, 0].unique())
df_indicators = df_indicators[df_indicators['indicator'].isin(unique_ver_indicators)]
df_indicators

Unnamed: 0,ind_id,wordplay,indicator,clue_ids
0,1,alternation,abnormal,[623961](/data/clues/623961)
2,3,alternation,after regular excisions,[107211](/data/clues/107211)
3,4,alternation,alternately,[407055](/data/clues/407055)
4,5,alternation,alternating,[449798](/data/clues/449798)
5,6,alternation,alternative,[623976](/data/clues/623976)
...,...,...,...,...
15730,15731,reversal,wrote up,[207283](/data/clues/207283)
15731,15732,reversal,yields up,[412934](/data/clues/412934)
15732,15733,reversal,you once reflected,[50741](/data/clues/50741)
15733,15734,reversal,yours truly brought up,[61777](/data/clues/61777)


In [7]:
# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
df_indicators["clue_ids"] = (
    df_indicators["clue_ids"]
    .str.findall(r"\[(\d+)\]")
    .apply(lambda xs: [int(x) for x in xs])
)

# Include a new column to keep track of how many clues have this indicator
df_indicators["num_clues"] = df_indicators["clue_ids"].apply(len)

In [8]:
# Add indicator_word(s) and wordplay_type(s) to df_clues.
indicator_clue_map_records = []
for index, row in df_indicators.iterrows():
    wordplay = row['wordplay']
    indicator = row['indicator']
    for clue_id in row['clue_ids']:
        indicator_clue_map_records.append({
            'clue_id': clue_id,
            'indicator_word': indicator,
            'wordplay_type': wordplay
        })

df_indicator_clue_map = pd.DataFrame(indicator_clue_map_records)

# Group by clue_id and aggregate the indicator words and wordplay types into lists
df_aggregated_indicators = df_indicator_clue_map.groupby('clue_id').agg({
    'indicator_word': lambda x: list(x),
    'wordplay_type': lambda x: list(x)
}).reset_index()

# Merge with df_clues
df_clues = df_clues.merge(df_aggregated_indicators, on='clue_id', how='left')

df_clues.head()

Unnamed: 0,clue_id,clue,answer,definition,clue_number,puzzle_date,puzzle_name,source_url,source,indicator_word,wordplay_type
0,1,"Acquisitive chap, as we see it (8)",COVETOUS,Acquisitive,1a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
1,2,Back yard fencing weak and sagging (6),DROOPY,sagging,5a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
2,3,"Stripping off uniform, love holding colonel's ...",UNCLOTHING,Stripping,8a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
3,4,Without a mark where they should be gained (4),EXAM,where they should be gained,9a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
4,5,"Put a stop to Rugby's foul school leader (5,2,...",KNOCK ON THE HEAD,Put a stop to,10a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,


In [9]:
# Mod actual dataset to fit into model as training data.
df_clues_ind_cleaned = df_clues.dropna(subset=['indicator_word'])
df_clues_ind_cleaned = df_clues_ind_cleaned[['clue', 'indicator_word', 'answer', 'wordplay_type']]
df_clues_ind_cleaned['clue'] = df_clues_ind_cleaned['clue'].astype(str)

# Separate clues with multiple indicators so that each row of the df is one indicator_word and one wordplay_type.
df_clues_ind_cleaned = df_clues_ind_cleaned.explode('indicator_word')

df_clues_ind_cleaned["indicator_word"] = (
    df_clues_ind_cleaned["indicator_word"]
    .apply(lambda x: x[0] if isinstance(x, list) else x)
)

df_clues_ind_cleaned["wordplay_type"] = (
    df_clues_ind_cleaned["wordplay_type"]
    .apply(lambda x: x[0] if isinstance(x, list) else x)
)

df_clues_ind_cleaned

Unnamed: 0,clue,indicator_word,answer,wordplay_type
89,Training device transforming Liam's tour (9),transforming,SIMULATOR,anagram
96,Switch posts near ground (9),ground,TRANSPOSE,anagram
100,Destroyed a Parisian serving-girl verbally? (6),verbally,UNMADE,homophone
141,About to go back to a security organisation - ...,about to go back,ACACIA,reversal
144,Perth perv returned to dance (4),returned,REEL,reversal
...,...,...,...,...
660599,Cookware from Spooner’s inquisitive admirer (6-3),breaks,FRYING,anagram
660600,Cub leader is dishonest about working with lea...,say,LIONESS,homophone
660602,Smarter bit of couture: it tantalises when twi...,said,NATTIER,homophone
660606,Strange realities figure in the Bible (9),cryptically,ISRAELITE,anagram


In [10]:
print(df_clues_ind_cleaned["indicator_word"].head())
print(df_clues_ind_cleaned["indicator_word"].apply(type).value_counts())

89         transforming
96               ground
100            verbally
141    about to go back
144            returned
Name: indicator_word, dtype: object
indicator_word
<class 'str'>    91448
Name: count, dtype: int64


In [None]:
%%time
# ----------------------------------------------------
# 0) Models to compare
# ----------------------------------------------------
MODEL_NAMES = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet": "sentence-transformers/all-mpnet-base-v2",
    "DistilRoBERTa": "sentence-transformers/all-distilroberta-v1"
}

N_CLUSTERS = 8   # same as number of wordplay types.

# ----------------------------------------------------
# 1) Highlight the indicator inside the clue text.
# ----------------------------------------------------
def highlight_indicators(row):
    clue = row["clue"]
    ind = row["indicator_word"]

    pattern = re.compile(re.escape(ind), re.IGNORECASE)
    highlighted = pattern.sub(f"[{ind.upper()}]", clue)

    return highlighted

df = df_clues_ind_cleaned.copy()
df["highlighted_clue"] = df.apply(highlight_indicators, axis=1)


# ----------------------------------------------------
# 2) Function to run full pipeline for one model
# ----------------------------------------------------
def run_pipeline(model_name, model_ckpt, n_clusters=8):

    print(f"\n=== RUNNING MODEL: {model_name} ===")

    # ---- Embed ----
    model = SentenceTransformer(model_ckpt)

    embeddings = model.encode(
        df["highlighted_clue"].tolist(),
        convert_to_tensor=True
    ).cpu().numpy()

    embeddings_norm = normalize(embeddings)

    # ---- Cluster ----
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings_norm)
    df["cluster"] = clusters

    # ---- Metrics ----
    # Adjusted Rand Index vs your gold labels
    ari = adjusted_rand_score(
        df["wordplay_type"].astype(str),
        clusters
    )

    # Silhouette score (intrinsic quality)
    sil = silhouette_score(embeddings_norm, clusters)

    # Average within-cluster variance
    variances = []
    for c in range(n_clusters):
        cluster_points = embeddings_norm[clusters == c]
        if len(cluster_points) > 1:
            variances.append(cluster_points.var())
    mean_variance = float(np.mean(variances))

    # ---- Aggregate cluster info  ----
    cluster_agg = {}

    for c in range(n_clusters):
        cluster_data = df[
            df["cluster"] == c
        ]

        cluster_agg[c] = {
            "indicator_words": sorted(
                cluster_data["indicator_word"].unique().tolist()
            ),
            "wordplay_types": sorted(
                cluster_data["wordplay_type"].unique().tolist()
            ),
            "centroid": normalize(
                kmeans.cluster_centers_[c].reshape(1, -1)
            )[0]
        }

    results = {
        "model": model_name,
        "ari": ari,
        "silhouette": sil,
        "mean_within_cluster_variance": mean_variance,
        "kmeans": kmeans,
        "embeddings_norm": embeddings_norm,
        "clusters": clusters,
        "cluster_agg": cluster_agg,
        "model_obj": model
    }

    print(f"ARI: {ari:.4f}")
    print(f"Silhouette: {sil:.4f}")
    print(f"Mean within-cluster variance: {mean_variance:.6f}")

    return results

# ----------------------------------------------------
# 3) Run comparison for all models
# ----------------------------------------------------
all_results = {}

for name, ckpt in MODEL_NAMES.items():
    all_results[name] = run_pipeline(name, ckpt, n_clusters=N_CLUSTERS)

# ----------------------------------------------------
# 4) Put results in a table for easy comparison
# ----------------------------------------------------
summary = pd.DataFrame({
    name: {
        "ARI": all_results[name]["ari"],
        "Silhouette": all_results[name]["silhouette"],
        "MeanVariance": all_results[name]["mean_within_cluster_variance"]
    }
    for name in all_results
}).T

print("\n=== MODEL COMPARISON SUMMARY ===")
print(summary.sort_values("ARI", ascending=False))


=== RUNNING MODEL: MiniLM ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ARI: 0.0031
Silhouette: 0.0172
Mean within-cluster variance: 0.002604

=== RUNNING MODEL: MPNet ===


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ARI: 0.0036
Silhouette: 0.0145
Mean within-cluster variance: 0.001302

=== RUNNING MODEL: DistilRoBERTa ===


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: sentence-transformers/all-distilroberta-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def plot_umap(embeddings_norm, clusters, title):
    reducer = umap.UMAP(
        n_neighbors=15,
        min_dist=0.1,
        metric="cosine",
        random_state=42
    )

    emb_2d = reducer.fit_transform(embeddings_norm)

    plt.figure(figsize=(7,6))
    plt.scatter(emb_2d[:,0], emb_2d[:,1], c=clusters, s=5)
    plt.title(title)
    plt.show()

In [None]:
# Test predictions.

def predict_indicator_type(new_clue, indicator):
    # highlight single indicator
    pattern = re.compile(re.escape(indicator), re.IGNORECASE)
    highlighted = pattern.sub(f"[{indicator.upper()}]", new_clue)

    # embed
    emb = model.encode([highlighted], convert_to_tensor=True).cpu().numpy()
    emb_norm = normalize(emb)[0]

    # cosine similarity to centroids
    best_cluster = None
    best_sim = -1

    for c, info in cluster_agg.items():
        sim = float(np.dot(emb_norm, info["centroid"]))
        if sim > best_sim:
            best_sim = sim
            best_cluster = c

    return (
        cluster_agg[best_cluster]["wordplay_types"],
        best_cluster,
        best_sim
    )

test_clues = [
    ("Chaos letters wild in disorder", "wild"),
    ("Go back to the previous step", "back"),
    ("The surrounding letters in the container", "about")
]

for clue_text, ind in test_clues:
    types, cluster, confidence = predict_indicator_type(clue_text, ind)
    print(f"\nClue: {clue_text}")
    print(f"Indicator: {ind}")
    print(f"Predicted types: {types} (Cluster {cluster})")
    print(f"Similarity: {confidence:.4f}")

NameError: name 'model' is not defined

In [None]:
search_terms = ["anagrammed", "westernly"]
def find_indicator_words(indicator_list, terms):
    for indicator in indicator_list:
        for term in terms:
            if term.lower() in indicator.lower():
                return True
    return False

df_filtered_indicators = df_clues_ind_cleaned[df_clues_ind_cleaned['indicator_word'].apply(lambda x: find_indicator_words(x, search_terms))]
df_filtered_indicators

In [None]:
clue_indicator_tuples = []
for index, row in df_filtered_indicators.iterrows():
    clue_indicator_tuples.append((row['clue'], row['indicator_word']))

for clue_text, indicators in clue_indicator_tuples:
    types, cluster, confidence = predict_indicator_type(clue_text, indicators)
    print(f"Clue: {clue_text}")
    print(f"Indicators: {indicators}")
    print(f"Predicted types: {types} (Cluster {cluster})")
    print(f"Similarity score: {confidence:.4f}\n")

In [None]:
'''# SIMPLE MODEL (NOT ACTUALLY CALE THOUGH)

# ------------------------
# Step 1: Highlight indicators
# ------------------------
def highlight_indicators(row):
    clue = row["clue"]
    indicators = row["indicator_word"]
    for ind in indicators:
        pattern = re.compile(re.escape(ind), re.IGNORECASE)
        clue = pattern.sub(f"[{ind.upper()}]", clue)
    return clue

df_clues_ind_cleaned["highlighted_clue"] = df_clues_ind_cleaned.apply(highlight_indicators, axis=1)

# ------------------------
# Step 2: Embed clues
# ------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    df_clues_ind_cleaned["highlighted_clue"].tolist(),
    convert_to_tensor=True
).cpu().numpy()

embeddings_norm = normalize(embeddings)

# ------------------------
# Step 3: Cluster embeddings
# ------------------------
n_clusters = 8    # Try 8 clusters, compare with 8 labels of wordplay_type
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings_norm)
df_clues_ind_cleaned["cluster"] = clusters

# Aggregate cluster info
cluster_agg = {}

for c in range(n_clusters):
    cluster_data = df_clues_ind_cleaned[df_clues_ind_cleaned["cluster"] == c]

    indicators_in_cluster = []
    types_in_cluster = []

    for _, row in cluster_data.iterrows():
        indicators_in_cluster.extend(row["indicator_word"])
        types_in_cluster.extend(row["wordplay_type"])

    centroid = normalize(kmeans.cluster_centers_[c].reshape(1, -1))[0]

    cluster_agg[c] = {
        "indicator_words": sorted(set(indicators_in_cluster)),
        "wordplay_types": sorted(set(types_in_cluster)),
        "centroid": centroid
    }

# ------------------------
# Step 4: Predict indicator type
# ------------------------
def predict_indicator_type(new_clue, indicator_list):
    highlighted = new_clue
    for ind in indicator_list:
        pattern = re.compile(re.escape(ind), re.IGNORECASE)
        highlighted = pattern.sub(f"[{ind.upper()}]", highlighted)

    emb = model.encode([highlighted], convert_to_tensor=True).cpu().numpy()
    emb_norm = normalize(emb)[0]

    best_cluster = None
    best_sim = -1

    for c, info in cluster_agg.items():
        sim = np.dot(emb_norm, info["centroid"])
        if sim > best_sim:
            best_sim = sim
            best_cluster = c

    return (
        cluster_agg[best_cluster]["wordplay_types"],
        best_cluster,
        float(best_sim)
    )

# ------------------------
# Step 5: Test with new clues
# ------------------------
test_clues = [
    ("Chaos letters wild in disorder", ["wild"]),
    ("Go back to the previous step", ["back"]),
    ("The surrounding letters in the container", ["about"])
]

for clue_text, indicators in test_clues:
    types, cluster, confidence = predict_indicator_type(clue_text, indicators)
    print(f"Clue: {clue_text}")
    print(f"Indicators: {indicators}")
    print(f"Predicted types: {types} (Cluster {cluster})")
    print(f"Similarity score: {confidence:.4f}\n")'''