In [14]:
import os
import pandas as pd
import numpy as np
import string
import re
import umap
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Fetch the NLTK resource packages by name. Packages that are already present
# are only reported as up to date, so re-running this cell is harmless.
# NOTE(review): presumably wordnet backs WordNetLemmatizer, punkt_tab backs
# word_tokenize and the perceptron tagger backs POS tagging; none of them is
# used in the cells visible here - confirm they are still needed.
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [15]:
# Mount Google Drive (required every time when running in Colab).
# The google.colab package only exists inside Colab. Outside of it, skip the
# mount instead of raising, so the local-run fallback in the path cell below
# can actually be reached.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Define and check the paths
# PROJECT_ROOT assumes the shared Milestone II folder is in your root google drive
PROJECT_ROOT = '/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues' # Nathan's Drive

if not os.path.exists(PROJECT_ROOT):
    PROJECT_ROOT = os.path.abspath("..")  # fallback for local runs

# Derive the sub-directories only AFTER the fallback has been applied;
# otherwise they would keep pointing at the (missing) Drive location even
# though PROJECT_ROOT itself was switched to the local path.
DATA_DIR = f"{PROJECT_ROOT}/data"
NOTEBOOK_DIR = f"{PROJECT_ROOT}/notebooks"
OUTPUT_DIR = f"{PROJECT_ROOT}/outputs"

In [17]:
# Load every raw CSV found under DATA_DIR into its own DataFrame.
def _read_raw_csv(stem):
    """Read ``{DATA_DIR}/{stem}.csv`` with pandas' default settings."""
    return pd.read_csv(f'{DATA_DIR}/{stem}.csv')


df_clues = _read_raw_csv('clues_raw')
df_indicators = _read_raw_csv('indicators_raw')
df_ind_by_clue = _read_raw_csv('indicators_by_clue_raw')
df_ind_consolidated = _read_raw_csv('indicators_consolidated_raw')
df_charades = _read_raw_csv('charades_raw')
df_charades_by_clue = _read_raw_csv('charades_by_clue_raw')
df_ver_indicators = _read_raw_csv('verified_indicators')

In [18]:
# Keep only the rows of df_indicators whose indicator appears in the first
# column of df_ver_indicators.
# NOTE(review): presumably that file lists indicators verified to occur in
# their clue text - confirm against how verified_indicators.csv is built.
unique_ver_indicators = set(df_ver_indicators.iloc[:, 0].unique())

# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df_indicators = df_indicators[df_indicators['indicator'].isin(unique_ver_indicators)].copy()
df_indicators.shape

(14397, 4)

In [19]:
# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
def _parse_clue_ids(raw):
    """Turn one raw ``clue_ids`` cell into a list of ints.

    * str  -> every run of digits wrapped in square brackets, e.g. "[12]".
    * list / tuple / ndarray -> returned as a list of ints, so the cell can
      be re-run after the column has already been parsed.
    * anything else (e.g. NaN for a missing value) -> empty list.
    """
    if isinstance(raw, str):
        return [int(num) for num in re.findall(r"\[(\d+)\]", raw)]
    if isinstance(raw, (list, tuple, np.ndarray)):
        return [int(num) for num in raw]
    return []


# .assign builds a new frame instead of writing into a possibly-sliced one.
df_indicators = df_indicators.assign(
    clue_ids=df_indicators["clue_ids"].apply(_parse_clue_ids)
)

# Include a new column to keep track of how many clues have this indicator
df_indicators = df_indicators.assign(
    num_clues=df_indicators["clue_ids"].apply(len)
)

In [20]:
# Add indicator_word(s) and wordplay_type(s) to df_clues.
# One record per (clue_id, indicator) pair. Zipping the three columns walks
# the rows in the same order as iterrows() without building a Series per row.
indicator_clue_map_records = [
    {
        'clue_id': clue_id,
        'indicator_word': indicator,
        'wordplay_type': wordplay,
    }
    for indicator, wordplay, clue_ids in zip(
        df_indicators['indicator'],
        df_indicators['wordplay'],
        df_indicators['clue_ids'],
    )
    for clue_id in clue_ids
]

df_indicator_clue_map = pd.DataFrame(indicator_clue_map_records)

# Group by clue_id and aggregate the indicator words and wordplay types into
# lists. Both lists come from the same rows, so position i of one list
# belongs to position i of the other.
df_aggregated_indicators = (
    df_indicator_clue_map
    .groupby('clue_id')
    .agg({'indicator_word': list, 'wordplay_type': list})
    .reset_index()
)

# Merge with df_clues. Columns left over from a previous run of this cell are
# dropped first; otherwise a re-run would create indicator_word_x/_y columns
# and the next cell would no longer find 'indicator_word'.
df_clues = (
    df_clues
    .drop(columns=['indicator_word', 'wordplay_type'], errors='ignore')
    .merge(df_aggregated_indicators, on='clue_id', how='left')
)

df_clues

Unnamed: 0,clue_id,clue,answer,definition,clue_number,puzzle_date,puzzle_name,source_url,source,indicator_word,wordplay_type
0,1,"Acquisitive chap, as we see it (8)",COVETOUS,Acquisitive,1a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
1,2,Back yard fencing weak and sagging (6),DROOPY,sagging,5a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
2,3,"Stripping off uniform, love holding colonel's ...",UNCLOTHING,Stripping,8a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
3,4,Without a mark where they should be gained (4),EXAM,where they should be gained,9a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
4,5,"Put a stop to Rugby's foul school leader (5,2,...",KNOCK ON THE HEAD,Put a stop to,10a,2019-08-08,Times 27424,https://times-xwd-times.livejournal.com/218581...,times_xwd_times,,
...,...,...,...,...,...,...,...,...,...,...,...
660608,663648,"You reportedly lead attack, getting reprimand (7)",UPBRAID,reprimand,18d,2023-06-21,Daily Telegraph 30332,http://bigdave44.com/2023/06/21/dt-30332/,bigdave44,,
660609,663649,"Delay punishment given to criminal (4,3)",TIME LAG,Delay,19d,2023-06-21,Daily Telegraph 30332,http://bigdave44.com/2023/06/21/dt-30332/,bigdave44,,
660610,663650,Gather case of Sancerre will rise in value (6),ESTEEM,value,20d,2023-06-21,Daily Telegraph 30332,http://bigdave44.com/2023/06/21/dt-30332/,bigdave44,,
660611,663651,Open a place to drink in Paris? (5),UNBAR,Open,22d,2023-06-21,Daily Telegraph 30332,http://bigdave44.com/2023/06/21/dt-30332/,bigdave44,[playing],[anagram]


In [21]:
# Reshape the clue table into training rows: keep only clues that have at
# least one indicator, and only the columns the model needs.
#
# Separate clues with multiple indicators so that each row of the df is one
# indicator_word and one wordplay_type. The two list columns are exploded
# TOGETHER so that the i-th indicator keeps the i-th wordplay type. Exploding
# only 'indicator_word' and then taking wordplay_type[0] labelled every
# indicator of a clue with the wordplay type of that clue's first indicator.
# (Multi-column explode needs pandas >= 1.3 and equal-length lists per row,
# which holds when both lists come from the same groupby aggregation.)
df_clues_ind_cleaned = (
    df_clues
    .dropna(subset=['indicator_word'])
    .loc[:, ['clue_id', 'clue', 'indicator_word', 'wordplay_type']]
    .assign(clue=lambda frame: frame['clue'].astype(str))
    .explode(['indicator_word', 'wordplay_type'])
)

df_clues_ind_cleaned

Unnamed: 0,clue_id,clue,indicator_word,wordplay_type
89,90,Training device transforming Liam's tour (9),transforming,anagram
96,97,Switch posts near ground (9),ground,anagram
100,101,Destroyed a Parisian serving-girl verbally? (6),verbally,homophone
141,142,About to go back to a security organisation - ...,about to go back,reversal
144,145,Perth perv returned to dance (4),returned,reversal
...,...,...,...,...
660599,663639,Cookware from Spooner’s inquisitive admirer (6-3),breaks,anagram
660600,663640,Cub leader is dishonest about working with lea...,say,homophone
660602,663642,Smarter bit of couture: it tantalises when twi...,said,homophone
660606,663646,Strange realities figure in the Bible (9),cryptically,anagram


In [22]:
# ----------------------------------------------------
# 1) Highlight the indicator inside the clue text.
# ----------------------------------------------------
def highlight_indicators(row):
    """Return the clue text with the indicator wrapped as ``[INDICATOR]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["clue"]`` (the clue text) and
        ``row["indicator_word"]`` (the indicator word or phrase).

    Returns
    -------
    str
        The clue with every case-insensitive occurrence of the indicator
        replaced by the upper-cased indicator in square brackets. The clue is
        returned unchanged when the indicator is empty or not a string.

    Matching prefers whole words/phrases, so a short indicator such as "in"
    is not highlighted inside "Training". If no whole-word occurrence exists,
    plain substring matching is used as a fallback, so an indicator that only
    appears as part of a longer word (e.g. "return" in "returned") is still
    marked.
    """
    clue = row["clue"]
    ind = row["indicator_word"]

    # An empty pattern would match at every position; a non-string (e.g. NaN)
    # cannot be escaped. In both cases there is nothing to highlight.
    if not isinstance(ind, str) or not ind:
        return clue

    marker = f"[{ind.upper()}]"
    escaped = re.escape(ind)

    # A callable replacement inserts the marker literally; a plain string
    # would have its backslashes parsed as regex template escapes.
    def _insert_marker(_match):
        return marker

    # Lookarounds instead of \b so indicators that start or end with a
    # non-word character are still handled.
    whole_word = re.compile(rf"(?<!\w){escaped}(?!\w)", re.IGNORECASE)
    highlighted, n_hits = whole_word.subn(_insert_marker, clue)
    if n_hits:
        return highlighted

    return re.sub(escaped, _insert_marker, clue, flags=re.IGNORECASE)

def _word_ngrams(words, size):
    """Return every run of `size` consecutive words, each joined by one space."""
    return [" ".join(words[start:start + size]) for start in range(len(words) - size + 1)]


# Clue text with the indicator marked, built row by row.
df_clues_ind_cleaned["highlighted_clue"] = df_clues_ind_cleaned.apply(highlight_indicators, axis=1)

# Lower-cased, whitespace-split clue text and the n-grams derived from it.
df_clues_ind_cleaned["tokens"] = df_clues_ind_cleaned["clue"].map(lambda text: text.lower().split())
df_clues_ind_cleaned["bigrams"] = df_clues_ind_cleaned["tokens"].map(lambda words: _word_ngrams(words, 2))
df_clues_ind_cleaned["trigrams"] = df_clues_ind_cleaned["tokens"].map(lambda words: _word_ngrams(words, 3))

# Replace the index inherited from df_clues with a fresh 0..n-1 range.
df_clues_ind_cleaned = df_clues_ind_cleaned.reset_index(drop=True)
df_clues_ind_cleaned


Unnamed: 0,clue_id,clue,indicator_word,wordplay_type,highlighted_clue,tokens,bigrams,trigrams
0,90,Training device transforming Liam's tour (9),transforming,anagram,Training device [TRANSFORMING] Liam's tour (9),"[training, device, transforming, liam's, tour,...","[training device, device transforming, transfo...","[training device transforming, device transfor..."
1,97,Switch posts near ground (9),ground,anagram,Switch posts near [GROUND] (9),"[switch, posts, near, ground, (9)]","[switch posts, posts near, near ground, ground...","[switch posts near, posts near ground, near gr..."
2,101,Destroyed a Parisian serving-girl verbally? (6),verbally,homophone,Destroyed a Parisian serving-girl [VERBALLY]? (6),"[destroyed, a, parisian, serving-girl, verball...","[destroyed a, a parisian, parisian serving-gir...","[destroyed a parisian, a parisian serving-girl..."
3,142,About to go back to a security organisation - ...,about to go back,reversal,[ABOUT TO GO BACK] to a security organisation ...,"[about, to, go, back, to, a, security, organis...","[about to, to go, go back, back to, to a, a se...","[about to go, to go back, go back to, back to ..."
4,145,Perth perv returned to dance (4),returned,reversal,Perth perv [RETURNED] to dance (4),"[perth, perv, returned, to, dance, (4)]","[perth perv, perv returned, returned to, to da...","[perth perv returned, perv returned to, return..."
...,...,...,...,...,...,...,...,...
91443,663639,Cookware from Spooner’s inquisitive admirer (6-3),breaks,anagram,Cookware from Spooner’s inquisitive admirer (6-3),"[cookware, from, spooner’s, inquisitive, admir...","[cookware from, from spooner’s, spooner’s inqu...","[cookware from spooner’s, from spooner’s inqui..."
91444,663640,Cub leader is dishonest about working with lea...,say,homophone,Cub leader is dishonest about working with lea...,"[cub, leader, is, dishonest, about, working, w...","[cub leader, leader is, is dishonest, dishones...","[cub leader is, leader is dishonest, is dishon..."
91445,663642,Smarter bit of couture: it tantalises when twi...,said,homophone,Smarter bit of couture: it tantalises when twi...,"[smarter, bit, of, couture:, it, tantalises, w...","[smarter bit, bit of, of couture:, couture: it...","[smarter bit of, bit of couture:, of couture: ..."
91446,663646,Strange realities figure in the Bible (9),cryptically,anagram,Strange realities figure in the Bible (9),"[strange, realities, figure, in, the, bible, (9)]","[strange realities, realities figure, figure i...","[strange realities figure, realities figure in..."


In [23]:
# Size of the training table, then the ten most frequent indicator words.
print(df_clues_ind_cleaned.shape)
df_clues_ind_cleaned["indicator_word"].value_counts().iloc[:10]

(91448, 8)


Unnamed: 0_level_0,count
indicator_word,Unnamed: 1_level_1
about,2839
in,2704
out,909
around,828
back,827
upset,798
up,731
new,693
over,621
reportedly,571


In [26]:
def add_embeddings_for_one_model(
    df,
    model_name: str,
    output_dir: str,
    text_col_with_context: str = "highlighted_clue",
    text_col_no_context: str = "indicator_word"
):
    """
    Compute embeddings for ONE model and save the result as parquet immediately.

    Two columns are added to ``df`` (the frame is modified in place, so pass a
    copy if the original must stay untouched):

    * ``emb_{model_name}_with_context`` - embedding of ``text_col_with_context``
    * ``emb_{model_name}_no_context``   - embedding of ``text_col_no_context``

    The whole frame is then written to
    ``{output_dir}/{model_name}_indicator_embeddings.parquet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain both text columns.
    model_name : str
        Short key of the model; one of the keys of ``MODEL_NAMES`` below.
    output_dir : str
        Directory for the parquet file; created if it does not exist.
    text_col_with_context : str
        Column holding the text that includes context (default: the
        highlighted clue).
    text_col_no_context : str
        Column holding the indicator on its own.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two embedding columns added.

    Raises
    ------
    KeyError
        If ``model_name`` is not a known key or a text column is missing.
        Both are checked up front, before the output directory is created
        or any model is loaded.

    NOTE(review): the model cards of the intfloat/e5-* models ask for a
    "query: " / "passage: " prefix on every input text; this function passes
    the text unprefixed - confirm that this is intended for the E5 runs.
    """
    MODEL_NAMES = {
        "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
        "MiniLM-L12": "sentence-transformers/all-MiniLM-L12-v2",
        "MPNet": "sentence-transformers/all-mpnet-base-v2",
        "E5-base": "intfloat/e5-base-v2",
        "E5-large": "intfloat/e5-large-v2",
        "BGE-M3": "BAAI/bge-m3",
        "Multilingual-MPNet": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    }

    # Fail fast on bad arguments: no directory is created and no model is
    # loaded for a call that cannot succeed.
    if model_name not in MODEL_NAMES:
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(MODEL_NAMES)}"
        )
    missing_cols = [
        col for col in (text_col_with_context, text_col_no_context)
        if col not in df.columns
    ]
    if missing_cols:
        raise KeyError(f"DataFrame is missing text column(s): {missing_cols}")

    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Computing embeddings for: {model_name} ===")

    model = SentenceTransformer(MODEL_NAMES[model_name])

    # Column names
    col_with_ctx = f"emb_{model_name}_with_context"
    col_no_ctx   = f"emb_{model_name}_no_context"

    # ---- Encode WITH context ----
    print("Encoding WITH context...")
    emb_with = model.encode(
        df[text_col_with_context].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # ---- Encode WITHOUT context (indicator alone) ----
    print("Encoding WITHOUT context...")
    emb_no = model.encode(
        df[text_col_no_context].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # Store one 1-D numpy vector per row (a list of row vectors, not a 2-D array).
    df[col_with_ctx] = list(emb_with)
    df[col_no_ctx]   = list(emb_no)

    # ---- Save immediately ----
    out_path = f"{output_dir}/{model_name}_indicator_embeddings.parquet"
    df.to_parquet(out_path, index=False)

    print(f"Saved: {out_path}")

    return df


In [32]:
%%time

"""
| Model       | Training flavor                   |
| ----------- | --------------------------------- |
| MiniLM      | distilled sentence similarity     |
| MPNet       | masked + permuted LM              |
| E5          | retrieval / contrastive           |
| BGE-M3      | multi-task embedding              |

| Axis                        | Models                      |
| --------------------------- | --------------------------- |
| Small vs Large              | MiniLM vs E5-large          |
| General vs Retrieval        | MPNet vs E5                 |
| Monolingual vs Multilingual | MPNet vs Multilingual-MPNet |
| General vs Specialized      | MPNet vs BGE-M3             |
"""
# Short model key -> Hugging Face model id.
# NOTE: add_embeddings_for_one_model defines its own local MODEL_NAMES with
# the same entries and reads that one, not this module-level dict. A change
# made only here does not affect which model gets loaded.
MODEL_NAMES = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "MiniLM-L12": "sentence-transformers/all-MiniLM-L12-v2",
    "MPNet": "sentence-transformers/all-mpnet-base-v2",
    "E5-base": "intfloat/e5-base-v2",
    "E5-large": "intfloat/e5-large-v2",
    "BGE-M3": "BAAI/bge-m3",
    "Multilingual-MPNet": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
}

# Work on a copy: the function adds its embedding columns to the frame it is
# given, and df_clues_ind_cleaned should stay free of them.
df = df_clues_ind_cleaned.copy()
# Run once for each model in MODEL_NAMES.
# Each run starts again from the clean copy above and writes its own file,
# {DATA_DIR}/{model_name}_indicator_embeddings.parquet.
df = add_embeddings_for_one_model(
    df,
    model_name="Multilingual-MPNet",
    output_dir=DATA_DIR
)


=== Computing embeddings for: Multilingual-MPNet ===


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding WITH context...


Batches:   0%|          | 0/2858 [00:00<?, ?it/s]

Encoding WITHOUT context...


Batches:   0%|          | 0/2858 [00:00<?, ?it/s]

Saved: /content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues/data/Multilingual-MPNet_indicator_embeddings.parquet
CPU times: user 2min 56s, sys: 6.18 s, total: 3min 2s
Wall time: 3min 11s
