# EDA and Data Cleaning for `definition`-`answer` Clue Misdirection
Unsupervised Learning Component of Milestone II group project:

Exploring Wordplay and Misdirection in Cryptic Crossword Clues with Natural Language Processing

## Imports

In [1]:
# NLTK Setup - WordNet
import nltk
from nltk.corpus import wordnet as wn

try:
    wn.synsets("test")
except LookupError:
    nltk.download("wordnet", quiet=True)

In [2]:
import os
import pandas as pd
import numpy as np
import re
import unicodedata
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive (required every time)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Define and check the paths
# PROJECT_ROOT assumes the shared Milestone II folder is in your root google drive
PROJECT_ROOT = "/content/drive/MyDrive/Milestone II - NLP Cryptic Crossword Clues" # Sahana's Root Filepath
DATA_DIR = f"{PROJECT_ROOT}/data"
NOTEBOOK_DIR = f"{PROJECT_ROOT}/notebooks"

if not os.path.exists(PROJECT_ROOT):
    PROJECT_ROOT = os.path.abspath("..")  # fallback for local runs

In [5]:
# Read the relevant CSV file(s) into a DataFrame
df_clues = pd.read_csv(
    f'{DATA_DIR}/clues_raw.csv',
    usecols=['clue_id', 'clue', 'answer', 'definition'],
    index_col='clue_id')

# Uncomment to add additional tables from the dataset
#df_indicators = pd.read_csv(f'{DATA_DIR}/indicators_raw.csv')
#df_ind_by_clue = pd.read_csv(f'{DATA_DIR}/indicators_by_clue_raw.csv')
#df_ind_consolidated = pd.read_csv(f'{DATA_DIR}/indicators_consolidated_raw.csv')
#df_charades = pd.read_csv(f'{DATA_DIR}/charades_raw.csv')
#df_charades_by_clue = pd.read_csv(f'{DATA_DIR}/charades_by_clue_raw.csv')

## Helper Functions

### `view_row(clue_id)`
This helper function views an entire clue row given the clue ID. It makes sure the entire text of the clue is displayed.

In [6]:
def view_row(clue_id):
  display(pd.DataFrame(df_clues.loc[clue_id]).style.set_properties(**{"white-space": "pre-wrap"}))

### `normalize(text)`

Normalize text (any clue, answer, definition, etc) to remove all capitalization, punctuation (including dashes), and accent marks.

#### Normalization Question: Remove Dashes in Answer?

See Clue 624269. Should LA-DI-DA be normalized as:
* la di da
* la-di-da
* ladida <- current behavior

#### Whitespace not fully normalized

See Clue 270661. There are two " " in a row after removing a dash.

In [7]:
# Normalize takes a string (clue surface, indicator, definition, answer),
# And returns the same text but with punctuation (including dashes) and
# accents removed, and all lowercase.
def normalize(s: str) -> str:
  # remove accents and punctuation, convert to lowercase
  s_normalized = ''.join(
      ch for ch in unicodedata.normalize('NFD', s)
      if unicodedata.category(ch).startswith(('L', 'N', 'Zs'))
  ).lower()

  return s_normalized

## Add New Features
Compute new features that will be useful for analysis:
* `surface`: The clue text without the information about answer letter count and format.
* `surface_normalized`: The surface without capitalization, punctuation, or accent marks
* `required_answer_format`: The necessary letter count and format (spaces, dashes) according to the `clue` text.
* `answer_format`: The observed formatting and letter count parsed from the `answer`.

### `surface`

The surface text of the clue. The information about `answer` letter length is removed.

In [8]:
# Remove the answer letter count from the end of clue to get clue_surface
# Surface: remove trailing numeric parentheses in clue
df_clues['surface'] = df_clues['clue'].astype(str).apply(
    lambda x: re.sub(r'\s*\(\d+(?:[,\s-]+\d+)*\)$', '', x)
    )

In [9]:
# Uncomment to see the new feature
#df_clues.head()

### `surface_normalized`
The surface text of the clue with capitalization, punctuation, and accent marks removed.

In [10]:
# Add columns in the dataframe for normalized versions of clue, answer,
# and/or definition

# Create surface normalized - no accents, punctuation, capitalized letters
df_clues['surface_normalized'] = df_clues['surface'].astype(str).apply(normalize)

# Create definition normalized - no accents, punctuation, capitalized letters
#df_clues['definition_normalized'] = df_clues['definition'].astype(str).apply(normalize)

In [11]:
# Uncomment to see this new feature
#df_clues[['clue', 'surface', 'surface_normalized']].sample(3).style.set_properties(**{"white-space": "pre-wrap"})##

### `required_answer_format`
The required letter count and format for the `answer` as it was specified in the `clue`.

In [12]:
# The letter count(s) and format of the answer according to the clue
df_clues["required_answer_format"] = df_clues["clue"].str.extract(
    r'\(\s*([\d,\s-]+)\s*\)\s*$',
    expand=False).str.replace(" ", "", regex=False) # remove spaces

In [13]:
# Uncomment to see this new feature
#df_clues[['clue', 'required_answer_format', 'answer']].sample(3).style.set_properties(**{"white-space": "pre-wrap"})

### `answer_format`
The letter count and format of the actual `answer`.

In [14]:
# This helper function extracts the letter count and formatting of the actual
# answer in the same form as it's given in the clue, so they can be compared
def pattern_from_A(s):
    if not isinstance(s, str):
        return None

    s = s.upper().strip()

    words = s.split()  # split on spaces → comma boundaries
    word_patterns = []

    for word in words:
        parts = re.findall(r'[A-Z]+', word)  # splits on hyphens/punctuation
        if not parts:
            continue

        if '-' in word:
            word_patterns.append("-".join(str(len(p)) for p in parts))
        else:
            word_patterns.append(str(len(parts[0])))

    return ",".join(word_patterns)


# Create a column with the observed format of `answer`
df_clues["answer_format"] = df_clues["answer"].apply(pattern_from_A)

In [15]:
# Reorder columns in df_clues for conceptual coherence
col_order = ['clue', 'surface', 'surface_normalized', 'definition', 'answer', 'answer_format', 'required_answer_format']
df_clues = df_clues[col_order]

In [16]:
# Take a look at the new columns
df_clues.sample(3).style.set_properties(**{"white-space": "pre-wrap"})

Unnamed: 0_level_0,clue,surface,surface_normalized,definition,answer,answer_format,required_answer_format
clue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
590508,A wee bittie Scotch cheers – something with gin too (4),A wee bittie Scotch cheers – something with gin too,a wee bittie scotch cheers something with gin too,A wee bittie Scotch,TAIT,4,4
249506,Forbidden pasty’s finished outside (10),Forbidden pasty’s finished outside,forbidden pastys finished outside,Forbidden/Forbidden,DISALLOWED,10,10
456733,Plant is clean after treatment (7),Plant is clean after treatment,plant is clean after treatment,,SANICLE,7,7


## Accommodate Multi-Definition Clues

In "Double Definition" clues, two different definitions appear in the clue's surface. These are captured in the data when '/' appears in `definition`.

<b>About 5% of clues in this dataset have multiple definitions.</b>

See these example `clue_ids`: 402435, 168011, 287013

`clue_id` = 402435:
* `clue`: Part of a garden scheme (4)
* `answer`: PLOT
* `definition`: Part of a garden/scheme

It turns out that some clue have as many as <b>eight definitions</b>.

`clue_id` = 90342:
* `clue`: Try fun class A drug hit: lose it, gag and snap (5)
* `answer`: CRACK
* `definition`: Try/fun/class A/drug/hit/lose it/gag/snap

And sometimes a clue's entire surface can be its definition (see clue ID 373039). I think this might be called a "double definition" clue, even though there is only one definition; the clue itself "doubles" as the definition.

### `definition_list`

Create a new column, `definition_list` containing a list of unique, verified definitions; "verified" definitions must appear in the clue's `surface_normalized` once normalized. At this step we do NOT require that the definition appears at the beginning or end of the clue.

NOTE: Our process of "verifying" that definitions appear as intact whole words in the clue will exclude a definition that is part of a contraction. See `clue_id` = 446426, where the definition "castle" appears in the clue surface with 's: "Castle's storage space, over afternoon, used in party."

In [17]:
# Identify proportion of "double definitions" in the data:
# Find definition entries that contain '/'
print(f"{100 * df_clues['definition'].str.contains('/').sum()/len(df_clues):.2}%")

4.9%


In [18]:
# Create a new column containing a list of all
# valid, unique definitions

# Pre-compile whitespace cleaner
slash_splitter = re.compile(r'/+')
ws_normalizer = re.compile(r'\s+')

# Extract valid phrases in the definition of a row separated by '/'
def extract_valid_phrases(row):
    text_c = row["surface"]
    text_c_norm = row["surface_normalized"]
    text_d = row["definition"]

    if pd.isna(text_d) or pd.isna(text_c):
        return []

    # 1) Split on one-or-more slashes to handle malformed '//////' cases
    parts = slash_splitter.split(text_d)

    # 2) Normalize whitespace inside phrases, strip ends
    cleaned = []
    for p in parts:
        p = ws_normalizer.sub(" ", p).strip()
        if not p:
          continue

        # If a fragment becomes empty after normalization, it's just punctuation
        if not normalize(p):  # This avoids adding "" to our list.
          continue

        cleaned.append(p)

    # 3. Deduplicate while preserving order
    seen = set()
    unique_parts = []
    for p in parts:
        if p not in seen:
            seen.add(p)
            unique_parts.append(p)

    # 4. Keep only phrases appearing intact in C (full word/phrase match)
    # Normalize first, so it's not sensitive to capitalization or punctuation
    valid = []
    for p in unique_parts:
        p_norm = normalize(p) # Use the normalize helper function
        if not p_norm: # Make sure it's not empty
          continue

        norm_pat = re.compile(r'\b{}\b'.format(re.escape(p_norm)))

        if norm_pat.search(text_c_norm):
            valid.append(p_norm) # Change to p if you want def as it is in clue

    return valid

# Apply once across rows
df_clues["definition_list"] = df_clues.apply(extract_valid_phrases, axis=1)

In [19]:
# Uncomment to see definition_lists
#df_clues[['clue', 'surface_normalized', 'definition', 'definition_list']].sample(3).style.set_properties(**{"white-space": "pre-wrap"})


In [20]:
# Uncomment to see an example of a double definition where the entire clue is
# the definition.
#view_row(373039)

### `num_definitions`

The number of valid definitions contained in `definition_list`.

In [21]:
# Add a column to keep track of how many valid definitions each clue has
df_clues['num_definitions'] = df_clues['definition_list'].apply(len)

In [22]:
# See the distribution of how many valid definitions each clue has
df_clues['num_definitions'].value_counts().sort_index()

Unnamed: 0_level_0,count
num_definitions,Unnamed: 1_level_1
0,162154
1,474651
2,23004
3,711
4,74
5,11
6,6
8,2


In [23]:
# Uncomment to visualize how common multi-definitions are
#df_clues['num_definitions'].value_counts().sort_index().plot(kind='bar')

# Apply Requirements

Keep track of how much data we exclude with each requirement we impose in `df_clue_count`. Build the cleaned data in `df_clues_clean`.

In [24]:
# A new dataframe to clean as we apply requirements
df_clues_clean = df_clues.copy()

# A dataframe to keep track of how much data we retain at each step
df_clue_count = pd.DataFrame(data={'full_dataset': [len(df_clues)]})

In [25]:
df_clue_count.style.format('{:,}')

Unnamed: 0,full_dataset
0,660613


## No Missing Data (`clue`, `answer`, `definition`)

The original dataset contains 660,613 rows (clues). While only a few rows are missing `clue` or `answer`, almost 23% are missing a valid `definition`.


In [26]:
# The numbers of original clue, answer, and definition fields with missing data
pd.DataFrame(df_clues_clean[['clue', 'answer', 'definition']].isna().sum()).style.format('{:,}')

Unnamed: 0,0
clue,323
answer,2259
definition,149096


In [27]:
# Keep track of how many datapoints we have left
df_clue_count['no_missing_data'] = df_clues_clean[['clue', 'answer', 'definition']].dropna().shape[0]

In [28]:
df_clue_count.style.format('{:,}')

Unnamed: 0,full_dataset,no_missing_data
0,660613,510886


## `clue` must contain information about required answer format

This step also drops any rows where the clue text did not specific the format of the answer (e.g., (5), (2-2-2), (4,6)). Without this information from the `clue` text, we cannot double check that `answer` is correct.

After dropping all rows that are missing data, we are left with 510,886 clues.

NOTE: While we may not need the `clue` surface for our models, we are dropping rows with missing data in `clue` because we cannot check that `definition` is valid without a correctly parsed `clue`.

In [29]:
# How many rows have missing data for clue, answer, or definition?
# And/or do not provide a required answer format in the clue text?
df_clues_clean.isna().sum()

Unnamed: 0,0
clue,323
surface,0
surface_normalized,0
definition,149096
answer,2259
answer_format,2259
required_answer_format,33567
definition_list,0
num_definitions,0


In [30]:
# Drop rows that are missing any data (clue, answer, or definition)
# This also drops rows where the answer letter count and format cannot be
# verified because the clue did not contain that information.
df_clues_clean.dropna(inplace=True)

In [31]:
# How many rows of data are left?
df_clue_count['clue_contains_ans_format'] = df_clues_clean.shape[0]

In [32]:
df_clue_count.style.format('{:,}')

Unnamed: 0,full_dataset,no_missing_data,clue_contains_ans_format
0,660613,510886,484271


## `answer` is the correct length
The `clue` contains the answer length at the end in parentheses. Make sure that the answer has the right number/placement of characters, spaces, and dashes.

In [33]:
cols = ['clue', 'required_answer_format', 'answer_format', 'answer']
df_clues_clean[cols].sample(3).style.set_properties(**{"white-space": "pre-wrap"})

Unnamed: 0_level_0,clue,required_answer_format,answer_format,answer
clue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
635095,Plant's bitterness starting to cool off when infused with head of garlic (8),8,81,AGRIMONY A
70217,"What's SM roared right out? (5,4)",54,54,ORDER ARMS
611263,"Basic food, lolly (5)",5,5,BREAD


In [34]:
# Reduce the dataset to require answer to be in the required format
df_clues_clean = df_clues_clean[df_clues_clean['required_answer_format'] == df_clues_clean['answer_format']]

In [35]:
# How many rows are left?
df_clue_count['answer_correct_length'] = df_clues_clean.shape[0]

## `definition` appears FIRST or LAST in `clue`
We'll check to see if <b>at least one definition</b> in the `definition_list` appears intact at the beginning or end of `surface_normalized`.

We already verified that every entry of `definition_list` appears *somewhere* in the clue's `surface_normalized` when we created the list of valid definitions. One of our checks in adding definitions to the list was to make sure `normalized(definition_part)` appeared in `normalized(clue_surface)`. However, the list may contain entries that appeared somewhere in the middle of the clue.

In [36]:
# Function to determine if a definition in the list `phrases` appears at the
# START or END of the text of the normalized surface of the clue `c_norm`
def appears_at_edge(c_norm, phrases):
    if not phrases: # empty list - no definitions
        return False

    for p_norm in phrases:
        # START check (whole word/phrase)
        if c_norm.startswith(p_norm):
            if len(c_norm) == len(p_norm) or c_norm[len(p_norm)] == " ":
                return True

        # END check (whole word/phrase)
        if c_norm.endswith(p_norm):
            if len(c_norm) == len(p_norm) or c_norm[-len(p_norm)-1] == " ":
                return True

    return False

# Create a new column (boolean) that says whether ANY definition in the list
# appears at the START or END of the clue's surface_normalized
df_clues_clean["def_at_start_or_end"] = [
    appears_at_edge(cn, dl)
    for cn, dl in zip(df_clues_clean["surface_normalized"], df_clues_clean["definition_list"])
]

In [37]:
pd.DataFrame(df_clues_clean['def_at_start_or_end'].value_counts()).style.format('{:,}')

Unnamed: 0_level_0,count
def_at_start_or_end,Unnamed: 1_level_1
True,449945
False,21988


In [38]:
# Reduce the dataset to require that definition appears first or last in clue
df_clues_clean = df_clues_clean[df_clues_clean['def_at_start_or_end'] == True]

In [39]:
df_clue_count["def_verified"] = df_clues_clean.shape[0]

In [40]:
df_clue_count.style.format('{:,}')

Unnamed: 0,full_dataset,no_missing_data,clue_contains_ans_format,answer_correct_length,def_verified
0,660613,510886,484271,471933,449945


In [41]:
# Take a look at the cleaned data
df_clues_clean.sample(3).style.set_properties(**{"white-space": "pre-wrap"})

Unnamed: 0_level_0,clue,surface,surface_normalized,definition,answer,answer_format,required_answer_format,definition_list,num_definitions,def_at_start_or_end
clue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
599063,Start to read publication that includes new version (9),Start to read publication that includes new version,start to read publication that includes new version,version,RENDITION,9,9,['version'],1,True
224296,I used to own a small house in this state (5),I used to own a small house in this state,i used to own a small house in this state,this state,IDAHO,5,5,['this state'],1,True
533745,Language spooks assume ban will be revoked (6),Language spooks assume ban will be revoked,language spooks assume ban will be revoked,Language,ARABIC,6,6,['language'],1,True


# Explore WordNet
See if `answer` and any of the elements in `definition_list` are in WordNet. We won't restrict the dataset to one-word answer and definition at this point because WordNet sometimes handles short phrases.

In [42]:
# Make a new Dataframe to play around with wordnet
cols = ['surface', 'surface_normalized', 'definition_list', 'num_definitions', 'answer', 'answer_format']
df_clues_wn = df_clues_clean[cols].copy()


In [74]:
df_clues_wn.sample(10)

Unnamed: 0_level_0,surface,surface_normalized,definition_list,num_definitions,answer,answer_format
clue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
361193,Seismic activity in epicentre more evident,seismic activity in epicentre more evident,[seismic activity],1,TREMOR,6
15351,"The German holding party in Africa, one provid...",the german holding party in africa one providi...,[one providing entertainment],1,DANCER,6
168310,Youth gets one an unusual phone number,youth gets one an unusual phone number,[number],1,LA DONNA E MOBILE,2516
548064,"First results in, counting afresh — winning?",first results in counting afresh winning,[winning],1,TROUNCING,9
117592,An upper-class girl? A girl,an upperclass girl a girl,[girl],1,ANGELA,6
653719,"Marry Hitch? Perhaps, but it’s not indicative ...",marry hitch perhaps but its not indicative of ...,[its not indicative of a steady heart],1,ARRHYTHMIC,10
660173,Stories about ordinary runs with cargo essenti...,stories about ordinary runs with cargo essenti...,[heavy vehicles],1,LORRIES,7
12573,Some stock book by Latin poet,some stock book by latin poet,[some stock],1,BOVID,5
149556,Any one of three parts of lovely spot,any one of three parts of lovely spot,[spot],1,SEE,3
305187,Formula Three motor Hill’s abandoned disastrously,formula three motor hills abandoned disastrously,[formula],1,THEOREM,7


In [85]:
wn.synsets('reprimand')[2].definition()

'censure severely or angrily'

In [89]:
wn.synsets('bovid')

[Synset('bovid.n.01'), Synset('bovine.a.01')]

In [71]:
wn.synset('touch.n.06').definition()

'a slight but appreciable amount'

In [90]:
wn.synset('bovid.n.01').lemma_names()

['bovid']

In [91]:
wn.synset('bovid.n.01').hypernyms()

[Synset('ruminant.n.01')]

In [105]:
wn.synset('ruminant.n.01').hypernyms()

[Synset('even-toed_ungulate.n.01')]

In [92]:
wn.synset('bovid.n.01').hyponyms()

[Synset('wild_sheep.n.01'),
 Synset('goat.n.01'),
 Synset('old_world_buffalo.n.01'),
 Synset('bison.n.01'),
 Synset('forest_goat.n.01'),
 Synset('goat_antelope.n.01'),
 Synset('antelope.n.01'),
 Synset('sheep.n.01'),
 Synset('musk_ox.n.01'),
 Synset('bovine.n.01')]

In [102]:
wn.synsets('stock')

[Synset('stock.n.01'),
 Synset('stock.n.02'),
 Synset('stock.n.03'),
 Synset('stock_certificate.n.01'),
 Synset('store.n.02'),
 Synset('lineage.n.01'),
 Synset('breed.n.01'),
 Synset('broth.n.01'),
 Synset('stock.n.09'),
 Synset('stock.n.10'),
 Synset('stock.n.11'),
 Synset('stock.n.12'),
 Synset('malcolm_stock.n.01'),
 Synset('stock.n.14'),
 Synset('stock.n.15'),
 Synset('neckcloth.n.01'),
 Synset('livestock.n.01'),
 Synset('stock.v.01'),
 Synset('stock.v.02'),
 Synset('stock.v.03'),
 Synset('stock.v.04'),
 Synset('stock.v.05'),
 Synset('stock.v.06'),
 Synset('sprout.v.02'),
 Synset('banal.s.01'),
 Synset('stock.s.02'),
 Synset('standard.s.05')]

In [104]:
wn.synset('breed.n.01').hypernyms()

[Synset('variety.n.03'), Synset('animal_group.n.01')]

In [110]:
wn.synset('livestock.n.01').definition()

'any animals kept for use or profit'

In [113]:

# 1. Define the word senses (synsets)
bovid = wn.synset('bovid.n.01')
breed = wn.synset('breed.n.01')
livestock = wn.synset('livestock.n.01')

# 2. Calculate various similarity scores (0 = distant, 1 = identical)
breed_path_dist = bovid.path_similarity(breed)
breed_wup_dist = bovid.wup_similarity(breed)
livestock_path_dist = bovid.path_similarity(breed)
livestock_wup_dist = bovid.wup_similarity(breed)

print(f"Path Similarity: {breed_path_dist}")  # Output: ~0.2
print(f"Wu-Palmer Similarity: {breed_wup_dist}") # Output: ~0.85
print(f"Path Similarity: {livestock_path_dist}")  # Output: ~0.2
print(f"Wu-Palmer Similarity: {livestock_wup_dist}") # Output: ~0.85

Path Similarity: 0.05
Wu-Palmer Similarity: 0.09523809523809523
Path Similarity: 0.05
Wu-Palmer Similarity: 0.09523809523809523


In [46]:
# See if the answer is in WordNet as a synset
df_ind_one_word['in_wordnet'] = df_ind_one_word['indicator'].apply(
    lambda x: bool(wn.synsets(x))
    )

NameError: name 'df_ind_one_word' is not defined

## Filtering to clues with single word definitions and answers

In [None]:
df_clues.head()

In [None]:
df_clues_clean['answer_wc'] = df_clues_clean['answer'].astype(str).apply(lambda x: len(x.split()))
#df_clues['definition_wc'] = df_clues['definition'].astype(str).apply(lambda x: len(x.split()))

In [None]:
df_clues_clean.head()

In [None]:
df_one_word_def_ans = df_clues[(df_clues['answer_wc'] == 1) & (df_clues['definition_wc'] == 1)].copy()

In [None]:
df_one_word_def_ans.info()

In [None]:
df_one_word_def_ans.isna().sum()/df_one_word_def_ans.shape[0]

ISSUE: ~36% of "single word" definitions are NaN. After dropping clues with NaN values in the definition, answer or the clue, we have 234407 clues.