# Ingest and Embed Historical NYT Clues

## Preparing Data

In [1]:
import pandas as pd

# Load the tab-separated clue file, keeping only the four columns that the
# rest of the notebook works with.
df = pd.read_csv(
    "data/clues.tsv",
    sep="\t",
    usecols=["pubid", "year", "answer", "clue"],
)

# Report the frame size and the span of years. Rows whose year is 0 are
# left out of the minimum only; the maximum is taken over every row.
print(f"Initial dataframe shape: {df.shape}")
print(
    f"Earliest non-zero year: {df.loc[df['year'] != 0, 'year'].min()}, Latest year: {df['year'].max()}"
)

Initial dataframe shape: (7579226, 4)
Earliest non-zero year: 1913, Latest year: 2025


In [8]:
# Discard rows that lack either an answer or a clue.
df = df.dropna(subset=["answer", "clue"])
print(f"After dropping NA: {df.shape}")

# Order newest-first so the duplicate removal below retains, for each
# (clue, answer) pair, a row from the latest year in which it appears.
df = df.sort_values("year", ascending=False)
df = df.drop_duplicates(subset=["clue", "answer"], keep="first")
print(f"After dropping duplicates: {df.shape}")

# Retain only answers made up entirely of ASCII letters.
df = df.loc[df["answer"].str.fullmatch(r"[A-Za-z]+")]
print(f"After filtering non-alpha answers: {df.shape}")

# Remove answers that contain the substring "XX" (case-sensitive).
df = df.loc[~df["answer"].str.contains("XX", na=False)]
print(f"After filtering answers with 'XX': {df.shape}")

After dropping NA: (7490011, 4)
After dropping duplicates: (3689368, 4)
After filtering non-alpha answers: (3688149, 4)
After filtering answers with 'XX': (3680424, 4)


In [18]:
# Narrow to rows published by "nyt" with year >= 2024, keep a single row per
# answer (the first one in df's current row order), and order alphabetically
# by answer.
subset = (
    df.loc[lambda frame: frame["year"].ge(2024) & frame["pubid"].eq("nyt")]
    .drop_duplicates(subset=["answer"], keep="first")
    .sort_values("answer")
)
print(f"Filtered to 2024 & 2025 NYT, deduplicated on answers: {subset.shape}")

Filtered to 2024 & 2025 NYT, deduplicated on answers: (16104, 4)


In [31]:
# Draw a random sample of up to 5,000 answers to embed.
# - random_state pins the draw so Restart & Run All embeds the same rows
#   every time (an unseeded sample makes the saved output irreproducible).
# - n is capped at len(subset) because DataFrame.sample raises ValueError
#   when asked for more rows than exist (with replace=False).
sample = (
    subset.sample(n=min(5_000, len(subset)), random_state=42)
    .sort_values("answer")
    .reset_index(drop=True)
)

# Text handed to the embedding model: the clue and its answer in one string.
sample["embedding_string"] = (
    "Clue: " + sample["clue"] + " | Answer: " + sample["answer"]
)

In [32]:
# Preview the first rows of the sample, including the embedding_string column.
sample.head()

Unnamed: 0,pubid,year,answer,clue,embedding_string
0,nyt,2024,AAVE,"Dialect in the Black community, in brief","Clue: Dialect in the Black community, in brief..."
1,nyt,2024,AAVERAGES,Means of excellence?,Clue: Means of excellence? | Answer: AAVERAGES
2,nyt,2024,ABA,Org. for lawyers,Clue: Org. for lawyers | Answer: ABA
3,nyt,2024,ABBAS,"Fathers, in Hebrew","Clue: Fathers, in Hebrew | Answer: ABBAS"
4,nyt,2024,ABBEY,Place of learning in the Middle Ages,Clue: Place of learning in the Middle Ages | A...


In [40]:
from sentence_transformers import SentenceTransformer

# Load the "Qwen/Qwen3-Embedding-0.6B" model through sentence-transformers,
# explicitly placed on the CPU.
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device="cpu")

In [None]:
# Encode a single literal string and display the result — presumably a quick
# check that the model loads and runs before embedding the real data.
model.encode("embedding string")

In [None]:
# Embed the sample's embedding strings in batches of 1,000 rows, collecting
# one vector per row (in row order) into `results`.
results = []
# The range must be bounded by `sample`, the frame actually being sliced.
# Bounding it by the far larger `df` keeps looping long after `sample` is
# exhausted, handing an empty list to model.encode on every extra iteration.
for i in range(0, len(sample), 1_000):
    batch = sample.iloc[i : i + 1_000]
    embeddings = model.encode(
        batch["embedding_string"].tolist(), show_progress_bar=True
    )
    # extend() iterates the returned batch, appending each row's embedding.
    results.extend(embeddings)

In [36]:
# Attach the collected embeddings as a new column; this relies on `results`
# holding exactly one entry per row of `sample`, in the same row order.
sample["embedding_vector"] = results

In [39]:
# Preview the first rows again to confirm the embedding_vector column is populated.
sample.head()

Unnamed: 0,pubid,year,answer,clue,embedding_string,embedding_vector
0,nyt,2024,AAVE,"Dialect in the Black community, in brief","Clue: Dialect in the Black community, in brief...","[-0.0009760008, -0.08432816, -0.011223102, 0.0..."
1,nyt,2024,AAVERAGES,Means of excellence?,Clue: Means of excellence? | Answer: AAVERAGES,"[0.019850034, -0.08287808, -0.012369995, 0.019..."
2,nyt,2024,ABA,Org. for lawyers,Clue: Org. for lawyers | Answer: ABA,"[-0.0011886876, -0.10796727, -0.010138356, -0...."
3,nyt,2024,ABBAS,"Fathers, in Hebrew","Clue: Fathers, in Hebrew | Answer: ABBAS","[-0.014206971, -0.09720291, -0.008017893, 0.00..."
4,nyt,2024,ABBEY,Place of learning in the Middle Ages,Clue: Place of learning in the Middle Ages | A...,"[-0.0074684564, -0.02394928, -0.0074250177, -0..."


In [37]:
# Write the sample frame to disk as JSON; orient="records" emits a list with
# one object per row, keyed by column name.
sample.to_json("data/clues.json", orient="records")