# NLP - Crossword Puzzles
**Tony Ghabour**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#NLP---Crossword-Puzzles" data-toc-modified-id="NLP---Crossword-Puzzles-1">NLP - Crossword Puzzles</a></span></li><li><span><a href="#Initial-Setup" data-toc-modified-id="Initial-Setup-2">Initial Setup</a></span></li><li><span><a href="#Data-Work" data-toc-modified-id="Data-Work-3">Data Work</a></span><ul class="toc-item"><li><span><a href="#Retreival" data-toc-modified-id="Retreival-3.1">Retreival</a></span></li><li><span><a href="#Store-in-MongoDB" data-toc-modified-id="Store-in-MongoDB-3.2">Store in MongoDB</a></span></li><li><span><a href="#Retreive-from-MongoDB" data-toc-modified-id="Retreive-from-MongoDB-3.3">Retreive from MongoDB</a></span></li><li><span><a href="#Processing" data-toc-modified-id="Processing-3.4">Processing</a></span></li></ul></li><li><span><a href="#Puzzle-Class/Objects" data-toc-modified-id="Puzzle-Class/Objects-4">Puzzle Class/Objects</a></span><ul class="toc-item"><li><span><a href="#Grids-(Heatmaps)" data-toc-modified-id="Grids-(Heatmaps)-4.1">Grids (Heatmaps)</a></span></li></ul></li><li><span><a href="#EDA" data-toc-modified-id="EDA-5">EDA</a></span><ul class="toc-item"><li><span><a href="#Answers" data-toc-modified-id="Answers-5.1">Answers</a></span></li><li><span><a href="#Clues" data-toc-modified-id="Clues-5.2">Clues</a></span></li></ul></li><li><span><a href="#NLP-Analysis" data-toc-modified-id="NLP-Analysis-6">NLP Analysis</a></span><ul class="toc-item"><li><span><a href="#Dimensionality-Reduction" data-toc-modified-id="Dimensionality-Reduction-6.1">Dimensionality Reduction</a></span><ul class="toc-item"><li><span><a href="#Vectorize-Corpus" data-toc-modified-id="Vectorize-Corpus-6.1.1">Vectorize Corpus</a></span></li><li><span><a href="#LSA" data-toc-modified-id="LSA-6.1.2">LSA</a></span><ul class="toc-item"><li><span><a href="#Count" data-toc-modified-id="Count-6.1.2.1">Count</a></span></li><li><span><a href="#TF-IDF" 
data-toc-modified-id="TF-IDF-6.1.2.2">TF-IDF</a></span></li></ul></li><li><span><a href="#NMF" data-toc-modified-id="NMF-6.1.3">NMF</a></span><ul class="toc-item"><li><span><a href="#Count" data-toc-modified-id="Count-6.1.3.1">Count</a></span></li><li><span><a href="#TF-IDF" data-toc-modified-id="TF-IDF-6.1.3.2">TF-IDF</a></span></li></ul></li></ul></li></ul></li><li><span><a href="#Classification" data-toc-modified-id="Classification-7">Classification</a></span></li><li><span><a href="#Future-Work" data-toc-modified-id="Future-Work-8">Future Work</a></span></li></ul></div>

# Initial Setup

In [1]:
import sys
sys.path.append('../src')

In [2]:
#Standard Libraries 
import os
import re
import ast
import xword
import json
import random 
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt

from string import digits
from datetime import datetime 
from collections import Counter 
from matplotlib.ticker import PercentFormatter

%matplotlib inline 

In [3]:
# Database management 
from pymongo import MongoClient

In [4]:
# NLP & Text Processing
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# Classification Model Tools
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# Data Work

## Retreival

In [6]:
# Parse every raw puzzle JSON found under base_path into a list of dicts.
# Files that cannot be decoded or parsed are reported and skipped.
all_puzzles = []
base_path = '../data/raw/'
puzzle_paths = xword.get_jsons(base_path)

for json_file_path in puzzle_paths:
    with open(json_file_path) as json_file:
        try:
            all_puzzles.append(json.load(json_file))
        # Only tolerate malformed/undecodable files. A bare `except:` would also
        # swallow KeyboardInterrupt and real bugs.
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f'Unable to load puzzle: {json_file_path}')

Unable to load puzzle: ../data/raw/2017/12/03.json
Unable to load puzzle: ../data/raw/2018/03/08.json


## Store in MongoDB

In [7]:
# Connect to MongoDB using pymongo's default connection settings (no host/port given).
client = MongoClient()

In [8]:
# Handle to the database named "NYT" on that client.
db = client.NYT

In [9]:
# Collection creation is a one-time setup step, kept here (commented out) for reference.
#db.create_collection("puzzle_collection") # only run once 
# Show which collections currently exist in the database.
db.list_collection_names()

['puzzle_collection']

In [10]:
# `pc` is the puzzle collection used for all queries below.
pc = db.get_collection("puzzle_collection")

In [11]:
# One-time bulk load of the parsed puzzles into the collection. Left commented out
# because re-running it would attempt to insert the same puzzles again.
#pc.insert_many(all_puzzles) # only run once 

In [12]:
# Sanity check: puzzles parsed from disk vs. documents stored in the collection.
len(all_puzzles), pc.count_documents({})

(14545, 14545)

## Retreive from MongoDB

In [13]:
# Fields to leave out of every returned document (a value of 0 excludes the field).
excluded_fields = [
    '_id', 'acrossmap', 'admin', 'autowrap', 'bbars',
    'code', 'copyright', 'rbars', 'track', 'downmap',
    'mini', 'key', 'id', 'id2', 'interpretcolors',
    'hold', 'publisher', 'uniclue', 'valid', 'type',
]
projection = dict.fromkeys(excluded_fields, 0)

# Skip documents whose 'uniclue' field is True.
filter_ = {'uniclue': {"$ne": True}}

cursor = pc.find(filter_, projection)

In [14]:
# Drain the cursor into memory and build one DataFrame row per returned document.
pc_df = pd.DataFrame(list(cursor))

In [15]:
# Column inventory with non-null counts, to see which fields are sparsely populated.
pc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14544 entries, 0 to 14543
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   answers       14544 non-null  object
 1   author        14544 non-null  object
 2   circles       346 non-null    object
 3   clues         14544 non-null  object
 4   date          14544 non-null  object
 5   dow           14544 non-null  object
 6   editor        14544 non-null  object
 7   grid          14544 non-null  object
 8   gridnums      14544 non-null  object
 9   jnotes        503 non-null    object
 10  notepad       202 non-null    object
 11  shadecircles  59 non-null     object
 12  size          14544 non-null  object
 13  title         14544 non-null  object
 14  hastitle      473 non-null    object
dtypes: object(15)
memory usage: 1.7+ MB


## Processing

In [16]:
# Create fields in dataframe for number of rows and columns in each crossword puzzle
pc_df['rows'] = pc_df['size'].apply(lambda x: x['rows'])
pc_df['cols'] = pc_df['size'].apply(lambda x: x['cols'])

# Aggregate down+accross clues and answers in separate fields
pc_df['all_clues'] = pc_df['clues'].apply(lambda x: x['across'] + x['down']) 
pc_df['all_answers'] = pc_df['answers'].apply(lambda x: x['across'] + x['down']) 

# Clean up redundant data
pc_df = pc_df.drop(['size', 
                    'clues', 
                    'answers'], axis = 1)

In [None]:
# Run xword.process_text over each puzzle's clues and answers,
# storing the results in matching clean_* columns.
for raw_col, clean_col in (('all_clues', 'clean_clues'),
                           ('all_answers', 'clean_answers')):
    pc_df[clean_col] = pc_df[raw_col].apply(xword.process_text)

In [None]:
# Persist the processed frame. The index is written as an unnamed first column
# (the EDA section drops it as 'Unnamed: 0' after reloading), and list-valued
# columns come back as strings that are re-parsed with ast.literal_eval.
pc_df.to_csv('../data/processed/cleaned_corpus.csv')

# Puzzle Class/Objects

**Specify date range over which to retrieve/inspect puzzles.**

In [None]:
# Window of puzzle dates to inspect.
# NOTE(review): whether end_date is inclusive depends on xword.date_range — confirm.
start_date = dt.date(2000, 1, 1)
end_date = dt.date(2004, 1, 1)
dt_range = xword.date_range(start_date, end_date)

In [None]:
# Fetch the puzzles for the chosen dates and pick one position at random.
# No seed is set, so a different puzzle may be selected on every run.
list_of_puzzles = xword.get_puzzles(dt_range)
puzzle_count = len(list_of_puzzles)
random_index = random.choice(range(puzzle_count))

In [None]:
# Create an empty puzzle object, then populate it from the randomly chosen puzzle.
sample_puzzle = xword.puzzle()
sample_puzzle.parse_puzzle(list_of_puzzles[random_index])

**Now we can use our class to drill down and investigate attributes of an individual puzzle or group of puzzles.**

In [None]:
# The sample puzzle's `date` and `dow` attributes, shown as a tuple.
sample_puzzle.date, sample_puzzle.dow

In [None]:
# The sample puzzle's `author` attribute.
sample_puzzle.author

In [None]:
# The sample puzzle's `editor` attribute.
sample_puzzle.editor

In [None]:
# Across clues of the sample puzzle.
sample_puzzle.clues.across

In [None]:
# Down clues of the sample puzzle.
sample_puzzle.clues.down

In [None]:
# Across answers of the sample puzzle.
sample_puzzle.answers.across

In [None]:
# Down answers of the sample puzzle.
sample_puzzle.answers.down

In [None]:
# NOTE(review): presumably renders the unfilled grid — confirm against xword.puzzle.blank.
sample_puzzle.blank()

In [None]:
# NOTE(review): presumably renders the solved grid — confirm against xword.puzzle.solution.
sample_puzzle.solution()

## Grids (Heatmaps)

In [None]:
# Heat map over the selected puzzles, restricted to Monday via the `days` argument.
# NOTE(review): what each cell's intensity encodes is defined in xword.heat_map — confirm.
xword.heat_map(list_of_puzzles, days = ['Monday'])

In [None]:
# Same heat map, restricted to Tuesday.
xword.heat_map(list_of_puzzles, days = ['Tuesday'])

In [None]:
# Same heat map, restricted to Wednesday.
xword.heat_map(list_of_puzzles, days = ['Wednesday'])

In [None]:
# Same heat map, restricted to Thursday.
xword.heat_map(list_of_puzzles, days = ['Thursday'])

In [None]:
# Same heat map, restricted to Friday.
xword.heat_map(list_of_puzzles, days = ['Friday'])

In [None]:
# Same heat map, restricted to Saturday.
xword.heat_map(list_of_puzzles, days = ['Saturday'])

In [None]:
# Same heat map, restricted to Sunday.
xword.heat_map(list_of_puzzles, days = ['Sunday'])

In [None]:
# Same heat map with days passed as the string 'all' rather than a list of day names.
xword.heat_map(list_of_puzzles, days = 'all')

# EDA

**Pull in clean data.**

In [None]:
# Reload the processed corpus and discard the index column that to_csv wrote out.
pc_df = (
    pd.read_csv('../data/processed/cleaned_corpus.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# The CSV round-trip turned these columns into strings; parse the literals back
# into Python objects.
for list_col in ('all_clues', 'all_answers'):
    pc_df[list_col] = pc_df[list_col].apply(ast.literal_eval)

# Pass the cleaned text columns through xword.reduce_col.
for clean_col in ('clean_clues', 'clean_answers'):
    pc_df[clean_col] = pc_df[clean_col].apply(xword.reduce_col)

**Total number of puzzles in corpus.**

In [None]:
# One row per puzzle, so the row count is the corpus size.
len(pc_df)

**Count number of unique authors.**

In [None]:
# Distinct values in the author column (exact string matches; no name normalisation is applied).
pc_df.author.nunique()

## Answers

**Let's take a look at the answers...specifically, statistics relating to answer lengths and novelty/uniqueness.**

In [None]:
def _answer_lengths(answers):
    """Return the length of every answer in one puzzle."""
    return [len(answer) for answer in answers]


pc_df['ans_lens'] = pc_df['all_answers'].map(_answer_lengths)

# Per-puzzle summary statistics over the answer lengths.
length_stats = {
    'min_len': min,
    'max_len': max,
    'mean_len': st.mean,
    'median_len': st.median,
}
for stat_col, stat_fn in length_stats.items():
    pc_df[stat_col] = pc_df['ans_lens'].map(stat_fn)

In [None]:
# Summary statistics for the numeric columns, including the length stats just added.
pc_df.describe()

In [None]:
# Index by puzzle date so each answer can be tied back to the date it appeared on.
answer_df = pc_df.set_index(['date'])

In [None]:
# Long-format table with one row per (date, answer) pair.
# explode() gives the same rows as apply(pd.Series) + melt without first building
# a wide frame padded with NaN.
answers_only = (
    answer_df['all_answers']
    .explode()
    .dropna()
    .reset_index()
)

# 'date' holds 'MM/DD/YYYY' strings, so sorting the raw column orders rows by
# month first rather than chronologically. Sort on the parsed date instead, so
# the first-appearance pass sees the puzzles in true date order. A stable sort
# keeps each puzzle's answers in their original order.
answers_only['_date_dt'] = pd.to_datetime(answers_only['date'], format='%m/%d/%Y')
answers_only = (
    answers_only
    .sort_values('_date_dt', kind='mergesort')
    .drop(columns='_date_dt')
)

In [None]:
# Walk the (date, answer) rows in their current order and record, for every answer
# not seen before, the date on which it first shows up.
answers = set()
new_ans_dates = []
# Iterate the two columns by name: positional `row[1]` on a label-indexed row is
# deprecated in recent pandas, and iterrows() builds a Series per row for nothing.
for date, answer in zip(answers_only['date'], answers_only['all_answers']):
    if answer not in answers:
        answers.add(answer)
        new_ans_dates.append(date)

# Number of first-time answers per date.
new_ans = Counter(new_ans_dates)

In [None]:
# Tabulate the per-date counts: one row per date, in the Counter's insertion order.
answers_df = pd.DataFrame(
    list(new_ans.items()),
    columns=['date', 'unique_ans_count'],
)

In [None]:
# Parse the 'MM/DD/YYYY' date strings into datetime.date objects...
answers_df['date_dt'] = answers_df['date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').date())
# ...and derive the weekday as an integer (date.weekday(): Monday = 0 ... Sunday = 6).
answers_df['dow_int'] = answers_df.date_dt.apply(lambda x: x.weekday()) 

In [None]:
# Average number of first-time answers per puzzle date, by weekday (0 = Monday).
# Select the numeric column explicitly: answers_df also carries non-numeric date
# columns, and pandas >= 2.0 raises TypeError when asked to average those.
plot_data = answers_df.groupby('dow_int')['unique_ans_count'].mean().reset_index()
dow_labels = ('MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN')

x = plot_data['dow_int']
y = plot_data['unique_ans_count']

plt.figure(figsize=(10,5))
plt.xticks(range(7), dow_labels)
plt.yticks(np.arange(0, 25, step=2))

plt.ylabel('Average Count', size = 14)

bars = plt.bar(x, y, color='silver')
# Highlight Sunday by weekday value rather than bar position, so a missing
# weekday cannot mis-colour a bar or raise IndexError.
for bar, dow in zip(bars, x):
    if dow == 6:
        bar.set_color('crimson')

plt.title('First Appearance by Day of Week', size = 16)

plt.savefig("../img/avg_unique_words.svg", format="svg")

## Clues

In [None]:
# Per-puzzle clue descriptors, each computed by the matching xword helper
# from the puzzle's full clue list.
clue_metrics = {
    'fillin_pct': xword.fillin_pct,
    'quotes_pct': xword.quotes_pct,
    'ques_pct': xword.ques_pct,
    'self_ref': xword.self_ref,
}
for metric_col, metric_fn in clue_metrics.items():
    pc_df[metric_col] = pc_df['all_clues'].apply(metric_fn)

In [None]:
def _parse_puzzle_date(text):
    """Turn an 'MM/DD/YYYY' string into a datetime.date."""
    return datetime.strptime(text, '%m/%d/%Y').date()


pc_df['date_dt'] = pc_df['date'].map(_parse_puzzle_date)
# date.weekday(): Monday = 0 ... Sunday = 6
pc_df['dow_int'] = pc_df['date_dt'].map(lambda day: day.weekday())

In [None]:
# Weekday averages of every numeric column (0 = Monday).
# pc_df also holds text and list columns. pandas >= 2.0 no longer drops those
# silently and raises TypeError instead, so restrict the mean to numeric columns.
plot_data = pc_df.groupby('dow_int').mean(numeric_only=True).reset_index()
plot_data

In [None]:
# Share of clues of each type, averaged by day of week.
dow_labels = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

plt.figure(figsize=(10,5))
plt.xticks(range(7), dow_labels)

# The metrics are fractions, so 1.0 corresponds to 100% on the axis.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

# Attach each label to its own line so the legend cannot drift out of sync with
# the plotting order. The series of interest is drawn in colour; the rest stay grey.
plt.plot(plot_data['dow_int'], plot_data['fillin_pct'], linestyle=':', color='grey',
         label='Fill in the Blank')
plt.plot(plot_data['dow_int'], plot_data['quotes_pct'], linestyle='-.', color='grey',
         label='Quotations')
plt.plot(plot_data['dow_int'], plot_data['self_ref'], linestyle='--', color='grey',
         label='Self-Referential')
plt.plot(plot_data['dow_int'], plot_data['ques_pct'], color='crimson',
         label='Word Play')

plt.title('Occurrence of Common Clue Types', size = 16)
plt.ylabel('Percentage of Clues', size = 14)
plt.yticks(np.arange(0, .11, step=.01))
plt.legend(loc='upper center', ncol=4)

plt.savefig("../img/clue_types.svg", format="svg")

# NLP Analysis

In [None]:
# The corpus for topic modelling: one document per puzzle, taken from the cleaned clues.
docs = pc_df['clean_clues']
docs.head()

## Dimensionality Reduction

In [None]:
# Number of components kept by both the LSA and NMF models below.
topics = 50

### Vectorize Corpus

**Count Vectorizer**

In [None]:
# Count bigrams and trigrams, with English stop words removed. A float min_df is a
# proportion: n-grams appearing in fewer than 0.02% of documents are dropped.
cv = CountVectorizer(ngram_range = (2,3), stop_words = 'english', min_df = .0002) #, max_df = .0005
# Bigram-only alternative, kept for reference:
#cv = CountVectorizer(ngram_range = (2,2), stop_words = 'english', min_df = .0002) #, max_df = .0005

In [None]:
# Document-term count matrix (sparse) for the clue corpus.
doc_word_cv = cv.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# Preview the first five documents. Densify only those rows instead of
# converting the whole sparse matrix to a dense array just to call .head().
pd.DataFrame(doc_word_cv[:5].toarray(), index=list(docs.index[:5]), columns=feature_names)

**TF-IDF Vectorizer**

In [None]:
# TF-IDF weighting over the same bigram/trigram vocabulary settings as the count vectorizer.
tf = TfidfVectorizer(ngram_range = (2,3), stop_words = 'english', min_df = .0002) #, max_df = .0005)

In [None]:
# Document-term TF-IDF matrix (sparse) for the clue corpus.
doc_word_tf = tf.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    tf.get_feature_names_out()
    if hasattr(tf, 'get_feature_names_out')
    else tf.get_feature_names()
)

# Preview the first five documents. Densify only those rows instead of
# converting the whole sparse matrix to a dense array just to call .head().
pd.DataFrame(doc_word_tf[:5].toarray(), index=list(docs.index[:5]), columns=feature_names)

### LSA

In [None]:
# LSA via truncated SVD, keeping `topics` components. The default solver is
# randomised, so pin the seed (same value the train/test split uses) to make
# the components reproducible across Restart & Run All.
lsa = TruncatedSVD(n_components=topics, random_state=420)

#### Count 

In [None]:
# Point the shared `vectorizer` name at the count vectorizer for the cells in this subsection.
vectorizer = cv

In [None]:
# Fit LSA on the count matrix and keep the per-document component scores.
doc_topic_lsa_cv = lsa.fit_transform(doc_word_cv)
# Total share of variance explained by the retained components.
sum(lsa.explained_variance_ratio_)

In [None]:
# Component-by-term loadings of the LSA model as currently fitted.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topic_word_lsa_cv = pd.DataFrame(lsa.components_.round(3),
                                 index = [f'component_{i}' for i in range(topics)],
                                 columns = feature_names)
topic_word_lsa_cv.head()

In [None]:
# Show 10 terms per LSA component (count-vectorised corpus).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
xword.display_topics(lsa, feature_names, 10)

In [None]:
# Per-document LSA component scores (count features), indexed like `docs`.
# NOTE(review): despite the `Vt` name, this frame holds the output of
# lsa.fit_transform (document x component), not the model's components_ matrix.
Vt_cv = pd.DataFrame(doc_topic_lsa_cv.round(5),
                     index = list(docs.index),
                     columns =  [f'component_{i}' for i in range(topics)])
Vt_cv.head()

#### TF-IDF 

In [None]:
# Point the shared `vectorizer` name at the TF-IDF vectorizer for the cells in this subsection.
vectorizer = tf

In [None]:
# Refit the same `lsa` object on the TF-IDF matrix. This overwrites the components
# learned from the count matrix, so `lsa` now describes the TF-IDF fit.
doc_topic_lsa_tf = lsa.fit_transform(doc_word_tf)
# Total share of variance explained by the retained components.
sum(lsa.explained_variance_ratio_)

In [None]:
# Component-by-term loadings of the LSA model as currently fitted.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topic_word_lsa_tf = pd.DataFrame(lsa.components_.round(3),
                                 index = [f'component_{i}' for i in range(topics)],
                                 columns = feature_names)
topic_word_lsa_tf.head()

In [None]:
# Show 10 terms per LSA component (TF-IDF corpus).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
xword.display_topics(lsa, feature_names, 10)

In [None]:
# Per-document LSA component scores (TF-IDF features), indexed like `docs`.
# NOTE(review): despite the `Vt` name, this frame holds the output of
# lsa.fit_transform (document x component), not the model's components_ matrix.
Vt_tf = pd.DataFrame(doc_topic_lsa_tf.round(5),
                     index = list(docs.index),
                     columns =  [f'component_{i}' for i in range(topics)])
Vt_tf.head()

### NMF

In [None]:
# Non-negative matrix factorisation with `topics` components. Seed the model
# (same value the train/test split uses) so its initialisation, and therefore
# the learned topics, are reproducible across Restart & Run All.
nmf_model = NMF(n_components=topics, random_state=420)

#### Count

In [None]:
# Point the shared `vectorizer` name at the count vectorizer for the cells in this subsection.
vectorizer = cv

In [None]:
# Fit NMF on the count matrix and keep the per-document component weights.
doc_topic_nmf_cv = nmf_model.fit_transform(doc_word_cv)

In [None]:
# Component-by-term weights of the NMF model as currently fitted.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topic_word_nmf_cv = pd.DataFrame(nmf_model.components_.round(3),
                                 index = [f'component_{i}' for i in range(topics)],
                                 columns = feature_names)
topic_word_nmf_cv.head()

In [None]:
# Show 10 terms per NMF component (count-vectorised corpus).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
xword.display_topics(nmf_model, feature_names, 10)

In [None]:
# Per-document NMF component weights (count features), indexed like `docs`.
# This is the output of nmf_model.fit_transform (document x component).
H_cv = pd.DataFrame(doc_topic_nmf_cv.round(5),
                    index = list(docs.index),
                    columns =  [f'component_{i}' for i in range(topics)])
H_cv.head()

#### TF-IDF 

In [None]:
# Point the shared `vectorizer` name at the TF-IDF vectorizer for the cells in this subsection.
vectorizer = tf

In [None]:
# Refit the same `nmf_model` on the TF-IDF matrix. This overwrites the components
# learned from the count matrix.
doc_topic_nmf_tf = nmf_model.fit_transform(doc_word_tf)

In [None]:
# Component-by-term weights of the NMF model as currently fitted.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topic_word_nmf_tf = pd.DataFrame(nmf_model.components_.round(3),
                                 index = [f'component_{i}' for i in range(topics)],
                                 columns = feature_names)
topic_word_nmf_tf.head()

In [None]:
# Show 10 terms per NMF component (TF-IDF corpus).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
xword.display_topics(nmf_model, feature_names, 10)

In [None]:
# Per-document NMF component weights (TF-IDF features), indexed like `docs`.
# This is the output of nmf_model.fit_transform (document x component).
H_tf = pd.DataFrame(doc_topic_nmf_tf.round(5),
                    index = list(docs.index),
                    columns =  [f'component_{i}' for i in range(topics)])
H_tf.head()

# Classification

In [None]:
# Multinomial logistic regression with a raised iteration cap.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument —
# confirm against the installed version before upgrading.
lr = LogisticRegression(max_iter = 1000, multi_class = 'multinomial')

In [None]:
# Widen the LSA (count) component scores with the four hand-built clue
# descriptors, matching rows on the shared index.
descriptive_cols = ['fillin_pct', 'quotes_pct', 'ques_pct', 'self_ref']
all_features = pd.merge(
    Vt_cv,
    pc_df[descriptive_cols],
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Two candidate feature sets for the same target:
X_1 = Vt_cv          # LSA component scores only
X_2 = all_features   # LSA component scores plus the clue-type descriptors
# Target: the puzzle's day of week (the `dow` column).
y = pc_df.dow

In [None]:
# Identical 70/30 split settings for both feature sets so the two models are
# evaluated on the same rows.
split_options = {'test_size': 0.3, 'random_state': 420}
model_data_1 = train_test_split(X_1, y, **split_options)
model_data_2 = train_test_split(X_2, y, **split_options)

In [None]:
# Confusion matrices for both feature sets, drawn with the 'seismic' colormap.
# NOTE(review): the same `lr` instance is passed to both calls; presumably
# xword.show_cm fits it on each split before plotting — confirm.
xword.show_cm(lr, model_data_1, colormap = 'seismic', title = "Semantic Features Only")
xword.show_cm(lr, model_data_2, colormap = 'seismic', title = "Semantic + Descriptive Features")  

In [None]:
# The same two confusion matrices, redrawn with the 'Greys' colormap.
xword.show_cm(lr, model_data_1, colormap = 'Greys', title = "Semantic Features Only")
xword.show_cm(lr, model_data_2, colormap = 'Greys', title = "Semantic + Descriptive Features")  

# Future Work

* Letter frequency
* Answer length
* Named entity recognition 
* Analysis by author
* Analysis by editor
* Changes over time 
* Refined topic analysis
* Consider using individual clues as docs
* Clustering
* Try other classifiers and ensembling (Naive Bayes?)
* LDA?