In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# other functions we need
from ast import literal_eval

## String of pearls: index

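The output CSV has one row per puzzle. Each column is a step in the solve order, and each value is the index (0–80) of the cell that was filled in at that step. Puzzles with fewer recorded steps have empty trailing columns.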

In [2]:
# import SQL dependencies
import sqlite3

# connect to .db
# NOTE(review): the connection and cursor are never closed in the visible
# cells -- close them once no later cell needs `con` / `cur`.
con = sqlite3.connect("../data-collection/nyt-sudoku.db")
cur = con.cursor()

# load each table in full into its own DataFrame
doks_perf = pd.read_sql(sql="SELECT * from doks_performance;", con=con)
doks_easy = pd.read_sql(sql="SELECT * from doks_easy;", con=con)

In [3]:
# keep only the fill-order column, as a one-column DataFrame
doks_order = doks_perf['doks_order'].to_frame()

In [4]:
# keep only the times column, as a one-column DataFrame
doks_times = doks_perf['times'].to_frame()

In [5]:
# each entry is a Python literal stored as text; parse it and spread the
# parsed sequence across columns (one row per entry of doks_order)
garland_data = pd.DataFrame(
    [literal_eval(order) for order in doks_order['doks_order']]
)

In [6]:
# write the wide cell-index table to disk without the row index
garland_data.to_csv(path_or_buf='string_of_pearls_data.csv', index=False)

## String of pearls: section

In [7]:
def get_sudoku_section(cell_index):
    """Return the number of the 3x3 section that contains a sudoku cell.

    Cells are numbered 0-80 row by row across the 9x9 grid
    (row = index // 9, column = index % 9). Sections are numbered 0-8,
    also row by row (section = section_row * 3 + section_col).

    Parameters
    ----------
    cell_index : int or float
        Flat cell index. NaN is accepted and returned unchanged so the
        function can be applied element-wise to NaN-padded data.

    Returns
    -------
    int or float
        Zero-indexed section number, produced by the same arithmetic type
        as the input (e.g. 73 -> 6, 73.0 -> 6.0), or NaN for NaN input.

    Raises
    ------
    ValueError
        If the index is outside 0-80 or is not a whole number.
    """
    # NaN is the only value that is not equal to itself. Handle it up front
    # instead of relying on NaN slipping through the range comparisons below.
    if cell_index != cell_index:
        return cell_index

    # Check if the index is within the valid range (0 to 80)
    if cell_index < 0 or cell_index > 80:
        raise ValueError("Invalid index. Cell index should be between 0 and 80.")

    # A fractional value inside the range (e.g. 4.5) is not a real cell;
    # reject it rather than silently mapping it to a section.
    if cell_index % 1 != 0:
        raise ValueError("Invalid index. Cell index should be a whole number.")

    # Split the flat index into grid row and column in one step
    row_index, col_index = divmod(cell_index, 9)

    # Three rows / three columns per section; sections counted row by row
    return (row_index // 3) * 3 + (col_index // 3)

In [8]:
# Translate every cell index into its section number, element by element.
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated
# since pandas 2.1; NaN entries are still handed to the function as before.
garland_data__section = garland_data.apply(
    lambda step: step.map(get_sudoku_section)
)

In [9]:
# show the section-number frame (pandas abbreviates the view for large frames)
garland_data__section

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,8,7,1,6,0,0,0,0,1,1,...,,,,,,,,,,
1,1,7,6,8,7,6,0,0,0,3,...,,,,,,,,,,
2,2,6,3,3,0,1,2,2,5,5,...,,,,,,,,,,
3,0,3,1,0,0,0,0,2,2,1,...,,,,,,,,,,
4,1,2,5,3,6,7,6,3,7,8,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,2,5,6,7,3,3,3,0,4,...,6.0,6.0,,,,,,,,
96,0,0,0,8,2,1,1,1,1,2,...,6.0,3.0,3.0,6.0,3.0,4.0,4.0,7.0,7.0,7.0
97,8,2,1,1,2,0,6,3,3,3,...,,,,,,,,,,
98,2,5,8,1,1,0,3,3,4,3,...,,,,,,,,,,


In [10]:
from collections import Counter

In [11]:
# look for most common sections I start in
np.unique(
    # column 0 is the first step of every puzzle; a single column
    # converts straight to a flat array
    garland_data__section.iloc[:, 0].to_numpy(),
    # also return how often each section occurs
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([25, 23, 25,  4,  6,  5,  4,  5,  3], dtype=int64))

In [12]:
# section counts over steps 0-4 (np.unique flattens the 2-D window itself)
np.unique(
    garland_data__section.iloc[:, 0:5].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([65, 63, 74, 47, 47, 57, 56, 40, 51], dtype=int64))

In [13]:
# section counts over steps 5-9
np.unique(
    garland_data__section.iloc[:, 5:10].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([59, 77, 56, 56, 46, 53, 55, 49, 49], dtype=int64))

In [14]:
# section counts over steps 10-14
np.unique(
    garland_data__section.iloc[:, 10:15].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([66, 48, 69, 74, 56, 58, 43, 38, 48], dtype=int64))

In [15]:
# section counts over steps 15-19
np.unique(
    garland_data__section.iloc[:, 15:20].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([74, 61, 83, 56, 53, 58, 45, 32, 38], dtype=int64))

In [16]:
# section counts over steps 20-24
np.unique(
    garland_data__section.iloc[:, 20:25].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([45, 53, 53, 68, 58, 67, 57, 39, 60], dtype=int64))

In [17]:
# section counts over steps 25-29
np.unique(
    garland_data__section.iloc[:, 25:30].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([43, 34, 30, 68, 73, 70, 58, 72, 52], dtype=int64))

In [18]:
# section counts over steps 30-34
np.unique(
    garland_data__section.iloc[:, 30:35].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([28, 42, 37, 60, 76, 43, 73, 76, 65], dtype=int64))

In [19]:
# section counts over steps 35-39
np.unique(
    garland_data__section.iloc[:, 35:40].to_numpy(),
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64),
 array([33, 35, 30, 39, 52, 59, 56, 97, 99], dtype=int64))

In [20]:
# section counts over steps 40-44; NaN entries in this window are tallied too
np.unique(
    garland_data__section.iloc[:, 40:45].to_numpy(),
    return_counts=True,
)

(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8., nan]),
 array([29, 29, 31, 26, 43, 34, 63, 97, 58, 90], dtype=int64))

In [21]:
# section counts over steps 45-49; NaN entries in this window are tallied too
np.unique(
    garland_data__section.iloc[:, 45:50].to_numpy(),
    return_counts=True,
)

(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8., nan]),
 array([  2,   6,   5,   2,   3,   7,  11,  13,  12, 439], dtype=int64))

In [22]:
# look for most common sections I end in
Counter(
    # mask out the NaN entries of the row, then take what is left at the end
    steps[~np.isnan(steps)][-1]
    for steps in garland_data__section.to_numpy()
)

Counter({5.0: 11,
         6.0: 12,
         7.0: 19,
         4.0: 10,
         3.0: 8,
         8.0: 18,
         1.0: 12,
         0.0: 6,
         2.0: 4})

## String of pearls: times

In [135]:
# each entry of `times` is a Python literal stored as text; parse it and
# spread the parsed sequence across columns (one row per entry)
garland_times = pd.DataFrame(
    [literal_eval(times) for times in doks_perf['times']]
)

In [136]:
# write the wide times table to disk without the row index
garland_times.to_csv(path_or_buf='string_of_pearls_times_data.csv', index=False)

## String of pearls: values

In [23]:
# show the raw cell-index frame (the input that the section frame was mapped from)
garland_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,70,59,3,54,11,10,20,18,13,12,...,,,,,,,,,,
1,22,57,56,69,67,74,9,1,2,29,...,,,,,,,,,,
2,15,64,27,46,18,12,6,17,34,43,...,,,,,,,,,,
3,20,36,21,2,10,11,19,26,8,5,...,,,,,,,,,,
4,3,25,42,47,73,68,74,45,67,61,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,19,15,52,63,58,37,29,47,0,50,...,73.0,72.0,,,,,,,,
96,9,0,1,80,24,12,5,14,21,16,...,65.0,47.0,29.0,65.0,46.0,32.0,50.0,76.0,67.0,66.0
97,62,24,4,3,7,0,72,45,45,27,...,,,,,,,,,,
98,24,35,70,14,23,10,29,27,32,47,...,,,,,,,,,,
