In [1]:
import wordle_simulation as ws
import wordle_solver_utils as wu
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import random
from scipy.stats import ks_2samp
import numpy as np

# Starting words

## Good starting words

Let's start by finding some recommended starting words.

We will score each word by how often each of the letters appear, and then pick the top ten highest scorers.

In [2]:
words = ws.load_words()
suggested_starting_words = wu.find_options(words, 10)
suggested_starting_words

['soare',
 'arose',
 'aeros',
 'serai',
 'reais',
 'raise',
 'arise',
 'aesir',
 'aloes',
 'toeas']

In [3]:
len(words)

12947

## Bad Starting Words

What if we look for the absolute worst wordle words? To do this instead of getting the ten highest scores, let's get the ten lowest scored words

In [3]:
letter_dict = wu.make_letter_dict(words)

scores = wu.score_words(words, letter_dict)

sorted_options = [x for _, x in sorted(zip(scores, words), reverse=True)]

not_suggested_starting_words = sorted_options[-10:]

not_suggested_starting_words

['jaffa',
 'cocco',
 'gyppy',
 'zocco',
 'zoppo',
 'jinni',
 'civic',
 'qajaq',
 'immix',
 'xylyl']

## Random Starting Words

Let's also get a selection of random starting words.

In [4]:
random.seed(42) # Set seed for repeatibility
random_words = random.sample(words, 10)
random_words

['roops',
 'ranee',
 'kloof',
 'ingle',
 'roomy',
 'wages',
 'manul',
 'latke',
 'educt',
 'roary']

# Run the simulation

With the wordle solver simulation set up else where, lets try guessing a thousand random words using each of the starting options.

In [5]:
list_of_starting_words = suggested_starting_words + not_suggested_starting_words + random_words

dict_list = [ws.run_simulation(1000, word) for word in list_of_starting_words]

In [6]:
df = pd.DataFrame(dict_list, index=list_of_starting_words)

# Sort columns and fill empty
df = df.reindex(sorted(df.columns), axis=1)
df = df.fillna(0)


# Reorder columns
df = df.reindex(list(df.columns[1:]) +list(df.columns[:1]), axis=1)
df = df.rename(columns={-1 : 'Failed'})
df = df.div(1000)

# Set up column to show which category each word belongs to
category = ['good'] * 10 + ['bad'] * 10 + ['random'] * 10
df['category'] = category

df

Unnamed: 0,2,3,4,5,6,Failed,category
soare,0.015,0.158,0.345,0.239,0.144,0.099,good
arose,0.01,0.152,0.325,0.258,0.129,0.126,good
aeros,0.013,0.162,0.327,0.254,0.126,0.118,good
serai,0.017,0.16,0.353,0.227,0.13,0.113,good
reais,0.013,0.165,0.353,0.23,0.12,0.119,good
raise,0.008,0.164,0.327,0.246,0.129,0.126,good
arise,0.009,0.153,0.352,0.245,0.111,0.13,good
aesir,0.018,0.166,0.327,0.243,0.127,0.119,good
aloes,0.017,0.165,0.351,0.229,0.119,0.119,good
toeas,0.015,0.157,0.326,0.246,0.121,0.135,good


In [7]:
average_group_score = df.groupby('category').mean()
average_group_score

Unnamed: 0_level_0,2,3,4,5,6,Failed
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bad,0.0023,0.0523,0.2617,0.3326,0.1853,0.1658
good,0.0135,0.1602,0.3386,0.2417,0.1256,0.1204
random,0.0084,0.1317,0.3422,0.2659,0.1331,0.1187


In [8]:
cdfs = average_group_score.cumsum(axis=1)

In [9]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=cdfs.columns, y=cdfs.loc['good'], name='Good'))
fig.add_trace(go.Scatter(x=cdfs.columns, y=cdfs.loc['random'], name='Random'))
fig.add_trace(go.Scatter(x=cdfs.columns, y=cdfs.loc['bad'], name='Bad'))

fig.update_layout(
    title='Cumulative Distribution Functions of Mean Performance',
    xaxis_title="Number of Guesses",
    yaxis_title="Probability ff Solving",

    xaxis=dict(
        tick0 = 1,
        dtick = 1,
    )
)


fig.show()

In [10]:
def find_ks_stat_of_groups(cdfs, group1, group2):
    return (cdfs.loc[group1] - cdfs.loc[group2]).abs().max()

In [11]:
find_ks_stat_of_groups(cdfs, 'good', 'bad')

0.196

In [12]:
len(cdfs.loc['good'])

6

In [13]:
np.sqrt(16/64)*1.073

0.5365

In [14]:
wu

<module 'wordle_solver_utils' from 'd:\\PythonProjects\\WordleSolver\\wordle_solver_utils.py'>

In [6]:
words_after_removing_irt = wu.remove_words_with_forbidden_characters(words, ['i', 'r', 'r'])
len(words_after_removing_irt)

6429

In [24]:
(1/len(words_after_removing_irt))*100

0.015554518587649713

In [8]:
words_after_keeping_ae = wu.keep_words_with_necessary_characters(words_after_removing_irt, ['a', 'e'])
len(words_after_keeping_ae)

1067

In [23]:
(1/len(words_after_keeping_ae)) * 100

0.09372071227741331

In [9]:
1/len(words)

7.72379701861435e-05

In [28]:
words_after_removing_middle_a = wu.remove_words_that_match_pattern(words_after_keeping_ae, '..a..')
len(words_after_removing_middle_a)

814

In [27]:
(1 /len(words_after_removing_middle_a)) * 100

0.12285012285012285

In [30]:
words_after_keeping_e = wu.keep_words_that_match_pattern(words_after_keeping_ae, '....e')
len(words_after_keeping_e)

307

In [31]:
1/307

0.003257328990228013

In [33]:
word_dictionary = wu.make_letter_dict(words_after_keeping_e)

In [41]:
letter_df = pd.DataFrame(word_dictionary, columns=word_dictionary.keys(), index=[0])
letter_df = letter_df.reindex(sorted(letter_df.columns), axis=1)
letter_df

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,...,p,q,s,t,u,v,w,x,y,z
0,328,34,48,36,360,14,46,38,5,24,...,50,4,99,71,34,37,18,2,14,27


In [83]:
fig = px.bar(letter_df.T, color_discrete_sequence=['#6aaa64'], hover_data={
    'variable':False,
})
fig.update_layout(
    xaxis=dict(
        title='Letter'
    ),
    yaxis=dict(
        title='Letter Score',
        gridcolor='#ebebeb'
    ),
    showlegend=False,
    title='Letter score after elimination',
    paper_bgcolor='#ffffff',
    plot_bgcolor='rgba(0,0,0,0)',
        hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    

)

fig.update_traces(hovertemplate='GDP: %{x} Life Expectancy: %{y}') 

fig.show()