In [18]:
%load_ext lab_black
import pandas as pd
import numpy as np
from string import ascii_uppercase, ascii_lowercase
from collections import Counter

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [80]:
df_wordle_options = pd.read_csv(
    "all_word_options.csv", header=None, names=["all_words"]
)
df_wordle_options.drop_duplicates(inplace=True)
print(f"Data read successful, found {df_wordle_options.shape[0]:,} unique words")

Data read successful, found 12,972 unique words


In [81]:
df_wordle_options.head()

Unnamed: 0,all_words
0,AAHED
1,AALII
2,AARGH
3,AARTI
4,ABACA


In [82]:
df_wordle_options = df_wordle_options[["all_words"]].join(
    pd.DataFrame([Counter(word) for word in df_wordle_options["all_words"].str.upper()])
    .reindex(list(ascii_uppercase), axis=1)
    .fillna(0)
    .astype(int)
)

In [83]:
df_wordle_options.head()

Unnamed: 0,all_words,A,B,C,D,E,F,G,H,I,...,Q,R,S,T,U,V,W,X,Y,Z
0,AAHED,2,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,AALII,2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,AARGH,2,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,AARTI,2,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4,ABACA,3,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
df_counts_once = df_wordle_options.replace([2, 3, 4, 5], 1)
df_counts_once.head()

Unnamed: 0,all_words,A,B,C,D,E,F,G,H,I,...,Q,R,S,T,U,V,W,X,Y,Z
0,AAHED,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,AALII,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,AARGH,1,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,AARTI,1,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4,ABACA,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
summary_df_counts_all = (
    df_wordle_options[list(ascii_uppercase)].sum().to_frame(name="count_of_letter_all")
)
summary_df_counts_all["pct_count_all"] = (
    summary_df_counts_all["count_of_letter_all"]
    / summary_df_counts_all["count_of_letter_all"].sum()
)
summary_df_counts_all.reset_index(inplace=True)
summary_df_counts_all.columns = ["letter", "count_of_letter_all", "pct_count_all"]
summary_df_counts_all

Unnamed: 0,letter,count_of_letter_all,pct_count_all
0,A,5990,0.092353
1,B,1627,0.025085
2,C,2028,0.031267
3,D,2453,0.03782
4,E,6662,0.102714
5,F,1115,0.017191
6,G,1644,0.025347
7,H,1760,0.027135
8,I,3759,0.057956
9,J,291,0.004487


In [96]:
summary_df_counts_once = (
    df_counts_once[list(ascii_uppercase)].sum().to_frame(name="count_of_letter_once")
)
summary_df_counts_once["pct_count_once"] = (
    summary_df_counts_once["count_of_letter_once"]
    / summary_df_counts_once["count_of_letter_once"].sum()
)
summary_df_counts_once.reset_index(inplace=True)
summary_df_counts_once.columns = ["letter", "count_of_letter_once", "pct_count_once"]
summary_df_counts_once

Unnamed: 0,letter,count_of_letter_once,pct_count_once
0,A,5330,0.089226
1,B,1519,0.025429
2,C,1920,0.032141
3,D,2298,0.038469
4,E,5705,0.095504
5,F,990,0.016573
6,G,1543,0.02583
7,H,1708,0.028592
8,I,3589,0.060081
9,J,289,0.004838


In [97]:
df_letter_scores = pd.merge(
    summary_df_counts_all, summary_df_counts_once, how="inner", on="letter"
)
df_letter_scores

Unnamed: 0,letter,count_of_letter_all,pct_count_all,count_of_letter_once,pct_count_once
0,A,5990,0.092353,5330,0.089226
1,B,1627,0.025085,1519,0.025429
2,C,2028,0.031267,1920,0.032141
3,D,2453,0.03782,2298,0.038469
4,E,6662,0.102714,5705,0.095504
5,F,1115,0.017191,990,0.016573
6,G,1644,0.025347,1543,0.02583
7,H,1760,0.027135,1708,0.028592
8,I,3759,0.057956,3589,0.060081
9,J,291,0.004487,289,0.004838


In [87]:
summary_df_counts_all.sort_values(by="pct_count", ascending=False)

Unnamed: 0,count_of_letter,pct_count
S,6665,0.10276
E,6662,0.102714
A,5990,0.092353
O,4438,0.068424
R,4158,0.064107
I,3759,0.057956
L,3371,0.051973
T,3295,0.050802
N,2952,0.045513
U,2511,0.038714


In [88]:
summary_df_counts_once.sort_values(by="pct_count", ascending=False)

Unnamed: 0,count_of_letter,pct_count
S,5936,0.099371
E,5705,0.095504
A,5330,0.089226
O,3911,0.065471
R,3909,0.065438
I,3589,0.060081
L,3114,0.052129
T,3033,0.050773
N,2787,0.046655
U,2436,0.040779


In [11]:
df_wordle_options[df_wordle_options["all_words"] == "AROSE"]

Unnamed: 0,all_words,A,B,C,D,E,F,G,H,I,...,Q,R,S,T,U,V,W,X,Y,Z
549,AROSE,1,0,0,0,1,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [74]:
df_word_frequency = pd.read_csv(
    "word-frequency.csv",
    header=None,
    delimiter=" ",
    names=["word_from_wikipedia", "word_count"],
)
print(f"Pulled in {df_word_frequency.shape[0]:,} words")
df_word_frequency.head()

Pulled in 2,184,780 words


Unnamed: 0,word_from_wikipedia,word_count
0,the,151983633
1,of,71874676
2,and,62210193
3,in,62004799
4,to,43364193


In [75]:
total_word_count = df_word_frequency["word_count"].sum()
df_word_frequency["word_frequency"] = df_word_frequency["word_count"] / total_word_count
df_word_frequency.head(20)

Unnamed: 0,word_from_wikipedia,word_count,word_frequency
0,the,151983633,0.078054
1,of,71874676,0.036913
2,and,62210193,0.031949
3,in,62004799,0.031844
4,to,43364193,0.022271
5,was,26559707,0.01364
6,is,20923272,0.010746
7,for,17970059,0.009229
8,on,17582287,0.00903
9,as,17397452,0.008935


In [76]:
df_word_frequency = df_word_frequency[
    df_word_frequency["word_from_wikipedia"].str.len() == 5
].copy()
print(f"Narrowed down to {df_word_frequency.shape[0]:,} words")

Narrowed down to 176,070 words


In [77]:
df_word_frequency["word_from_wikipedia"] = df_word_frequency[
    "word_from_wikipedia"
].apply(lambda x: x.upper() if x.isalpha() else 0)
# df.col1.apply(lambda x : x.isalnum())

In [78]:
df_final = pd.merge(
    df_wordle_options,
    df_word_frequency,
    how="left",
    left_on="all_words",
    right_on="word_from_wikipedia",
)

In [79]:
df_final.sort_values(by="word_frequency", ascending=False)

Unnamed: 0,all_words,A,B,C,D,E,F,G,H,I,...,T,U,V,W,X,Y,Z,word_from_wikipedia,word_count,word_frequency
12465,WHICH,0,0,1,0,0,0,0,2,1,...,0,0,0,1,0,0,0,WHICH,6412646.0,0.003293
3816,FIRST,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,FIRST,4840311.0,0.002486
11313,THEIR,0,0,0,0,1,0,0,1,1,...,1,0,0,0,0,0,0,THEIR,4339413.0,0.002229
155,AFTER,1,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,AFTER,4204053.0,0.002159
7819,OTHER,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,OTHER,3004434.0,0.001543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,ZORIL,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,,,
12966,ZORIS,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,,,
12975,ZURFS,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,,,
12977,ZYGAL,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,,,


In [98]:
sample_list = [
    ["VERTU"],
    ["VEXER"],
    ["VEZIR"],
    ["VILER"],
    ["VINER"],
    ["VIPER"],
    ["VIRED"],
    ["VIVER"],
    ["WEBER"],
    ["WEIRD"],
    ["WIDER"],
    ["WIPER"],
    ["WIRED"],
    ["WIRER"],
    ["WIVER"],
    ["XERIC"],
    ["YDRED"],
]

In [103]:
sample_df = pd.DataFrame(sample_list, columns=["potential_guesses"])

In [102]:
sample_df

Unnamed: 0,guesses
0,VERTU
1,VEXER
2,VEZIR
3,VILER
4,VINER
5,VIPER
6,VIRED
7,VIVER
8,WEBER
9,WEIRD


In [106]:
df_final.head()

Unnamed: 0,all_words,A,B,C,D,E,F,G,H,I,...,T,U,V,W,X,Y,Z,word_from_wikipedia,word_count,word_frequency
0,AAHED,2,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,,,
1,AALII,2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,AALII,27.0,1.38664e-08
2,AARGH,2,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,AARGH,27.0,1.38664e-08
3,AARTI,2,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,AARTI,931.0,4.78134e-07
4,ABACA,3,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,ABACA,325.0,1.669104e-07
