# Purpose

Do some analysis for playing English Word Games, specifically Wordle

## Wordnet

From  https://wn.readthedocs.io/en/latest/guides/lexicons.html

In [1]:
# Import wordnet
import wn
wn.download("oewn:2024")


[KCached file found: /Users/saru/.wn_data/downloads/cc34c0f3eb82b482b7e4ca46bce18467dd59b243
[KSkipping oewn:2024 (Open Engish Wordnet); already added/T/tmpf_8zyqfe.xml



PosixPath('/Users/saru/.wn_data/downloads/cc34c0f3eb82b482b7e4ca46bce18467dd59b243')

In [2]:
for lex in wn.lexicons():
    print(f'{lex.id}:{lex.version}\t{lex.label}')

oewn:2024	Open Engish Wordnet


In [3]:
en = wn.Wordnet("oewn:2024")
print(f"Num Words = {len(en.words())}")


Num Words = 161705


In [4]:
# show pos(parts of speech), "n=nous"
en.words(pos="n")[0:10]

[Word('oewn--ap-hood-n'),
 Word('oewn--ap-s_Gravenhage-n'),
 Word('oewn-.22-n'),
 Word('oewn-0-n'),
 Word('oewn-1-dodecanol-n'),
 Word('oewn-1-hitter-n'),
 Word('oewn-1-n'),
 Word('oewn-10-n'),
 Word('oewn-100-n'),
 Word('oewn-1000-n')]

In [5]:
# show pos(parts of speech), "v=verb"
en.words(pos="v")[0:10]

[Word('oewn-Agenise-v'),
 Word('oewn-Agenize-v'),
 Word('oewn-Americanise-v'),
 Word('oewn-Americanize-v'),
 Word('oewn-Balkanise-v'),
 Word('oewn-Balkanize-v'),
 Word('oewn-Charleston-v'),
 Word('oewn-Christianise-v'),
 Word('oewn-Christianize-v'),
 Word('oewn-DJ-v')]

In [6]:
wa = wn.words("leaf")[0]
wa

Word('oewn-leaf-n')

In [7]:
wa.forms()

['leaf', 'leaves']

In [8]:
wa.derived_words()

[Word('oewn-leafy-a'),
 Word('oewn-leaflet-n'),
 Word('oewn-leaf-v'),
 Word('oewn-leaflet-n'),
 Word('oewn-leaf-v'),
 Word('oewn-leaf-v'),
 Word('oewn-leaflet-n')]

In [9]:
wa.pos

'n'

In [10]:
words = [wf  for w in wn.words() for wf in w.forms()]
words[0:10]

["'hood",
 "'s Gravenhage",
 "'tween",
 "'tween decks",
 '.22-caliber',
 '.22-calibre',
 '.22',
 '.22 caliber',
 '.22 calibre',
 '.38-caliber']

In [11]:
import re
pattern = "^[a-z]+$"
wfns = [w for w in words if re.match(pattern, w)]
print(f"wfns length = {len(wfns)}")

wfns[0:10]

wfns length = 76399


['a',
 'aa',
 'aah',
 'aalii',
 'aardvark',
 'aardwolf',
 'aardwolves',
 'ab',
 'aba',
 'abaca']

## Five letter words frequencies

In [12]:
fivew = {w for w in wfns if len(w) == 5}
print(f"five letters word length = {len(fivew)}")
list(fivew)[0:10]


five letters word length = 4356


['uncut',
 'hymen',
 'ponce',
 'glans',
 'click',
 'flung',
 'intro',
 'liven',
 'wrack',
 'gonia']

In [13]:

from collections import defaultdict, Counter
counter_start = Counter()
counter_all = Counter()
for w in fivew:
    counter_start.update([w[0]])
    counter_all.update(w)


In [14]:

print("Top N starting alphabet - in 5 letter words\n")
for (alpha, cnt) in counter_start.most_common(26):
    print(f"\t{alpha}\t{cnt}")


Top N starting alphabet - in 5 letter words

	s	590
	c	355
	b	333
	p	290
	a	290
	t	283
	m	230
	f	219
	d	199
	g	198
	l	196
	r	190
	h	139
	w	124
	e	119
	n	95
	o	91
	v	81
	k	75
	i	67
	u	58
	j	50
	q	35
	y	19
	z	15
	x	15


In [15]:

print("Top N alphabet - in 5 letter words\n")
for (alpha, cnt) in counter_all.most_common(26):
    print(f"\t{alpha}\t{cnt}")


Top N alphabet - in 5 letter words

	e	2225
	a	2067
	r	1544
	o	1459
	i	1412
	s	1295
	l	1279
	t	1230
	n	1104
	u	885
	c	839
	y	783
	d	747
	p	725
	m	690
	h	640
	g	552
	b	551
	k	402
	f	378
	w	316
	v	283
	x	142
	z	108
	j	72
	q	52


In [16]:
from dataclasses import dataclass
from collections import Counter
vowels = {"a", "e", "i", "o", "u"}

@dataclass
class WordStats():
    has_repeated_chars : bool
    unique_chars : set[chr]
    unique_vowels : set[chr]
    all_vowels: list[chr]

    def __init__(self, word):
        c = Counter(word)
        self.has_repeated_chars = len(c.items()) != len(word)
        self.unique_chars = set(c)
        self.unique_vowels = self.unique_chars & vowels
        self.all_vowels = [e for e in c.elements() if e in vowels]

test_words = ["abaca", "piano"]
for tw in test_words:
    ws = WordStats(tw)
    print(f"{tw}\t{ws}")

abaca	WordStats(has_repeated_chars=True, unique_chars={'c', 'b', 'a'}, unique_vowels={'a'}, all_vowels=['a', 'a', 'a'])
piano	WordStats(has_repeated_chars=False, unique_chars={'a', 'p', 'n', 'i', 'o'}, unique_vowels={'i', 'o', 'a'}, all_vowels=['i', 'a', 'o'])


In [17]:


def export_words_with_info(words, fname):
    path = f"output/{fname}-unique.tsv"
    print(f"Writing to file {path}")
    with open(path, "w") as f:
        f.write(f"word\tws.reps\tws.uniq_chars\tuniq_vowels\tdup_vowels\tpos\tss\tdefinition\n")
        for w in sorted(words):
            stats = WordStats(w)
            ss_set = en.synsets(w)
            ss = ss_set[0]
            f.write(f"{w}\treps:{stats.has_repeated_chars}\tuc:{len(stats.unique_chars)}\tuv:{len(stats.unique_vowels)}\tdv:{len(stats.all_vowels)-len(stats.unique_vowels)}\t{ss.pos}\t{len(ss_set)}\t{ss.definition()}\n")

    path = f"output/{fname}-all-senses.tsv"
    print(f"Writing to file {path}")
    with open(path, "w") as f:
        f.write(f"word\tpos\tss:n\tdefinition\n")
        for w in sorted(words):
            ss_set = en.synsets(w)
            l_ss_set= len(ss_set)
            for i,ss in enumerate(ss_set, start=1):
                f.write(f"{w}\t{ss.pos}\t{l_ss_set}:{i}\t{ss.definition()}\n")


export_words_with_info(fivew, "wordnet-5-letter-words")

Writing to file output/wordnet-5-letter-words-unique.tsv
Writing to file output/wordnet-5-letter-words-all-senses.tsv
