In [2]:
!pip install wordfreq
from wordfreq import zipf_frequency
import pandas as pd
import math


Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading locate-1.1.1-py3-none-any.whl (5.4 kB)
Installing collected packages: locate, ftfy, wordfreq
Successfully installed ftfy-6.3.1 locate-1.1.1 wordfreq-3.1.1


In [10]:
# helpers

def zipf_to_wpm(z):
    """Convert a Zipf-scale value into the notebook's "wpm" frequency unit.

    The result is ten raised to the power `z`, divided by 1000
    (presumably words per million, going by the function name).
    """
    raised = 10 ** z
    return raised / 1000.0

def get_freq_wpm(word, lang='en'):
    """Return the frequency of `word` in the unit produced by zipf_to_wpm.

    Parameters
    ----------
    word : str
        The word to look up.
    lang : str, optional
        Language code forwarded to wordfreq's `zipf_frequency`.
        Defaults to 'en', which was previously hard-coded.

    Returns
    -------
    float
        `zipf_to_wpm` applied to the Zipf value reported by wordfreq.
    """
    z = zipf_frequency(word, lang)
    # NOTE(review): wordfreq documents a Zipf value of 0 for words missing
    # from its list; that would come out of this function as 0.001 rather
    # than as a missing value - confirm whether that is intended.
    return zipf_to_wpm(z)

def bin_freq_fixed(x):
    """Label a frequency using fixed thresholds.

    Returns "high" for x >= 100, "medium" for 20 <= x < 100 and "low"
    for anything smaller. A missing value (None or NaN) is labelled
    "unknown" rather than being lumped into "low".
    """
    # NaN compares False against every threshold, so without this guard a
    # missing frequency would fall through to the final "low" return.
    if x is None or math.isnan(x):
        return "unknown"
    if x >= 100:
        return "high"
    if x >= 20:
        return "medium"
    return "low"

def bin_freq_quant(x, cutoffs=None):
    """Label a frequency relative to two cut points.

    Parameters
    ----------
    x : float
        The frequency to label.
    cutoffs : sequence of two numbers, optional
        The (lower, upper) cut points. When omitted, the module-level
        `qs` is used, so existing calls such as
        `series.apply(bin_freq_quant)` keep working as long as `qs` has
        been assigned before the call.

    Returns
    -------
    str
        "low" if x <= lower, "medium" if lower < x <= upper, "high"
        otherwise; "unknown" when x is None or NaN.
    """
    # NaN compares False against both cut points, so without this guard a
    # missing frequency would fall through to the final "high" return.
    if x is None or math.isnan(x):
        return "unknown"
    if cutoffs is None:
        # Fall back to the notebook-level cut points.
        cutoffs = qs
    lower, upper = cutoffs[0], cutoffs[1]
    if x <= lower:
        return "low"
    if x <= upper:
        return "medium"
    return "high"

In [12]:
# verbs
irregulars = ["run", "swim", "sit", "sleep", "go", "come", "sing", "shake"]
regulars = [
    "clap", "laugh", "jump", "talk", "dance", "wave",
    "smile", "cry", "crawl", "yawn", "shout", "cough",
]

# One record per verb: (regularity, verb type, verb, frequency).
# Irregular verbs are listed first, then regular ones; every verb in this
# cell carries the same "unerg" verb-type tag.
rows = []
for regularity, verbs in (("irregular", irregulars), ("regular", regulars)):
    for v in verbs:
        rows.append((regularity, "unerg", v, get_freq_wpm(v)))

df = pd.DataFrame(rows, columns=["Regularity", "VerbType", "Verb", "FreqWPM"])
df

Unnamed: 0,Regularity,VerbType,Verb,FreqWPM
0,irregular,unerg,run,309.029543
1,irregular,unerg,swim,16.218101
2,irregular,unerg,sit,79.432823
3,irregular,unerg,sleep,112.201845
4,irregular,unerg,go,1071.519305
5,irregular,unerg,come,602.559586
6,irregular,unerg,sing,34.673685
7,irregular,unerg,shake,21.379621
8,regular,unerg,clap,3.467369
9,regular,unerg,laugh,45.708819


In [13]:
# Keep an independent snapshot of the frame before any binning. A plain
# `og = df` would only bind a second name to the same object, so the
# FreqBin column written below would show up on `og` as well.
og = df.copy()
df["FreqBin"] = df["FreqWPM"].apply(bin_freq_fixed)

# Split by regularity; each subset is re-indexed from 0 so that position i
# can be looked up with .loc below.
reg = df[df.Regularity=="regular"].copy().reset_index(drop=True)
irr = df[df.Regularity=="irregular"].copy().reset_index(drop=True)

# Within each bin, line up the i-th regular verb with the i-th irregular
# verb (in their existing order). The shorter side is padded with "".
out_rows = []
for bin_level in ["high","medium","low","unknown"]:
    reg_bin = reg[reg.FreqBin==bin_level].reset_index(drop=True)
    irr_bin = irr[irr.FreqBin==bin_level].reset_index(drop=True)
    max_len = max(len(reg_bin), len(irr_bin))
    for i in range(max_len):
        reg_verb  = reg_bin.loc[i,"Verb"] if i < len(reg_bin) else ""
        irr_verb  = irr_bin.loc[i,"Verb"] if i < len(irr_bin) else ""
        reg_freq  = round(reg_bin.loc[i,"FreqWPM"], 3) if i < len(reg_bin) else ""
        irr_freq  = round(irr_bin.loc[i,"FreqWPM"], 3) if i < len(irr_bin) else ""
        out_rows.append((bin_level, reg_verb, irr_verb, reg_freq, irr_freq))

pairs = pd.DataFrame(out_rows, columns=["freqbin","reg_verb","irreg_verb","regfreq","irregfreq"])

print(pairs)

   freqbin reg_verb irreg_verb  regfreq irregfreq
0     high     talk        run  263.027    309.03
1     high               sleep            112.202
2     high                  go           1071.519
3     high                come             602.56
4   medium    laugh        sit   45.709    79.433
5   medium     jump       sing   48.978    34.674
6   medium    dance      shake   85.114     21.38
7   medium     wave              41.687          
8   medium    smile              43.652          
9   medium      cry              38.905          
10     low     clap       swim    3.467    16.218
11     low    crawl               5.888          
12     low     yawn               1.175          
13     low    shout              11.482          
14     low    cough               8.913          


In [14]:
# Work on a copy: `df2 = og` would alias the saved frame, and the FreqBin
# assignment below would then overwrite the bins on every name bound to it.
df2 = og.copy()

# Cut points at the 33rd and 66th percentiles of the non-missing
# frequencies. `qs` must be a module-level name because bin_freq_quant
# reads it when called through .apply with a single argument.
qs = df2["FreqWPM"].dropna().quantile([0.33, 0.66]).tolist()

df2["FreqBin"] = df2["FreqWPM"].apply(bin_freq_quant)

# Split by regularity; each subset is re-indexed from 0 so that position i
# can be looked up with .loc below.
reg = df2[df2.Regularity=="regular"].copy().reset_index(drop=True)
irr = df2[df2.Regularity=="irregular"].copy().reset_index(drop=True)

# Within each bin, line up the i-th regular verb with the i-th irregular
# verb (in their existing order). The shorter side is padded with "".
out_rows = []
for bin_level in ["high","medium","low","unknown"]:
    reg_bin = reg[reg.FreqBin==bin_level].reset_index(drop=True)
    irr_bin = irr[irr.FreqBin==bin_level].reset_index(drop=True)
    max_len = max(len(reg_bin), len(irr_bin))
    for i in range(max_len):
        reg_verb  = reg_bin.loc[i,"Verb"] if i < len(reg_bin) else ""
        irr_verb  = irr_bin.loc[i,"Verb"] if i < len(irr_bin) else ""
        reg_freq  = round(reg_bin.loc[i,"FreqWPM"], 3) if i < len(reg_bin) else ""
        irr_freq  = round(irr_bin.loc[i,"FreqWPM"], 3) if i < len(irr_bin) else ""
        out_rows.append((bin_level, reg_verb, irr_verb, reg_freq, irr_freq))

pairs = pd.DataFrame(out_rows, columns=["freqbin","reg_verb","irreg_verb","regfreq","irregfreq"])

print(pairs)

   freqbin reg_verb irreg_verb  regfreq irregfreq
0     high     talk        run  263.027    309.03
1     high    dance        sit   85.114    79.433
2     high               sleep            112.202
3     high                  go           1071.519
4     high                come             602.56
5   medium    laugh       sing   45.709    34.674
6   medium     jump              48.978          
7   medium     wave              41.687          
8   medium    smile              43.652          
9   medium      cry              38.905          
10     low     clap       swim    3.467    16.218
11     low    crawl      shake    5.888     21.38
12     low     yawn               1.175          
13     low    shout              11.482          
14     low    cough               8.913          
