# Mudule-4 Handon-1

In [1]:
import pandas as pd
import numpy as np
import re
import os
import string
from ordered_set import OrderedSet

import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
COCA = pd.DataFrame(
    [
        ["deet", 420],
        ["deft", 1240],
        ["defer", 2237],
        ["defeat", 21940],
        ["defect", 3972],
    ],
    columns=["word", "frequency"],
)

COCA

Unnamed: 0,word,frequency
0,deet,420
1,deft,1240
2,defer,2237
3,defeat,21940
4,defect,3972


In [3]:
COCA_pop = 1001610938
COCA["P(w)"] = COCA["frequency"] / COCA_pop
COCA["rank"] = COCA["frequency"].rank(ascending=False, method="min").astype(int)

COCA

Unnamed: 0,word,frequency,P(w),rank
0,deet,420,4.193245e-07,5
1,deft,1240,1.238006e-06,4
2,defer,2237,2.233402e-06,3
3,defeat,21940,2.190471e-05,1
4,defect,3972,3.965612e-06,2


In [4]:
WIKI = pd.DataFrame(
    [
        ["deet", 124],
        ["deft", 814],
        ["defer", 1416],
        ["defeat", 121408],
    ],
    columns=["word", "frequency"],
)

WIKI

Unnamed: 0,word,frequency
0,deet,124
1,deft,814
2,defer,1416
3,defeat,121408


In [5]:
WIKI_pop = 1.9e9
WIKI["P(w)"] = WIKI["frequency"] / WIKI_pop
WIKI["rank"] = WIKI["frequency"].rank(ascending=False, method="min").astype(int)

WIKI

Unnamed: 0,word,frequency,P(w),rank
0,deet,124,6.526316e-08,4
1,deft,814,4.284211e-07,3
2,defer,1416,7.452632e-07,2
3,defeat,121408,6.389895e-05,1


#### Load IULA corpus data

In [6]:
topdir = "../../data/iula"
all_content = []
for dirpath, dirname, filename in os.walk(topdir):
    for name in filename:
        if name.endswith("plain.txt"):
            with open(os.path.join(dirpath, name)) as f:
                all_content.append(f.read())

#### Preprocessing Function

In [7]:
def preProcess(texts: list[str], stop_dict: dict) -> list[str]:
    data = [
        s.translate(str.maketrans("", "", string.punctuation + "\xa0")) for s in texts
    ]
    data = [s.lower() for s in data]
    data = [
        s.translate(str.maketrans(string.whitespace, " " * len(string.whitespace), ""))
        for s in data
    ]

    tokenized = [word_tokenize(s) for s in data]
    concatenated = np.unique(np.concatenate(tokenized))
    stem_cache = {}
    ps = PorterStemmer()
    for s in concatenated:
        stem_cache[s] = ps.stem(s)
    
    def custom_processor(s: str):
        ps = PorterStemmer()
        s = re.sub(r"[^A-Za-z]", " ", s)
        s = re.sub(r"\s+", " ", s)
        s = word_tokenize(s)
        s = list(OrderedSet(s) - stop_dict)
        s = [word for word in s if len(word) > 2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = " ".join(s)
        return s

    data = [custom_processor(s) for s in data]
    return data

stop_dict = set(stopwords.words("english"))
preprocessed = preProcess(all_content, stop_dict)
preprocessed

['earli origin autism new research caus baffl disord focus gene control develop brain mystifi scientist half centuri complex behavior encompass wide varieti symptom usual appear child turn three children unabl interpret emot state other fail recogn anger sorrow manipul intent languag skill often limit find difficult initi sustain convers also frequent exhibit intens preoccup singl subject activ gestur behavior incred debilit includ typic classroom cant dissuad bang head desk make friend overrid interest calendar suffer mental retard prognosi even wors intens therapi improv outcom mani patient imposs live independ normal iq becam involv search autism rel recent almost accid embryologist previous focus variou birth defect attend remark present scientif confer two pediatr ophthalmologist marilyn miller univers illinoi chicago kerstin stromland goteborg sweden describ surpris studi investig eye motil problem victim thalidomid morningsick drug caus epidem studi subject adult expos still wom

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
freq_iula = vectorizer.fit_transform(preprocessed)
freq_iula = pd.DataFrame(
    freq_iula.todense(), columns=vectorizer.get_feature_names_out()
).sum()
freq_iula

aaa        1
aaaaaa     1
aalborg    2
aarhu      1
aaron      3
          ..
zurich     4
zvi        1
zya        1
zygos      1
zygot      9
Length: 27051, dtype: int64

In [9]:
query = ["deet", "deft", "defer", "defect", "defeat"]
transformed_query = [
    vectorizer.inverse_transform(vectorizer.transform([q])) for q in query
]
query_freq = pd.Series(
    [
        freq_iula.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0
        for tq in transformed_query
    ],
    index=query,
)

query_freq

deet       0
deft       0
defer      5
defect    71
defeat     8
dtype: int64

In [10]:
IULA = pd.DataFrame(query_freq, columns=["frequency"])
IULA_pop = IULA["frequency"].sum()
IULA["P(w)"] = IULA["frequency"] / IULA_pop
IULA["rank"] = IULA["frequency"].rank(ascending=False).astype(int)

IULA

Unnamed: 0,frequency,P(w),rank
deet,0,0.0,4
deft,0,0.0,4
defer,5,0.059524,3
defect,71,0.845238,1
defeat,8,0.095238,2


In [11]:
norvig = pd.read_csv(
    "http://norvig.com/ngrams/count_1edit.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
)
norvig.columns = ["term", "edit"]
norvig = norvig.set_index("term")

norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


In [12]:
norvig_orig = pd.read_csv('http://norvig.com/ngrams/count_big.txt',sep='\t',encoding =
"ISO-8859-1",header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns=['term','freq']
norvig_orig.head()

Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


In [13]:
def get_count(c, norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()

In [14]:
import itertools
from multiprocessing import Pool

character_set = list(
    map("".join, itertools.product(string.ascii_lowercase, repeat=1))
) + list(map("".join, itertools.product(string.ascii_lowercase, repeat=2)))
pool = Pool(8)
freq_list = pool.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))
freq_df = pd.DataFrame([character_set, freq_list], index=["char", "freq"]).T
freq_df = freq_df.set_index("char")

freq_df

Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999
...,...
zv,1
zw,1
zx,0
zy,32


In [15]:
probs = [
    (0 / freq_df.loc["f"].values)[0],  # deet
    (norvig.loc["e| "].values / freq_df.loc["e"].values)[0],  # deft
    (norvig.loc["t|r"].values / freq_df.loc["r"].values)[0],  # defer
    (norvig.loc["e|ea"].values / freq_df.loc["ea"].values)[0],  # defeat
    (norvig.loc["e|ec"].values / freq_df.loc["ec"].values)[0],  # defect
]

COCA["P(x|w)"] = probs
COCA.head()

Unnamed: 0,word,frequency,P(w),rank,P(x|w)
0,deet,420,4.193245e-07,5,0.0
1,deft,1240,1.238006e-06,4,3e-06
2,defer,2237,2.233402e-06,3,3.6e-05
3,defeat,21940,2.190471e-05,1,0.012834
4,defect,3972,3.965612e-06,2,0.003167


In [16]:
COCA["109 P(x|w)P(w)"] = 1e9 * COCA["P(w)"] * COCA["P(x|w)"]
COCA.head()

Unnamed: 0,word,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
0,deet,420,4.193245e-07,5,0.0,0.0
1,deft,1240,1.238006e-06,4,3e-06,0.003912
2,defer,2237,2.233402e-06,3,3.6e-05,0.079366
3,defeat,21940,2.190471e-05,1,0.012834,281.124909
4,defect,3972,3.965612e-06,2,0.003167,12.558705


In [17]:
IULA['P(x|w)'] = probs
IULA['109 P(x|w)P(w)'] = 1e9 * IULA['P(w)'] * IULA['P(x|w)']

IULA.head()

Unnamed: 0,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
deet,0,0.0,4,0.0,0.0
deft,0,0.0,4,3e-06,0.0
defer,5,0.059524,3,3.6e-05,2115.24
defect,71,0.845238,1,0.012834,10847780.0
defeat,8,0.095238,2,0.003167,301609.8
