In [1]:
import pandas as pd
import numpy as np

# Load the word-count table: space-separated, no header row, columns named "word" and "count".
# NOTE(review): `input` shadows the Python builtin of the same name; the name is kept
# because the later cells of this notebook refer to the frame as `input`.
input = pd.read_csv(
    'hw1_word_counts_05.txt',
    sep=' ',
    header=None,
    names=["word", "count"],
)
print(input.head(8))

ModuleNotFoundError: No module named 'pandas'

In [2]:
########## Problem a: 15/14 most/least frequent 5-letter words ##########
# Order the table from most to least frequent; equal counts are ordered by word, also descending.
input = input.sort_values(by=["count", "word"], ascending=[False, False])

# The first rows of the sorted table are the most frequent words ...
print("15 most frequent 5-letters words:")
print(input.iloc[:15])

# ... and the last rows are the least frequent ones.
print("14 least frequent 5-letters words:")
print(input.iloc[-14:])

15 most frequent 5-letters words:
       word   count
5821  THREE  273077
5102  SEVEN  178842
1684  EIGHT  165764
6403  WOULD  159875
18    ABOUT  157448
5804  THEIR  145434
6320  WHICH  142146
73    AFTER  110102
1975  FIRST  109957
1947  FIFTY  106869
4158  OTHER  106052
2073  FORTY   94951
6457  YEARS   88900
5806  THERE   86502
5250  SIXTY   73086
14 least frequent 5-letters words:
       word  count
6443  YALOM      7
5872  TOCOR      7
5093  SERNA      7
4266  PAXON      7
3978  NIAID      7
2041  FOAMY      7
1842  FABRI      7
1107  CLEFT      7
977   CCAIR      7
5985  TROUP      6
4160  OTTIS      6
3554  MAPCO      6
895   CAIXA      6
712   BOSAK      6


In [3]:
# Prior probability P(W = w): each word's count as a fraction of the sum of all counts.

total = input["count"].sum()
input["prior"] = input["count"].div(total)

In [4]:
########## Problem b: Best guess for different conditions ##########

# Two five-row frames hold the evidence gathered so far, one row per letter position
# (the 'Index' column is 1-based, the row labels are 0-based):
#   * correct_evidence_set   - the letter revealed at that position, or "-" while unknown
#   * incorrect_evidence_set - the set of letters known NOT to be at that position,
#     e.g. after the wrong guesses {E, A} every row's set contains both E and A

letter_index = range(1, 6)

correct_letter = ["-"] * 5
correct_evidence_set = pd.DataFrame({'Index': letter_index, 'Letter': correct_letter})

# One independent set object per position, so each row can be updated separately.
incorrect_letter = [set() for _ in range(5)]
incorrect_evidence_set = pd.DataFrame({'Index': letter_index, 'Letter': incorrect_letter})

In [5]:
# Complementary evidence: a letter placed at one position is recorded as excluded
# at every other position.
def set_incorrect_evidences(correct_guess_i, correct_guess_l):
    """Add `correct_guess_l` to the excluded-letter set of every position except `correct_guess_i`.

    `correct_guess_i` is compared against the 1-based 'Index' column of the
    module-level `incorrect_evidence_set`; the per-row sets are mutated in place.
    """
    for _, row in incorrect_evidence_set.iterrows():
        position = row['Index']
        if position == correct_guess_i:
            continue
        # Row labels are 0-based while 'Index' is 1-based.
        incorrect_evidence_set.at[position - 1, 'Letter'].add(correct_guess_l)
    
# Correct evidences update
# Worked example: the board currently shows "D--I-"

correct_evidence_set.at[0, 'Letter'] = "D"
correct_evidence_set.at[3, 'Letter'] = "I"
#correct_evidence_set.at[1, 'Letter'] = "U"
aleady_guessed_set = {"D", "I"}

# Propagate every revealed letter into the exclusion sets of the other positions
for _, revealed_row in correct_evidence_set.iterrows():
    if revealed_row['Letter'] not in ("-", ' '):
        set_incorrect_evidences(revealed_row['Index'], revealed_row['Letter'])

# Letters guessed wrongly so far are excluded at every position
incorrect_guess_set = []
if incorrect_guess_set:
    aleady_guessed_set.update(incorrect_guess_set)
    for position in incorrect_evidence_set.index:
        excluded_here = incorrect_evidence_set.at[position, 'Letter']
        for letter in incorrect_guess_set:
            excluded_here.add(letter)


In [6]:
# Probability of evidence given word: 1 or 0
# for example, given W = "three", probability of "index 0 == t" = 1, and
# probability of "index 0 == a" = 0
# We need to consider incorrect and correct sets at the same time: 
# joint probability of P(evidence|W) = P(correct_evi|W) * P(incorrect_evi|W) = 1 or 0

def Correct_evidence_match(word, correct_evidence_set):
    """Return 1 if `word` agrees with every revealed letter, otherwise 0.

    `correct_evidence_set` is a frame whose 'Letter' column holds, per row label,
    either a revealed letter or '-' for a still-unknown position; the row label
    is used as the character offset into `word`.
    """
    for position, revealed in correct_evidence_set['Letter'].items():
        if revealed == '-':
            # Nothing is known about this position yet.
            continue
        if word[position] != revealed:
            return 0
    return 1

def Incorrect_evidence_match(word, incorrect_evidence_set):
    for i, r in incorrect_evidence_set.iterrows():
        if r['Letter']:
            if word[i] in r['Letter']:
                return 0

    return 1

def prob_evidence_given_W(word):
    """Return P(evidence | W = word): 1 if `word` passes both evidence checks, otherwise 0.

    Reads the module-level `correct_evidence_set` and `incorrect_evidence_set`.
    """
    if not Correct_evidence_match(word, correct_evidence_set):
        return 0
    return Incorrect_evidence_match(word, incorrect_evidence_set)

In [7]:
# Prior probability of the evidence, P(E) = sum over w of P(E | W = w) * P(W = w).
# This is the denominator of P(W | E).

prob_prior_evidence = 0
for _, entry in input.iterrows():
    likelihood = prob_evidence_given_W(entry["word"])
    prob_prior_evidence += likelihood * entry["prior"]

In [8]:
# P(Li|W) is an indicator: True (1) when the letter occurs in the word, False (0) otherwise

def prob_letter_given_W(letter, word):
    """Return True when `letter` occurs in `word`, False otherwise."""
    occurs = letter in word
    return occurs

In [9]:
# Goal: choose as the next guess the letter with the highest probability given the evidence.
# This table records that conditional probability for every letter, starting at zero.

letters = [chr(code) for code in range(ord("A"), ord("Z") + 1)]
prob_letter_given_evidence = pd.DataFrame({"Letters": letters, "Probability": np.zeros(len(letters))})

In [10]:
# Calculate conditional P(Li|evidence) and find the optimal guess

# Without any word consistent with the evidence the posterior is undefined;
# fail with a clear message instead of an opaque division error.
if prob_prior_evidence == 0:
    raise ValueError("No word is consistent with the current evidence: P(evidence) is 0")

# P(W = w|evidence) (Key 1) does not depend on the candidate letter, so it is
# computed once here instead of once per letter. Words with zero posterior are
# dropped: they would only ever add 0 to the sums below.
posterior_by_word = []
for w_i, w_row in input.iterrows():
    cur_word = w_row["word"]
    numerator = prob_evidence_given_W(cur_word) * w_row["prior"]
    if numerator:
        posterior_by_word.append((cur_word, numerator / prob_prior_evidence))

for letter_i, letter_row in prob_letter_given_evidence.iterrows():
    cur_letter = letter_row["Letters"]
    if cur_letter in aleady_guessed_set:
        print("Probability for: ", cur_letter," is :", 0.0)
        # Reset explicitly so a value left over from an earlier run of this cell
        # (with different evidence) cannot win the final ranking.
        prob_letter_given_evidence.at[letter_i, 'Probability'] = 0.0
        continue

    # Marginalization: P(Li|evidence) = sum over w of P(Li|W = w) * P(W = w|evidence) (Key 2).
    # Words are visited in table order, the same order as the full scan.
    prob_cur_letter_given_evidence = 0.0
    for cur_word, prob_W_given_evidence in posterior_by_word:
        prob_cur_letter_given_evidence += prob_W_given_evidence * prob_letter_given_W(cur_letter, cur_word)

    print("Probability for: ", letter_row['Letters']," is :", prob_cur_letter_given_evidence)
    prob_letter_given_evidence.at[letter_i, 'Probability'] = prob_cur_letter_given_evidence


Probability for:  A  is : 0.8206845238095238
Probability for:  B  is : 0.017113095238095236
Probability for:  C  is : 0.0
Probability for:  D  is : 0.0
Probability for:  E  is : 0.14081101190476186
Probability for:  F  is : 0.0
Probability for:  G  is : 0.0
Probability for:  H  is : 0.0
Probability for:  I  is : 0.0
Probability for:  J  is : 0.006882440476190476
Probability for:  K  is : 0.003162202380952381
Probability for:  L  is : 0.07384672619047619
Probability for:  M  is : 0.017857142857142856
Probability for:  N  is : 0.1860119047619047
Probability for:  O  is : 0.04538690476190476
Probability for:  P  is : 0.006882440476190476
Probability for:  Q  is : 0.0
Probability for:  R  is : 0.18098958333333331
Probability for:  S  is : 0.7395833333333334
Probability for:  T  is : 0.01525297619047619
Probability for:  U  is : 0.0018601190476190477
Probability for:  V  is : 0.7436755952380953
Probability for:  W  is : 0.0
Probability for:  X  is : 0.0
Probability for:  Y  is : 0.0
Probabi

In [11]:
# Rank the letters by posterior probability, highest first, and keep the top row.
ranked_letters = prob_letter_given_evidence.sort_values("Probability", ascending = False)
Best_guess = ranked_letters.iloc[:1]
print("My next guess is ", Best_guess)

My next guess is    Letters  Probability
0       A     0.820685
