Name: Yuan Chen
ID: 9082757429

In [88]:
import numpy as np
from collections import Counter

In [324]:
def count_char(filename):
    with open(filename, 'r') as f:
        count = Counter(f.read().lower())
    return count

def reorder_dict(dict):
    char_order = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
    ordered_dict = {k: dict[k] for k in char_order}
    return ordered_dict

def dict_2_array(dict):
    item_list = list(dict.values())
    return np.array(item_list)

def read_data(L, train_size):
    y = []
    N = 0
    char_count_each_language = {}
    for l in L:
        total_count = {}
        for i in range(train_size):
            N += 1
            y.append(l)
            filename = 'languageID\\' + l + str(i) + '.txt'
            char_count = count_char(filename)
            total_count = Counter(total_count) + Counter(char_count)
        total_count2 = reorder_dict(total_count)
        char_count_each_language[l] = total_count2
    y = np.array(y)
    return y, char_count_each_language
    
def get_prior(y, alpha):
    unique_lang = set(y)
    prob_prior = dict(e=0, j=0, s=0)
    for l in unique_lang:
        prob_prior[l] = round((np.count_nonzero(y == l)+alpha)/(len(y)+3*alpha), 3)
    return prob_prior

def get_class_conditional_prob(char_count_dict, alpha):
    chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
    char_prob_list = []
    n = 0
    for i in char_count_dict:
        n += char_count_dict[i]
    for c in chars:
        p = (char_count_dict[c] + alpha)/(n+27*alpha)
        char_prob_list.append(p)
    return char_prob_list

def get_likelihood(x, conditional_prob):
    p = conditional_prob
    likelihood = 0
    for idx, count in enumerate(x):
        likelihood += count*np.log(p[idx])
    return(likelihood)

def test(filename, prior, conditional_prob_en, conditional_prob_jp, conditional_prob_spa):
    L = ['e', 'j', 's']
    count_vec = count_char(filename)
    count_vec = reorder_dict(count_vec)
    x = dict_2_array(count_vec)
    likelihood_en = get_likelihood(x, conditional_prob_en)
    likelihood_jp = get_likelihood(x, conditional_prob_jp)
    likelihood_spa = get_likelihood(x, conditional_prob_spa)
    posteriors = [np.log(prior['e'])+likelihood_en, np.log(prior['j'])+likelihood_jp, np.log(prior['s'])+likelihood_spa]
    max_val = max(posteriors)
    idx = posteriors.index(max_val)
    return L[idx]

def batch_test(prior, conditional_prob_en, conditional_prob_jp, conditional_prob_spa):
    L = ['e', 'j', 's']
    y = []
    y_pred = []
    for l in L:
        for i in range(10):
            y.append(l)
            filename = 'languageID\\' + l + str(i) + '.txt'
            y_pred.append(test(filename, prior, conditional_prob_en, conditional_prob_jp, conditional_prob_spa))
    return y, y_pred
#             print(l+str(i+10))

In [325]:
L = ['e', 'j', 's']
train_size = 10
alpha = 0.5

y, char_count_each_language = read_data(L, train_size)

In [326]:
#1
prior = get_prior(y, alpha)
print(prior)

{'e': 0.333, 'j': 0.333, 's': 0.333}


In [327]:
#2
char_count_en = char_count_each_language['e']
char_prob_en = get_class_conditional_prob(char_count_en, alpha)
 \print('Conditional proability of English samples: ', char_prob_en)

Conditional proability of English samples:  [0.0601685114819098, 0.011134974392863043, 0.021509995043779945, 0.021972575582355856, 0.1053692383941847, 0.018932760614571286, 0.017478936064761277, 0.047216256401784236, 0.055410540227986124, 0.001420783082768875, 0.0037336857756484387, 0.028977366595076822, 0.020518751032545846, 0.057921691723112505, 0.06446390219725756, 0.01675202378985627, 0.0005617049396993227, 0.053824549810011564, 0.06618205848339666, 0.08012555757475633, 0.026664463902197257, 0.009284652238559392, 0.015496448042293078, 0.001156451346439782, 0.013844374690236246, 0.0006277878737815959, 0.1792499586981662]


In [328]:
#3
char_count_jp = char_count_each_language['j']
char_prob_jp = get_class_conditional_prob(char_count_jp, alpha)
print('Conditional proability of Japanese samples: ', char_prob_jp)

print()

char_count_spa = char_count_each_language['s']
char_prob_spa = get_class_conditional_prob(char_count_spa, alpha)
print('Conditional proability of Spanish samples: ', char_prob_spa)

Conditional proability of Japanese samples:  [0.1317656102589189, 0.010866906600510151, 0.005485866033054963, 0.01722631818022992, 0.06020475907613823, 0.003878542227191726, 0.014011670568503443, 0.03176211607673224, 0.09703343932352633, 0.0023411020650616725, 0.05740941332681086, 0.001432614696530277, 0.03979873510604843, 0.05671057688947902, 0.09116321324993885, 0.0008735455466648031, 0.00010482546559977637, 0.04280373178657535, 0.0421747789929767, 0.056990111464411755, 0.07061742199238269, 0.0002445927530661449, 0.01974212935462455, 3.4941821866592126e-05, 0.01415143785596981, 0.00772214263251686, 0.12344945665466997]

Conditional proability of Spanish samples:  [0.10456045141993771, 0.008232863618143134, 0.03752582405722919, 0.039745922111559924, 0.1138108599796491, 0.00860287996053159, 0.0071844839813758445, 0.0045327001942585795, 0.049859702136844375, 0.006629459467793161, 0.0002775122567913416, 0.052943171656748174, 0.02580863988159477, 0.054176559464709693, 0.07249236841293824,

In [329]:
#4
filename = 'languageID\e10.txt'
char_count_e10 = count_char(filename)
char_count_e10 = reorder_dict(char_count_e10)
count_vec = dict_2_array(char_count_e10)
prob_vec = get_class_conditional_prob(test_data, alpha)
print('bag-of-word counter vector: ', count_vec)
print('probability vector: ', prob_vec)

bag-of-word counter vector:  [164  32  53  57 311  55  51 140 140   3   6  85  64 139 182  53   3 141
 186 225  65  31  47   4  38   2 498]
probability vector:  [0.058530510585305104, 0.011563778687066359, 0.01903575876178616, 0.020458993061732787, 0.11083437110834371, 0.019747375911759475, 0.018324141611812846, 0.04999110478562533, 0.04999110478562533, 0.0012453300124533001, 0.0023127557374132716, 0.03042163316135919, 0.022949653086639387, 0.049635296210638676, 0.06493506493506493, 0.01903575876178616, 0.0012453300124533001, 0.05034691336061199, 0.06635829923501156, 0.08023483365949119, 0.023305461661626045, 0.0112079701120797, 0.016900907311866217, 0.0016011385874399574, 0.0136986301369863, 0.000889521437466643, 0.1773705746308486]


In [330]:
#5
p_en = get_likelihood(count_vec, char_prob_en)
p_jp = get_likelihood(count_vec, char_prob_jp)
p_spa = get_likelihood(count_vec, char_prob_spa)
print(p_en, p_jp, p_spa)

-7841.865447060635 -8771.433079075032 -8467.282044010557


In [331]:
#6
result = test(filename, prior, char_prob_en, char_prob_jp, char_prob_spa)
print(result)

e


In [334]:
#7
y, y_pred = batch_test(prior, char_prob_en, char_prob_jp, char_prob_spa)
print('Labels: ', y, '\npredictions', y_pred)

Labels:  ['e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's'] 
predictions ['e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 'j', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's']


In [335]:
#8
import random

filename = 'languageID\s16.txt'
f = open(filename, 'r')
chars = f.read()
s = str(chars)
s = random.sample(s, len(s))
char_count = Counter(s)
char_count = reorder_dict(char_count)
x = dict_2_array(char_count)
likelihood_en = get_likelihood(x, char_prob_en)
likelihood_jp = get_likelihood(x, char_prob_jp)
likelihood_spa = get_likelihood(x, char_prob_spa)
posteriors = [np.log(prior['e'])+likelihood_en, np.log(prior['j'])+likelihood_jp, np.log(prior['s'])+likelihood_spa]
max_val = max(posteriors)
idx = posteriors.index(max_val)
print(L[idx])

s
