### 2-1

In [1]:
import numpy as np
import os


"""
    - classes: 
        class space, in this problem is e, j, and s
    - doc_numbers:
        the numbers of docs you want to use to train the model
    - alpha:
        the hyperparameter for additive smoothing
    - char_count:
        the total number of characters
    - char:
        the list of possible characters
"""

data_dir = "languageID"
classes = ["e", "j", "s"]
doc_numbers = [10, 10, 10]
alpha = 0.5
char_count = 27
char_list = list("abcdefghijklmnopqrstuvwxyz ")

docs = []
labels = []
for i, c in enumerate(classes):
    for j in range(doc_numbers[i]):
        with open(os.path.join(data_dir, f"{c}{j}")+".txt") as f:
            docs.append(f.read())
            labels.append(i)

In [2]:
def get_chars(labels,doc,char_count=27,classes=["e", "j", "s"],vocab=char_list):
    char_counts = np.zeros((char_count, len(classes)))
    for i, doc in enumerate(docs):
        label = labels[i]
        for char in doc:
            if char in vocab:
                char_idx = vocab.index(char)
                char_counts[char_idx,label] += 1
    return char_counts

char_counts = get_chars(labels,docs)

In [3]:
        
        
def count_doc_chars(doc,char_list = char_list):
    doc_chars = np.ones((char_count,len(doc)))
    for i in range(len(doc)):
        for char in doc[i]:
            if char in char_list:
                char_idx = char_list.index(char)
                doc_chars[char_idx,i] += 1
    return doc_chars

In [4]:
def calc_priors(doc_numbers,alpha=0.5,classes=["e", "j", "s"]):
    prior = np.zeros(len(classes))
    for i in range(0,len(classes)):
        prior[i] = (doc_numbers[i]+alpha)/(sum(doc_numbers)+alpha*len(doc_numbers))
    return prior

prior = calc_priors(doc_numbers)

In [5]:
def calc_cond_prob(char_counts,alpha=0.5,char_count=27,classes=["e", "j", "s"]):
    conditional_prob = np.zeros((char_count,len(classes)))
    for i in range(0,char_count):
        for j in range(0,len(classes)):
            conditional_prob[i,j] = (char_counts[i,j]+alpha)/(sum(char_counts[:,j])+alpha*char_count)
    return conditional_prob

conditional_prob = calc_cond_prob(char_counts)

In [6]:
def predict(doc,prior,x_char_counts,conditional_prob,vocab=char_list,classes=["e", "j", "s"]):
    log_prob = np.log(prior)
    for string in doc:
        for char in string:
            if char in char_list:
                char_idx = char_list.index(char)
                for j in range(len(classes)):
                    log_prob[j] += x_char_counts[char_idx]*np.log(conditional_prob[char_idx][j])
    print(log_prob)
    return np.argmax(log_prob)

In [7]:
prior

array([0.33333333, 0.33333333, 0.33333333])

### 2-2

In [8]:
conditional_prob[:,0]

array([0.06016851, 0.01113497, 0.02151   , 0.02197258, 0.10536924,
       0.01893276, 0.01747894, 0.04721626, 0.05541054, 0.00142078,
       0.00373369, 0.02897737, 0.02051875, 0.05792169, 0.0644639 ,
       0.01675202, 0.0005617 , 0.05382455, 0.06618206, 0.08012556,
       0.02666446, 0.00928465, 0.01549645, 0.00115645, 0.01384437,
       0.00062779, 0.17924996])

### 2-3

In [9]:
conditional_prob[:,1]

array([1.31765610e-01, 1.08669066e-02, 5.48586603e-03, 1.72263182e-02,
       6.02047591e-02, 3.87854223e-03, 1.40116706e-02, 3.17621161e-02,
       9.70334393e-02, 2.34110207e-03, 5.74094133e-02, 1.43261470e-03,
       3.97987351e-02, 5.67105769e-02, 9.11632132e-02, 8.73545547e-04,
       1.04825466e-04, 4.28037318e-02, 4.21747790e-02, 5.69901115e-02,
       7.06174220e-02, 2.44592753e-04, 1.97421294e-02, 3.49418219e-05,
       1.41514379e-02, 7.72214263e-03, 1.23449457e-01])

In [10]:
conditional_prob[:,2]

array([1.04560451e-01, 8.23286362e-03, 3.75258241e-02, 3.97459221e-02,
       1.13810860e-01, 8.60287996e-03, 7.18448398e-03, 4.53270019e-03,
       4.98597021e-02, 6.62945947e-03, 2.77512257e-04, 5.29431717e-02,
       2.58086399e-02, 5.41765595e-02, 7.24923684e-02, 2.42669051e-02,
       7.67783910e-03, 5.92951189e-02, 6.57704049e-02, 3.56140730e-02,
       3.37023219e-02, 5.88942678e-03, 9.25040856e-05, 2.49761031e-03,
       7.86284728e-03, 2.68261848e-03, 1.68264932e-01])

### 2-4

In [11]:
x_doc = []
x_label = []
with open("languageID/e10.txt") as f:
    x_doc.append(f.read())
    x_label.append(0)

x_char_counts = count_doc_chars(x_doc)
x_char_counts

array([[165.],
       [ 33.],
       [ 54.],
       [ 58.],
       [312.],
       [ 56.],
       [ 52.],
       [141.],
       [141.],
       [  4.],
       [  7.],
       [ 86.],
       [ 65.],
       [140.],
       [183.],
       [ 54.],
       [  4.],
       [142.],
       [187.],
       [226.],
       [ 66.],
       [ 32.],
       [ 48.],
       [  5.],
       [ 39.],
       [  3.],
       [499.]])

In [12]:
### English ###
log_p_e = 0
for i in range(0,char_count):
    log_p_e += x_char_counts[i]*np.log(conditional_prob[i,0])
log_p_e

array([-7950.54651866])

In [13]:
### Janpanese
log_p_j = 0
for i in range(0,char_count):
    log_p_j += x_char_counts[i]*np.log(conditional_prob[i,1])
log_p_j

array([-8891.36768957])

In [14]:
### Spanish
log_p_s = 0
for i in range(0,char_count):
    log_p_s += x_char_counts[i]-np.log(conditional_prob[i,2])
log_p_s

array([2914.99064725])

### 2-6

In [15]:
predict(x_doc,prior,x_char_counts,conditional_prob)

[-1408584.43665842 -1594276.14538999 -1494120.32058031]


0

### 2-7

In [16]:
from sklearn.metrics import confusion_matrix

docs = []
labels = []
for i, c in enumerate(classes):
    for j in range(10,20):
        with open(os.path.join(data_dir, f"{c}{j}")+".txt") as f:
            docs.append(f.read())
            labels.append(i)

In [17]:
x_char_counts = count_doc_chars(docs)

In [18]:
labels_pred = np.zeros(x_char_counts.shape[1])
for i in range(0,x_char_counts.shape[1]):
    labels_pred[i] = predict(docs,prior,x_char_counts[:,i],conditional_prob)
labels_pred

[-24225941.71205364 -26515252.96421054 -24970452.09705942]
[-29184408.44117882 -31884096.56739806 -30001339.77708399]
[-16880503.9431489  -18346187.56552958 -17433280.18013849]
[-14885331.45216399 -16128841.13596447 -15471477.32456662]
[-14855777.48244123 -16184920.28801606 -15302642.00712824]
[-14119227.23127456 -15195798.60444268 -14538046.50390496]
[-23767561.54007601 -26089708.3874888  -24489713.116374  ]
[-21240031.12927104 -23024409.74664845 -21803825.3987597 ]
[-14633854.87035586 -15804981.30670367 -14876573.23612962]
[-5210519.84791597 -5587881.62201746 -5326189.65037526]
[-13623247.69777305 -13083758.41326301 -13857116.82194189]
[-13705957.94107019 -13010492.30239573 -13886116.60839933]
[-11169441.29060478 -10676412.76508264 -11313354.68150552]
[-14458076.36176011 -13738990.72207708 -14614419.41307489]
[-14424278.16723989 -13629323.65163127 -14505479.02855303]
[-11783932.82514378 -11374877.8408341  -11931877.56406177]
[-13304186.1829375  -12647679.70634169 -13418467.59181773]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [19]:
cm = confusion_matrix(labels, labels_pred)
print(cm)

[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
