In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split


In [2]:
# Input text files, one per author; the paths are relative to the notebook's working directory.
robert_frost_file_path = "../data/robert_frost.txt" # file stems noted by the author: robert_frost_small, robert_frost (presumably a smaller sample also exists - confirm)
edgar_allan_poe_file_path = "../data/edgar_allan_poe.txt" # file stems noted by the author: edgar_allan_poe_small, edgar_allan_poe (presumably a smaller sample also exists - confirm)

In [3]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of once per word inside the comprehension.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocessing_word(lst):
    '''
    Normalize a list of words: lower-case each one and strip ASCII punctuation.

    Parameters
    ----------
    lst : iterable of str
        Words to normalize.

    Returns
    -------
    list of str
        One normalized string per input word, in the same order. A word that
        consists only of punctuation becomes the empty string (it is not dropped).
    '''
    return [word.lower().translate(_PUNCTUATION_TABLE) for word in lst]

In [4]:
def file_to_tokenize(file_path):
    '''
    Read a text file and return its non-blank lines as lists of normalized tokens.

    Each line is split on whitespace; lines that yield no tokens are skipped.
    The remaining tokens are passed through preprocessing_word, which lower-cases
    them and strips punctuation. Note that a token made up solely of punctuation
    is therefore kept as an empty string.

    Parameters
    ----------
    file_path : str
        Path of the text file to read (opened in text mode with the platform
        default encoding).

    Returns
    -------
    list of list of str
        One token list per non-blank line, in file order.
    '''
    tokenize_list = []
    # The context manager guarantees the handle is closed, even if reading fails.
    with open(file_path, "r") as f:
        for line in f:
            words = line.split()
            # skip blank / whitespace-only lines
            if words:
                tokenize_list.append(preprocessing_word(words))

    return tokenize_list

In [5]:
# Read each author's file and tokenize it line by line with file_to_tokenize.
# The final expression displays how many non-blank lines each corpus contributed.
robert_frost_tokenize_list = file_to_tokenize(robert_frost_file_path)
edgar_allan_poe_tokenize_list = file_to_tokenize(edgar_allan_poe_file_path)
len(robert_frost_tokenize_list),len(edgar_allan_poe_tokenize_list)

(1436, 722)

In [6]:
# Preview the first five tokenized Robert Frost lines.
robert_frost_tokenize_list[:5]

[['two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood'],
 ['and', 'sorry', 'i', 'could', 'not', 'travel', 'both'],
 ['and', 'be', 'one', 'traveler', 'long', 'i', 'stood'],
 ['and', 'looked', 'down', 'one', 'as', 'far', 'as', 'i', 'could'],
 ['to', 'where', 'it', 'bent', 'in', 'the', 'undergrowth']]

In [7]:
# Preview the first five tokenized Edgar Allan Poe lines.
edgar_allan_poe_tokenize_list[:5]

[['lo', 'death', 'hath', 'reard', 'himself', 'a', 'throne'],
 ['in', 'a', 'strange', 'city', 'all', 'alone'],
 ['far', 'down', 'within', 'the', 'dim', 'west'],
 ['where',
  'the',
  'good',
  'and',
  'the',
  'bad',
  'and',
  'the',
  'worst',
  'and',
  'the',
  'best'],
 ['have', 'gone', 'to', 'their', 'eternal', 'rest']]

In [8]:
# Class labels, one per tokenized line: 0 for Robert Frost, 1 for Edgar Allan Poe.
robert_frost_label_list = list(np.zeros(len(robert_frost_tokenize_list),dtype=int))
edgar_allan_poe_label_list = list(np.ones(len(edgar_allan_poe_tokenize_list),dtype=int))

len(robert_frost_label_list),len(edgar_allan_poe_label_list)

(1436, 722)

In [9]:
# Concatenate both corpora (Frost first, then Poe) so that all_data_x[i] is labeled by all_data_y[i].
all_data_x = robert_frost_tokenize_list+edgar_allan_poe_tokenize_list
all_data_y= robert_frost_label_list+edgar_allan_poe_label_list

len(all_data_x),len(all_data_y)

(2158, 2158)

In [10]:
# Hold out 20% of the lines for testing; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(all_data_x, all_data_y, test_size=0.2, random_state=42)
len(X_train),len(y_train),len(X_test),len(y_test)

(1726, 1726, 432, 432)

In [11]:
# Preview the first five training sentences (still as word strings).
X_train[:5]

[['its', 'hands', 'of', 'gold'],
 ['i', 'may', 'be', 'mad'],
 ['and', 'the', 'fur', 'trade'],
 ['winds', 'blow', 'the', 'open', 'grassy', 'places', 'bleak'],
 ['at', 'having', 'eased', 'her', 'heart', 'of', 'one', 'more', 'copy']]

In [12]:
# Preview the first five test sentences (still as word strings).
X_test[:5]

[['there', 'is', 'a', 'house', 'that', 'is', 'no', 'more', 'a', 'house'],
 ['but',
  'you',
  'have',
  'said',
  'it',
  'and',
  'were',
  'off',
  'to',
  'find',
  'it'],
 ['and', 'the', 'birds', 'on', 'her', 'outer', 'windowsill'],
 ['by', 'tying', 'together'],
 ['with', 'furs', 'to', 'sell']]

In [13]:
def mapping_unique_index(lst):
    '''
    Build a word -> integer index vocabulary from tokenized sentences.

    Index 0 is reserved for the placeholder key 'unk'. Every other word gets
    the next free index the first time it is seen. Within a sentence, distinct
    words are visited in sorted order, so the assignment is deterministic for
    a given sentence order.

    Parameters
    ----------
    lst : iterable of iterable of str
        Tokenized sentences.

    Returns
    -------
    (dict, int)
        The word -> index mapping (plain str keys) and the next unused index,
        which equals the vocabulary size including 'unk'.
    '''
    unique_word_index_dict = {
         'unk' : 0
    }
    word_index = 1
    for data in lst:
        # Sort and dedupe with builtins: no per-sentence NumPy array is needed,
        # and the keys stay ordinary Python strings.
        for word in sorted(set(data)):
            if word not in unique_word_index_dict:
                unique_word_index_dict[word] = word_index
                word_index += 1
    return unique_word_index_dict,word_index

In [14]:
# Build a separate vocabulary for each split. Each call also returns the next unused
# index, i.e. the size of that vocabulary including the reserved 'unk' entry.
X_train_unique_word_index_dict,X_train_word_index = mapping_unique_index(X_train)
X_test_unique_word_index_dict,X_test_word_index = mapping_unique_index(X_test)

In [15]:
# Preview the first five entries of the training vocabulary (word -> index).
dict(list(X_train_unique_word_index_dict.items())[0:5])

{'unk': 0, 'gold': 1, 'hands': 2, 'its': 3, 'of': 4}

In [16]:
# Words that occur in the test split but never in the training split
# (kept in case we want to inspect or visualize them).
X_train_keys_set = set(X_train_unique_word_index_dict)
X_test_keys_set = set(X_test_unique_word_index_dict)

not_found_keys = X_test_keys_set.difference(X_train_keys_set)

In [17]:
# Inverse of the training vocabulary: index -> word.
X_train_unique_index_word_dict = {
    index: word for word, index in X_train_unique_word_index_dict.items()
}
dict(list(X_train_unique_index_word_dict.items())[:5])

{0: 'unk', 1: 'gold', 2: 'hands', 3: 'its', 4: 'of'}

In [18]:
def covert_word_str_to_int(lst,mapping_dict):
    '''
    Encode tokenized sentences as lists of integer indices.

    Each word is looked up in mapping_dict; a word that is not a key of
    mapping_dict is encoded as 0. Returns one list of indices per sentence,
    in the same order as lst.
    '''
    return [
        [mapping_dict.get(word, 0) for word in sentence]
        for sentence in lst
    ]

In [19]:
# Convert each sentence from a list of words to a list of word indices.
# The training vocabulary is used for both splits, so test words that never
# appeared in training are encoded as index 0.
X_train_int = covert_word_str_to_int(X_train,X_train_unique_word_index_dict)
X_test_int = covert_word_str_to_int(X_test,X_train_unique_word_index_dict)

# Sentences have different lengths, so the arrays must be created with
# dtype=object. Without it, recent NumPy versions raise ValueError for ragged
# input (older versions only emitted a deprecation warning).
X_train_int = np.asarray(X_train_int, dtype=object)
X_test_int = np.asarray(X_test_int, dtype=object)

len(X_train_int),len(X_test_int)

(1726, 432)

In [20]:
# Preview the first five integer-encoded training sentences.
X_train_int[:5]

array([list([3, 2, 4, 1]), list([6, 8, 5, 7]), list([9, 11, 10, 12]),
       list([18, 14, 11, 16, 15, 17, 13]),
       list([19, 22, 21, 24, 23, 4, 26, 25, 20])], dtype=object)

In [21]:
# Preview the first five integer-encoded test sentences (0 marks a word absent from the training vocabulary).
X_test_int[:5]

array([list([300, 146, 37, 201, 104, 146, 30, 25, 37, 201]),
       list([167, 94, 69, 404, 183, 9, 84, 172, 59, 390, 183]),
       list([9, 11, 329, 86, 24, 0, 684]), list([246, 0, 416]),
       list([311, 0, 59, 562])], dtype=object)

In [22]:
# Train one model per class, separately. Every count starts at 1 (add-one smoothing),
# so no initial-state or transition entry is zero when the counts are normalized.
# Both arrays are sized by the training vocabulary (X_train_word_index entries per axis).
state_transition_0  = np.ones((X_train_word_index,X_train_word_index))
initial_state_0 = np.ones(X_train_word_index)

state_transition_1  = np.ones((X_train_word_index,X_train_word_index))
initial_state_1 = np.ones(X_train_word_index)

In [23]:
# compute counting for state transition and initial state 
def compute_counting(lst,initial_state,state_transition):
    '''
    Estimate log initial-state and log transition probabilities from index sequences.

    The first index of every sequence adds 1 to the initial-state counts. Every
    consecutive pair (previous, current) adds 1 to the transition counts at
    [previous, current]. The counts are then normalized (the initial vector to
    sum to 1, each transition row to sum to 1) and converted to natural logs.

    Parameters
    ----------
    lst : iterable of iterable of int
        Integer-encoded sequences.
    initial_state : array-like
        Starting counts for the initial state (e.g. all ones for add-one
        smoothing). Not modified.
    state_transition : 2-D array-like
        Starting counts for the transitions. Not modified.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Log initial-state probabilities and log transition probabilities.
    '''
    # Work on float copies: the caller's arrays stay untouched, and integer-typed
    # count arrays work too (in-place true division fails on integer arrays).
    initial_counts = np.array(initial_state, dtype=float)
    transition_counts = np.array(state_transition, dtype=float)

    for tokens in lst:
        last_index = None
        for idx in tokens:
            if last_index is None:
                # first token of the sequence
                initial_counts[idx] += 1
            else:
                transition_counts[last_index, idx] += 1
            last_index = idx

    initial_counts /= initial_counts.sum()
    # keepdims keeps the row sums as a column so each row is divided by its own total
    transition_counts /= transition_counts.sum(axis=1, keepdims=True)

    return np.log(initial_counts), np.log(transition_counts)

In [24]:
# Fit the class-0 (Robert Frost) model.
# Fresh add-one count arrays are passed on every run, so re-executing this cell
# never feeds previously computed log-probabilities back in as starting counts.
initial_state_0,state_transition_0 = compute_counting(
    [x for x,y in zip(X_train_int,y_train) if y ==0],
    np.ones(X_train_word_index),
    np.ones((X_train_word_index,X_train_word_index)),
)
initial_state_0[:5],state_transition_0[:5]

(array([-8.23589073, -8.23589073, -8.23589073, -5.40267738, -5.0170149 ]),
 array([[-7.87435882, -7.87435882, -7.87435882, ..., -7.87435882,
         -7.87435882, -7.87435882],
        [-7.87511928, -7.87511928, -7.87511928, ..., -7.87511928,
         -7.87511928, -7.87511928],
        [-7.87511928, -7.87511928, -7.87511928, ..., -7.87511928,
         -7.87511928, -7.87511928],
        [-7.88870952, -7.88870952, -7.19556234, ..., -7.88870952,
         -7.88870952, -7.88870952],
        [-7.93272103, -6.83410874, -7.93272103, ..., -7.93272103,
         -7.93272103, -7.93272103]]))

In [25]:
# Fit the class-1 (Edgar Allan Poe) model.
# Fresh add-one count arrays are passed on every run, so re-executing this cell
# never feeds previously computed log-probabilities back in as starting counts.
initial_state_1,state_transition_1 = compute_counting(
    [x for x,y in zip(X_train_int,y_train) if y ==1],
    np.ones(X_train_word_index),
    np.ones((X_train_word_index,X_train_word_index)),
)
initial_state_1[:5],state_transition_1[:5]

(array([-8.07402622, -8.07402622, -8.07402622, -6.97541393, -4.89597239]),
 array([[-7.87435882, -7.87435882, -7.87435882, ..., -7.87435882,
         -7.87435882, -7.87435882],
        [-7.87473913, -7.87473913, -7.87473913, ..., -7.87473913,
         -7.87473913, -7.87473913],
        [-7.87473913, -7.87473913, -7.87473913, ..., -7.87473913,
         -7.87473913, -7.87473913],
        [-7.8800482 , -7.8800482 , -7.8800482 , ..., -7.8800482 ,
         -7.8800482 , -7.8800482 ],
        [-7.92876632, -7.92876632, -7.92876632, ..., -7.92876632,
         -7.92876632, -7.92876632]]))

In [26]:
# Compute the class priors: the fraction of training sentences carrying each label.
count_0 = sum(y == 0 for y in y_train)
count_1 = sum(y == 1 for y in y_train)
total_count = len(y_train)

p0 = count_0/total_count
p1 = count_1/total_count

# Keep the priors in log space so they can be added to the models' log-likelihoods.
log_p0 = np.log(p0)
log_p1 = np.log(p1)

p0,p1

(0.6633835457705678, 0.3366164542294322)

In [27]:
def compute_log_likelihood(tokens,initial_state,state_transition):
    '''
    Sum the log-probability of an index sequence.

    The first index contributes initial_state[first]. Every later index
    contributes state_transition[previous, current]. An empty sequence
    scores 0.
    '''
    remaining = iter(tokens)
    total = 0
    try:
        previous = next(remaining)
    except StopIteration:
        # nothing to score
        return total

    total += initial_state[previous]
    for current in remaining:
        total += state_transition[previous, current]
        previous = current

    return total

In [28]:
def predict(test_sentences,initial_state_list,state_transition_list,prior_list):
    '''
    Pick the best-scoring model for every sentence.

    Model i scores a sentence as compute_log_likelihood(sentence,
    initial_state_list[i], state_transition_list[i]) + prior_list[i].
    The notebook passes log class priors as prior_list. The number of models
    is taken from len(initial_state_list).

    Returns a NumPy array holding, per sentence, the index of the model with
    the highest score (np.argmax, so ties go to the lowest index).
    '''
    predictions = []
    for sentence in test_sentences:
        scores = [
            compute_log_likelihood(sentence, initial_state_list[k], state_transition_list[k]) + prior_list[k]
            for k in range(len(initial_state_list))
        ]
        predictions.append(np.argmax(scores))

    return np.asarray(predictions)

In [29]:
# Training accuracy: the fraction of training sentences whose predicted class equals the true label.
predicted_train = predict(X_train_int,[initial_state_0,initial_state_1],[state_transition_0,state_transition_1],[log_p0,log_p1]  )
np.mean(predicted_train ==np.asarray(y_train))

0.9959443800695249

In [30]:
# Test accuracy: the same metric on the held-out sentences.
predicted_test = predict(X_test_int,[initial_state_0,initial_state_1],[state_transition_0,state_transition_1],[log_p0,log_p1])
np.mean(predicted_test ==np.asarray(y_test))

0.8402777777777778