In [157]:
import re
from dateutil.parser import parse

In [158]:
class Token():
    def __init__(self, val):
        self.val = val
        
    def is_number(self):
        if re.search(r'^\d+$', self.val):
            return True
        return False
        
    def is_contain_digit (self):
        if re.search(r'\d', self.val):
            return True
        return False
    
    def is_date(self):
        try: 
            parse(self.val, fuzzy=True)
            return True
        except ValueError:
            return False

In [159]:
def tokenize(sentence):
    ls = []
    sentence = re.sub(r'[.,]', ' ', sentence)
    for word in re.split(r' +', sentence):
        if word != '':
            ls.append(Token(word))
            
    return ls

In [160]:
def split_sentences(p):
    ls = []
    for s in re.split(r'\n+',p):
        if not re.match(r' +?$', s):
            ls.append(s.strip())
        
    return ls

In [161]:
def unigram(sentences):
    ls = []
    
    sentences = tokenize(sentences)
    for i in range(len(sentences)):
        word = sentences[i].val
        ls.append(f'{word}')
    
    return ls

In [162]:
def bigram(sentences):
    ls = []
    
    sentences = f'<s> {sentences} </s>'
    sentences = tokenize(sentences)
    for i in range(len(sentences) - 1):
        word = sentences[i].val
        word_i = sentences[i + 1].val
        ls.append(f'{word}_{word_i}')
    
    return ls

In [163]:
def trigram(sentences):
    ls = []
    
    sentences = f'<s> <s> {sentences} </s> </s>'
    sentences = tokenize(sentences)
    
    for i in range(len(sentences) - 2):
        word = sentences[i].val
        word_i = sentences[i + 1].val
        word_ii = sentences[i + 2].val
        ls.append(f'{word}_{word_i}_{word_ii}')
    
    return ls

In [234]:
def make_n_gram_dict(data):
    unigram_dict = {}
    bigram_dict = {}
    trigram_dict = {}
    N = 0
    V = 0
    
    num_sentences = 0
    for sentences in split_sentences(data):
        num_sentences += 1
        
        for n_gram in unigram(sentences):
            N += 1
            if n_gram in unigram_dict:
                unigram_dict[n_gram] += 1
            else:
                unigram_dict[n_gram] = 1
                V += 1
        
        for n_gram in bigram(sentences):
            if n_gram in bigram_dict:
                bigram_dict[n_gram] += 1
            else:
                bigram_dict[n_gram] = 1
                
        for n_gram in trigram(sentences):
            if n_gram in trigram_dict:
                trigram_dict[n_gram] += 1
            else:
                trigram_dict[n_gram] = 1
    
    unigram_dict['</s>'] = num_sentences
    return N, V, unigram_dict, bigram_dict, trigram_dict

In [165]:
def P_unigram(n_gram):
    if n_gram in unigram_dict:
        return unigram_dict[n_gram]/N
    
    return 0    

In [220]:
def P_bigram(target, context, alpha=0, V=0):
    part = context + '_' + target
    
    numerator = bigram_dict[part] if part in bigram_dict else 0
    numerator += alpha
    
    denominator = unigram_dict[target] if target in unigram_dict else 0
    denominator += V*alpha
    
    return 0 if denominator==0 else numerator/denominator    

In [221]:
def P_trigram(target, context, alpha=0, V=0):
    part = context + '_' + target
    
    numerator = trigram_dict[part] if part in trigram_dict else 0
    numerator += alpha
    
    denominator = unigram_dict[target] if target in unigram_dict else 0
    denominator += V*alpha
    

    return 0 if denominator==0 else numerator/denominator

In [250]:
def P_bigram_sentences(sentences):
#     Split sentences in to target and context.
#     Multi all part and return value
    
    sentences = f'<s> {sentences} </s>' #padding
    list_word = sentences.split()
    len_sentences = list_word.__len__()
    p_bigram = 1
    
    for i in range(1, len_sentences):
        target = list_word[i]
        context = list_word[i - 1]
        p_bigram *= P_bigram(target, context)

    return p_bigram

In [251]:
def P_trigram_sentences(sentences):
#     Split sentences in to target and context.
#     Multi all part and return value
    
    sentences = f'<s> <s> {sentences} </s> </s>'  #padding
    list_word = sentences.split()
    len_sentences = list_word.__len__()
    
    p_trigram = 1

    for i in range(2, len_sentences):
        target = list_word[i]
        context = list_word[i - 2] + '_' + list_word[i - 1]


        p_trigram *= P_trigram(target, context)
        
    return p_trigram

In [212]:
def perplexity(sentences): 
#     Eval model by unseen text for both
#     bigram and trigram. The less perplexity is,
#     the better model becomes.
    list_word = sentences.split()
    len_sentences = list_word.__len__()
    
    #smooth
    alpha = 1

    #     for bigram
    per_bigram = 1

    for i in range(1, len_sentences):
        target = list_word[i]
        context = list_word[i - 1]

        per_bigram *= 1/(P_bigram(target, context,  alpha, V))

    per_bigram = pow(per_bigram, len_sentences)

    # for trigram
    per_trigram = 1

    for i in range(2, len_sentences):
        target = list_word[i]
        context = list_word[i - 2] + '_' + list_word[i - 1]

        per_trigram *= 1/(P_trigram(target, context, alpha, V))

    per_trigram = pow(per_trigram, len_sentences)

    return per_bigram, per_trigram

In [255]:
def next_gram(text):
    # TODO preprocess text
    # 
    max_p_bigram = 0
    word_bigram = 'none'
    
    max_p_trigram = 0
    word_trigram = 'none'
    
    
    for word in unigram_dict.keys():
        p_bigram = P_bigram_sentences(text + ' ' + word)
        p_trigram = P_trigram_sentences(text + ' ' + word)
        
        if p_bigram > max_p_bigram:
            max_p_bigram = p_bigram 
            word_bigram = word
            
            
        if p_trigram > max_p_trigram:
            max_p_trigram = p_trigram 
            word_trigram = word
            
        
    return f'bigram: {word_bigram}, trigram: {word_trigram}'

In [5]:
import pandas as pd
import numpy as np

In [12]:
file = pd.read_csv('spam.csv')

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 135-136: invalid continuation byte

In [256]:
def load(file_name):
    with open(file_name) as f:
        # TODO: preprocess f.read()
        return f.read()

In [235]:
unigram_dict = {}
bigram_dict = {}
trigram_dict = {}
FILE_NAME = 'data_sample.txt'
N = 0
V = 0

if __name__ == '__main__':
#     text_test = 'toi la hoc sinh'
    
    data = load(FILE_NAME)
    N, V, unigram_dict, bigram_dict, trigram_dict = make_n_gram_dict(data)
#     print(P_bigram_sentences())
#     print(P_trigram_sentences())
#     perplexity()
#     next_gram('toi la hoc')

'bigram: sinh, trigram: sinh'