In [54]:
import re
from dateutil.parser import parse

In [158]:
class Token:
    """Wrapper around one token string offering simple lexical predicates."""

    def __init__(self, val):
        # Raw token text; every predicate below inspects this string.
        self.val = val

    def is_number(self):
        """Return True when the whole token is made of digit characters only."""
        # fullmatch instead of search with '^...$': '$' also matches just
        # before a trailing newline, which would accept a token like '12\n'.
        return re.fullmatch(r'\d+', self.val) is not None

    def is_contain_digit(self):
        """Return True when at least one digit character occurs in the token."""
        return re.search(r'\d', self.val) is not None

    def is_date(self):
        """Return True when dateutil can parse the token as a date (fuzzy mode)."""
        try:
            parse(self.val, fuzzy=True)
        except (ValueError, OverflowError):
            # dateutil signals unparseable text with ValueError (ParserError is
            # a subclass) and out-of-range numeric fields with OverflowError.
            return False
        return True

In [159]:
def tokenize(sentence):
    """Split a sentence into Token objects.

    Periods and commas are replaced by spaces first, then the text is cut on
    runs of space characters. Only the space character separates tokens; empty
    pieces (from leading/trailing spaces) are discarded.
    """
    without_punct = re.sub(r'[.,]', ' ', sentence)
    pieces = re.split(r' +', without_punct)
    return [Token(piece) for piece in pieces if piece != '']

In [160]:
def split_sentences(p):
    """Split a block of text into sentences, one per non-blank line.

    The text is cut on runs of newline characters. Each piece is stripped of
    surrounding whitespace, and pieces that are empty after stripping are
    dropped, so leading/trailing newlines, '\\r'-only lines and tab-only lines
    never produce an empty sentence.

    Returns a list of stripped, non-empty strings (empty list for blank input).
    """
    stripped_lines = (line.strip() for line in re.split(r'\n+', p))
    return [line for line in stripped_lines if line]

In [161]:
def unigram(sentences):
    """Return the list of token strings (1-grams) of a sentence, in order."""
    return [f'{token.val}' for token in tokenize(sentences)]

In [162]:
def bigram(sentences):
    """Return the 2-grams of a sentence as 'first_second' strings.

    The sentence is wrapped in one '<s>' start marker and one '</s>' end
    marker before tokenizing, so the first and last words each get a
    boundary bigram.
    """
    tokens = tokenize(f'<s> {sentences} </s>')
    # Pair every token with its immediate successor.
    return [f'{left.val}_{right.val}' for left, right in zip(tokens, tokens[1:])]

In [163]:
def trigram(sentences):
    """Return the 3-grams of a sentence as 'first_second_third' strings.

    Two '<s>' markers are prepended and two '</s>' markers appended before
    tokenizing, so boundary words appear in full-width trigrams.
    """
    tokens = tokenize(f'<s> <s> {sentences} </s> </s>')
    # Slide a window of three consecutive tokens across the padded sentence.
    windows = zip(tokens, tokens[1:], tokens[2:])
    return [f'{first.val}_{second.val}_{third.val}' for first, second, third in windows]

In [234]:
def make_n_gram_dict(data):
    """Count unigrams, bigrams and trigrams over every sentence in `data`.

    Returns a tuple (N, V, unigram_dict, bigram_dict, trigram_dict) where
      * N is the total number of unigram tokens seen,
      * V is the number of distinct unigrams (taken before the '</s>' entry
        below is written),
      * each dict maps an n-gram string to its occurrence count.
    After counting, unigram_dict['</s>'] is set to the number of sentences.
    """
    def tally(counts, grams):
        # Add every gram to `counts`; report how many grams were processed.
        processed = 0
        for gram in grams:
            counts[gram] = counts.get(gram, 0) + 1
            processed += 1
        return processed

    uni_counts, bi_counts, tri_counts = {}, {}, {}
    total_tokens = 0
    sentence_count = 0

    for sentence in split_sentences(data):
        sentence_count += 1
        total_tokens += tally(uni_counts, unigram(sentence))
        tally(bi_counts, bigram(sentence))
        tally(tri_counts, trigram(sentence))

    # Vocabulary size is fixed here, before the end-marker entry is written.
    vocab_size = len(uni_counts)
    uni_counts['</s>'] = sentence_count
    return total_tokens, vocab_size, uni_counts, bi_counts, tri_counts

In [5]:
import pandas as pd
import numpy as np

In [26]:
data

Unnamed: 0,label,doc
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [39]:
data.iloc[1:10]

Unnamed: 0,label,doc
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [67]:
def classifer(train_set, num_word = 1):
    """Placeholder for the spam/ham classifier; not implemented yet.

    train_set: labelled training data.
    num_word: n-gram order to use (1 by default).

    NOTE(review): the call sites in this notebook pass a document as a second
    positional argument (`classifer(train_set, doc, num_word=2)`), so the final
    signature probably needs a `doc` parameter -- confirm before implementing.

    Raises NotImplementedError until a real implementation is written.
    """
    raise NotImplementedError("classifer is not implemented yet")

747

In [None]:
def calc(train_set, test_set):
    """Run the unigram and bigram classifiers on every test document.

    train_set: data handed unchanged to `classifer` for each prediction.
    test_set: table with a 'label' column and a 'doc' column; rows are
        visited in order.

    Returns a list with one (label, predict_unigram, predict_bigram) tuple
    per test row.
    """
    results = []
    # Walk the rows pairwise; iterating the frame itself would yield column names.
    for label, doc in zip(test_set["label"], test_set["doc"]):
        predict_unigram = classifer(train_set, doc)
        predict_bigram = classifer(train_set, doc, num_word=2)
        # TODO: score the predictions against `label` once a validation
        # helper exists.
        results.append((label, predict_unigram, predict_bigram))
    return results

In [52]:
from math import floor

def data_loader(data, test=False, train_split=8, test_split=2):
    '''
        Return the training or the test partition of `data`.

        data: dataframe, contains labels and values
        test: if True return the test partition (rows from the split point
              to the end, last row included); otherwise return the training
              partition (rows before the split point)
        train_split: number of tenths of the data used for training; the
              split point is (len(data) // 10) * train_split
        test_split: currently unused; kept so existing callers keep working
    '''
    total_rows = len(data)
    split_at = (total_rows // 10) * train_split
    if test:
        # Open-ended slice: a '-1' end bound would silently drop the last row.
        return data.iloc[split_at:]

    return data.iloc[:split_at]

In [53]:
def load(file_name):
    """Read a CSV file and return a two-column frame named 'label' and 'doc'.

    The file is decoded as ISO-8859-1. The first CSV column becomes 'label'
    and the second becomes 'doc'; any further columns are ignored.
    """
    # TODO preprocess
    raw = pd.read_csv(file_name, encoding='ISO-8859-1')
    label_column, doc_column = raw.columns[0], raw.columns[1]
    return pd.DataFrame({'label': raw[label_column], 'doc': raw[doc_column]})

In [60]:
# Path of the labelled message corpus, relative to the working directory.
FILE_NAME = 'spam.csv'

if __name__ == '__main__':
    # Load the corpus once, then carve out the training and test partitions.
    data = load(FILE_NAME)
    train_set, test_set = data_loader(data), data_loader(data, test=True)

    # Evaluation is currently switched off:
    # calc(train_set, test_set)