<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/bogyung/MulticlassNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multinomial NB

In [1]:
import pandas as pd
import numpy as np

class MultinomialNB:
    """Naive Bayes text classifier over token counts with additive smoothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Training corpus. Must expose a ``tokens`` column (one list of string
        tokens per document) and a ``label`` column (one class per document).
    input_tokens : str or iterable of str
        The query to classify. A string is split on whitespace; any other
        iterable is taken as an already-tokenized query.
    k : float, default 0.5
        Additive smoothing constant.

    Notes
    -----
    NOTE(review): the per-class likelihood is
    ``(k + count) / (2*k + class_total)`` and the class prior is the class's
    share of *tokens* (not documents). Textbook multinomial NB uses
    ``k * vocabulary_size`` in the denominator and a document-count prior.
    Both formulas are kept as-is so existing results do not change -- confirm
    whether this is intended.
    """

    def __init__(self, data, input_tokens, k = 0.5):
        self.data = data
        self.input_tokens = input_tokens
        self.k = k

    def percent(self):
        """Return the posterior probability of every class for the query.

        Returns
        -------
        numpy.ndarray
            One probability per class, ordered like
            ``np.unique(self.data.label)``; the entries sum to 1.

        Raises
        ------
        ValueError
            If the training data contains no tokens at all (the class priors
            would be 0/0).
        """
        labels = np.unique(self.data.label)
        vocabulary = np.unique(
            [token for token_list in self.data.tokens for token in token_list]
        )
        label_index = {label: row for row, label in enumerate(labels)}
        token_index = {token: col for col, token in enumerate(vocabulary)}

        # counts[row, col]: occurrences of token `col` in documents of class `row`.
        counts = np.zeros((len(labels), len(vocabulary)))
        # zip pairs documents with labels positionally, so the result does not
        # depend on the DataFrame's index values.
        for token_list, label in zip(self.data.tokens, self.data.label):
            row = label_index[label]
            for token in token_list:
                counts[row, token_index[token]] += 1

        class_totals = counts.sum(axis=1)
        grand_total = counts.sum()
        if grand_total == 0:
            raise ValueError("training data contains no tokens")

        # Smoothed log-likelihood of each token given each class.
        log_likelihood = np.log(
            (self.k + counts) / (2 * self.k + class_totals[:, np.newaxis])
        )
        # Log prior: each class's share of all training tokens.
        log_scores = np.log(class_totals / grand_total)

        query = self.input_tokens
        query_tokens = query.split() if isinstance(query, str) else list(query)
        for token in query_tokens:
            col = token_index.get(token)
            if col is None:
                # Token never seen in training: it carries no evidence for
                # any class, so skip it instead of raising KeyError.
                continue
            log_scores += log_likelihood[:, col]

        # Shift by the max before exponentiating so long queries do not
        # underflow to 0/0; the shift cancels in the normalization.
        weights = np.exp(log_scores - log_scores.max())
        return weights / weights.sum()

    def result(self):
        """Return the label with the highest posterior probability."""
        labels = np.unique(self.data.label)
        return labels[np.argmax(self.percent())]

In [2]:
# Toy three-class corpus (spam / normal / ad): one [text, label] pair per mail.
mail = [
    ["me free lottery", "spam"],
    ["free get free you", "spam"],
    ["you free scholarship", "normal"],
    ["free to contact me", "normal"],
    ["you won award", "normal"],
    ["you ticket lottery", "spam"],
    ["free ticket", "ad"],
    ["get scholarship", "ad"],
]
df = pd.DataFrame(mail, columns=["tokens", "label"])
# Replace each raw text with its whitespace-separated token list.
df["tokens"] = [text.split() for text in df["tokens"]]

In [3]:
# Display the training frame as the cell output.
df

Unnamed: 0,tokens,label
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam
6,"[free, ticket]",ad
7,"[get, scholarship]",ad


In [4]:
# Score the query "get lottery" against the toy corpus. The trailing tuple is
# the cell's displayed value: (per-class probabilities, predicted label).
mnb = MultinomialNB(
    data=df,
    input_tokens="get lottery",
)
mnb.percent(), mnb.result()

(array([0.26632428, 0.04585473, 0.68782098]), 'spam')