In [39]:
import sys, math
import MeCab
mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

class baysian_filter:
    def __init__(self):
        self.words = set()             
        self.word_count = {}        
        self.category_count = {}
        
    def morphological_analysis(self, text):
        words_with = mecab.parse(text).split('\n')
        return [ w.split('\t') for w in words_with ]

    def headwords(self, words, n, res):
        if words[n][0] == 'EOS':
            return res
        w = words[n][1].split(',')
        res.append([ w[6], w[0] ])
        return self.headwords(words, n + 1, res)

    def parts_of_speech(self, words, n, res):
        if n >= len(words): 
            return res
        if words[n][1] in ['名詞', '形容詞'] :
            res.append(words[n][0])
        return self.parts_of_speech(words, n + 1, res)
        
    def update_word(self, word, category):
        if not category in self.word_count:
            self.word_count[category] = {}
        self.word_count[category][word] = self.calc_word(word, category)
        self.words.add(word)

    # wl: word list
    # c: category
    # n: index
    def update_words(self, wl, c, n):
        if n == len(wl):
            return
        self.update_word(wl[n], c)
        return self.update_words(wl, c, n + 1)

    def update_category(self, category):
        self.category_count[category] = self.calc_category(category)

    def calc_word(self, word, category):
        if not category in self.word_count:
            return 1
        if not word in self.word_count[category]:
            return 1
        return self.word_count[category][word] + 1

    def calc_category(self, category):
        if not category in self.category_count:
            return 1
        return self.category_count[category] + 1

    # training
    def fit(self, text, category):
        t = self.morphological_analysis(text)
        word_list = self.parts_of_speech(self.headwords(t, 0, []), 0, [])
        self.update_words(word_list, category, 0)
        self.update_category(category)
    
    # Πi P(Fi | C) -> Σi log P(Fi | C) 
    def calc_word_prob(self, score, words, category, n):
        if n >= len(words) -1:
            return score
        return self.calc_word_prob(score + math.log(self.word_prob(words[n], category)), 
                                   words, category, 
                                   n + 1)
        
    def score(self, words, category):
        return self.calc_word_prob( 
                               math.log(self.category_prob(category)),
                               words, category, 0)
    
    def predict(self, text):
        words = self.parts_of_speech(
            self.headwords(
                self.morphological_analysis(text)
                , 0, [])
        , 0, [])
        # argmax P(C|F) = P(C)P(F|C)
        best_category, max_score, scores = self.argmax(words, 
               list(self.category_count.keys()), 
               None, 
               -sys.maxsize, 
               [], 
               0 )
        return best_category, max_score, scores
    
    def argmax( self, words, categories, best_category, max_score, scores, n):
        if n >= len(categories):
            return best_category, max_score, scores
        score = self.score(words, categories[n])
        scores.append((categories[n], score))
        if score > max_score:
            best_category = categories[n]
            max_score = score
        return self.argmax( words, categories, best_category, max_score, scores, n + 1)
        
    #  P(C) ... category / all categories
    def category_prob(self, category):
        sum_categories = sum(self.category_count.values())
        category_v = self.category_count[category]
        return category_v / sum_categories

    def get_word_count(self, word, category):
        if word in self.word_count[category]:
            return self.word_count[category][word]
        else:
            return 0
        
    # likelihood P(F|C)
    def word_prob(self, word, category):
        n = self.get_word_count(word, category) + 1 # smoothing
        d = sum(self.word_count[category].values()) + len(self.words)
        return n / d

In [40]:
bf = baysian_filter()

In [47]:
bf.fit("今週のふりかえりと来週のタスクについて", "通常")
bf.fit("昨日の進捗について", "通常")
bf.fit("今月の経営会議について", "通常")
bf.fit("明日の来客について", "通常")
bf.fit("【Chatwork】未読メッセージがあります", "通常")
bf.fit("仕様確認のミーティングについて", "通常")
bf.fit("今週の打ち合わせや作業について", "通常")
bf.fit("販売促進会議の議事録。", "通常")
bf.fit("作業依頼です。", "通常")
bf.fit("12月の予算会議", "通常")
bf.fit("今月の勉強会の開催場所", "通常")

bf.fit("先着10名に、10000ポイントをプレゼント", "スパム")
bf.fit("＜本日最終日＞MAX50 %OFF！セール開催中！", "スパム")
bf.fit("人気作が今だけ値下げ", "スパム")
bf.fit("あなただけにお教えします", "スパム")
bf.fit("あなたと出会いたい人がいます", "スパム")
bf.fit("【毎回好評の商品に、今だけもらえるお試しセットが登場】", "スパム")
bf.fit("割引セール開催中！", "スパム")
bf.fit("新規入会キャンペーンのご案内", "スパム")
bf.fit("無料セミナー開催中", "スパム")
bf.fit("限定セール実施中", "スパム")
bf.fit("100万円プレゼント実施中", "スパム")


In [48]:
# 予測
pre, mas_score, scorelist = bf.predict("セールのお知らせ")

In [49]:
print("結果: {0}".format(pre))
print(mas_score)
print(scorelist)

結果: スパム
-3.756601677399432
[('通常', -5.477009155291413), ('スパム', -3.756601677399432)]


In [50]:
# 予測
pre, mas_score, scorelist = bf.predict("来月の予算について")

In [51]:
print("結果: {0}".format(pre))
print(mas_score)
print(scorelist)

結果: 通常
-5.477009155291413
[('通常', -5.477009155291413), ('スパム', -5.702511826454745)]


In [52]:
# 予測
# txt = "今がお得、iphone値引き中"
txt = "iPhone 7が値下げ★6sは一括500円でポイントももらえる！"
pre, mas_score, scorelist = bf.predict(txt)
print("結果: {0}".format(pre))
print(mas_score)
print(scorelist)

結果: スパム
-26.19924686421797
[('通常', -26.39970201693472), ('スパム', -26.19924686421797)]
