## テキストとラベルの抽出

In [1]:
import re
import urllib2
from bs4 import BeautifulSoup
from prettyprint import pp

# Fetch and parse the Gunosy top page at module level.
# NOTE(review): `html` and `soup` are not referenced again in the visible
# code (get_categories() fetches the same URL itself) -- confirm whether this
# extra request is still needed.
html = urllib2.urlopen("https://gunosy.com/")
soup = BeautifulSoup(html, "lxml")

def get_categories(url):
    """Return the category links listed in the page's navigation bar.

    Args:
        url: Address of the page whose ``body > nav > ul > li > a`` anchors
            are read.

    Returns:
        A list of ``(href, label)`` tuples, one per navigation anchor, with
        the first and the last anchor dropped.  ``label`` is the anchor's
        BeautifulSoup ``.string`` value.
    """
    from contextlib import closing

    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as BeautifulSoup has consumed the body.
    with closing(urllib2.urlopen(url)) as response:
        soup = BeautifulSoup(response, "lxml")
    anchors = soup.select("body > nav > ul > li > a")[1:-1]
    return [(a.get("href"), a.string) for a in anchors]
    
# (href, label) pairs for every category; the position of a pair in this list
# is used as the numeric class label when the titles are collected.
categories = get_categories("https://gunosy.com/")

def get_links_and_titles(category_url):
    """Scrape the article links and titles listed on one category page.

    Args:
        category_url: Address of the category page to download.

    Returns:
        A ``(links, titles)`` pair of equally long lists: ``links`` holds the
        ``href`` of every article-title anchor and ``titles`` holds the
        matching anchor text encoded as UTF-8.

    Raises:
        AttributeError: If an anchor's ``.string`` is ``None`` (presumably
            when the anchor wraps nested markup -- verify against the page).
    """
    from contextlib import closing

    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as BeautifulSoup has consumed the body.
    with closing(urllib2.urlopen(category_url)) as response:
        soup = BeautifulSoup(response, "lxml")
    a_list = soup.select("body > div > div > div.main > div.article_list.gtm-click > div.list_content > div.list_text > div.list_title > a")
    links = [a.get("href") for a in a_list]
    titles = [a.string.encode('utf-8') for a in a_list]
    return links, titles


# Walk every category page and collect its article links and titles; the
# category's position in `categories` serves as the class label.
all_titles, all_links, all_labels = [], [], []
for i, url_info in enumerate(categories):
    links, titles = get_links_and_titles(url_info[0])
    all_links += links
    all_titles += titles
    # One label per scraped article of this category.
    all_labels += [i] * len(links)

## 特徴ベクトルの作成

In [2]:
import MeCab

def get_words_list(text):
    """Return the lower-cased surface forms of all nouns MeCab finds in text.

    A node counts as a noun when the first comma-separated field of its
    feature string equals '名詞'.
    """
    def iter_nodes(node):
        # MeCab exposes the parse result as a linked list of nodes.
        while node:
            yield node
            node = node.next

    tagger = MeCab.Tagger('mecabrc')
    return [
        token.surface.lower()
        for token in iter_nodes(tagger.parseToNode(text))
        if token.feature.split(",")[0] == '名詞'
    ]

def get_words_matrix(texts_list):
    """Tokenise every text with get_words_list.

    texts_list : ['text1', 'text2',... ]

    Returns one word list per input text, in input order.
    """
    words_matrix = []
    for text in texts_list:
        words_matrix.append(get_words_list(text))
    return words_matrix


from gensim import corpora, matutils
import numpy as np

# Tokenise every title and index the resulting vocabulary.
res = get_words_matrix(all_titles)
dictionary = corpora.Dictionary(res)
# One bag-of-words per title: [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...]
# (The former single-document doc2bow(res[0]) call was a dead store that was
# overwritten immediately and failed on an empty corpus.)
bows = [dictionary.doc2bow(x) for x in res]

# Dense matrix with one row per title and one column per vocabulary entry.
num_terms = len(dictionary)
X = np.array([matutils.corpus2dense([vec], num_terms=num_terms).T[0]
              for vec in bows])
y = np.array(all_labels)

## 訓練データとテストデータに分割

In [3]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the documents for testing; the fixed seed keeps the split
# reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=123)

## 番外編：sklearn.naive_bayes で動作確認

In [4]:
# Sanity check with scikit-learn's multinomial naive Bayes on the same split.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.1).fit(train_X, train_y)
print('----- 正解ラベル ----')
print(test_y)
print('----- 予測ラベル ----')
print(clf.predict(test_X))
# Alternative kept from the original cell:
# print np.argmax(clf.predict_log_proba(test_X), axis=1)
# Mean accuracy on the held-out set (shown as the cell's result).
clf.score(test_X, test_y)

----- 正解ラベル ----
[6 7 5 4 1 4 4 6 1 1 5 2 1 4 3 4 3 6 0 2 3 1 1 1 0 3 1 7 2 3 7 7 7 6 7 0 7
 0 0 3 1 4 0 4 4 5 2 5 7 5 0 0 2]
----- 予測ラベル ----
[3 7 5 4 2 4 4 2 7 1 5 0 0 4 3 4 3 3 7 2 0 1 1 1 1 1 1 1 2 3 4 7 1 6 1 0 5
 2 2 0 2 5 1 4 1 2 5 3 7 5 1 0 1]


0.47169811320754718

## ナイーブベイズによる分類器

In [7]:
def calc_cat(y):  # calculate P(cat)
    """Estimate the prior probability of every class from the labels.

    Labels are assumed to be the integers ``0 .. K-1`` where ``K`` is the
    number of distinct values in ``y``; entry ``i`` of the result is the
    fraction of labels equal to ``i``.

    Args:
        y: Class labels as any array-like (NumPy array, list, tuple).

    Returns:
        A float array of length ``K`` holding ``P(cat = i)``.
    """
    # Convert first so that `labels == c` is always an element-wise
    # comparison; a plain list would compare as a single scalar.
    labels = np.asarray(y)
    n_classes = np.unique(labels).size
    total = float(len(labels))
    return np.array(
        [np.count_nonzero(labels == c) / total for c in range(n_classes)],
        dtype=np.float64,
    )

def calc_each_word_bar_cat(X, y):  # calculate P(word_id | cat)
    """Estimate the per-class word probabilities with Laplace smoothing.

    Labels are assumed to be the integers ``0 .. K-1`` where ``K`` is the
    number of distinct values in ``y``.

    Args:
        X: 2-D count matrix, one row per document and one column per word.
        y: Class label of every row of ``X``.

    Returns:
        A float64 array of shape ``(K, X.shape[1])`` whose row ``i`` holds
        ``(count(word, i) + 1) / (total_count(i) + vocabulary_size)``; every
        row sums to 1.
    """
    X = np.asarray(X)
    labels = np.asarray(y)
    n_classes = np.unique(labels).size
    vocab_size = X.shape[1]
    probs = np.empty((n_classes, vocab_size))
    for c in range(n_classes):
        rows = np.argwhere(labels == c)[:, 0]
        # Accumulate in float64: dividing in float32 would leave the
        # probabilities accurate to only about seven digits.
        word_counts = X[rows, :].sum(axis=0, dtype=np.float64)
        # Laplace smoothing: add one pseudo-count per vocabulary entry.
        probs[c] = (word_counts + 1.0) / (word_counts.sum() + vocab_size)
    return probs

def calc_log_prob(X, y, word_cat_prob, cat_prob, doc):  # doc : 1 feature vector
    """Return the unnormalised log posterior of every class for one document.

    Computes ``log P(cat) + sum_w count(w) * log P(w | cat)``, i.e. each word
    contributes once per occurrence, matching the count-based estimates of
    the multinomial model.

    Args:
        X: Unused; kept so existing callers keep working.
        y: Unused; kept so existing callers keep working.
        word_cat_prob: Array of shape ``(K, n_words)`` with ``P(word | cat)``.
        cat_prob: Array of length ``K`` with ``P(cat)``.
        doc: Word-count vector of the document, either 1-D ``(n_words,)`` or
            a 2-D row ``(1, n_words)``.

    Returns:
        A float array of length ``K``; its argmax is the predicted class.
    """
    counts = np.atleast_2d(np.asarray(doc, dtype=np.float64)).sum(axis=0)
    # Restrict to the words that occur in the document: P(word | cat) of the
    # words contained in doc.
    present = np.flatnonzero(counts)
    log_word_prob = np.log(np.asarray(word_cat_prob)[:, present])
    # Weight every word's log-probability by how often it occurs.
    log_likelihood = log_word_prob.dot(counts[present])
    return np.log(cat_prob) + log_likelihood


## 性能評価 

In [6]:
# Fit the hand-written naive Bayes model on the training split.
cat_prob = calc_cat(train_y)
word_cat_prob = calc_each_word_bar_cat(train_X, train_y)

# Classify every test document; each one is passed as a 1 x n_terms row and
# the class with the highest log posterior wins.
docs = test_X
pred_y = np.array([
    np.argmax(calc_log_prob(train_X, train_y, word_cat_prob, cat_prob,
                            doc[np.newaxis, :]))
    for doc in docs
])

from sklearn.metrics import f1_score

# Compare the macro-averaged F1 of scikit-learn's model with our own.
sk_pred_y = clf.predict(test_X)
print('---- sklearn -----')
print(f1_score(test_y, sk_pred_y, average='macro'))
print('---- my prediction -----')
print(f1_score(test_y, pred_y, average='macro'))
print('')

---- sklearn -----
0.466897776137
---- my prediction -----
0.441883116883

