## テキストとラベルの抽出

In [None]:
import re
import urllib2
from bs4 import BeautifulSoup
from prettyprint import pp

# Download the Gunosy top page and parse it with the lxml parser.
# NOTE(review): nothing in the code visible here reads these module-level
# `html` / `soup` names afterwards (each function below fetches its own page)
# -- confirm whether this extra request is still needed.
html = urllib2.urlopen("https://gunosy.com/")
soup = BeautifulSoup(html, "lxml")

def get_categories(url):
    """Collect the category links from the navigation bar of a page.

    The anchors matched by ``body > nav > ul > li > a`` are used, with the
    first and the last match dropped.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``(href, link text)`` tuples, one per remaining anchor.
    """
    html = urllib2.urlopen(url)
    try:
        # The parser reads the whole response while building the tree, so the
        # connection can be released as soon as parsing is done.
        soup = BeautifulSoup(html, "lxml")
    finally:
        html.close()
    a_list = soup.select("body > nav > ul > li > a")[1:-1]
    return [(a.get("href"), a.string) for a in a_list]
    
# (href, link text) pairs taken from the navigation bar of the top page;
# the position of a pair in this list is later used as the class label.
categories = get_categories("https://gunosy.com/")

def get_content(page_url):
    """Download one article page and return its text as UTF-8 bytes.

    The text of every element matched by
    ``body div.main.article_main > div.article.gtm-click`` is concatenated.

    Args:
        page_url: Address of the article page.

    Returns:
        The concatenated article text, encoded as UTF-8 (an empty string when
        nothing matches the selector).
    """
    html = urllib2.urlopen(page_url)
    try:
        # This runs once per article, so close the response explicitly instead
        # of leaving open connections for the garbage collector.
        soup = BeautifulSoup(html, "lxml")
    finally:
        html.close()
    content_tag = soup.select("body div.main.article_main > div.article.gtm-click")
    return ''.join(text_tag.get_text() for text_tag in content_tag).encode('utf-8')
    
def get_links_and_contents(category_url):
    """Fetch a category listing page and the text of every article it links to.

    Args:
        category_url: Address of the listing page (including any pager query).

    Returns:
        A ``(links, contents)`` tuple of two lists of equal length: the article
        hrefs found on the page and, in the same order, the text returned by
        ``get_content`` for each of them.
    """
    html = urllib2.urlopen(category_url)
    try:
        # Parsing consumes the response, so release the connection before the
        # per-article downloads below start.
        soup = BeautifulSoup(html, "lxml")
    finally:
        html.close()
    a_list = soup.select("body > div > div > div.main > div.article_list.gtm-click > div.list_content > div.list_text > div.list_title > a")
    links = [a.get("href") for a in a_list]
    contents = [get_content(link) for link in links]
    return links, contents

all_contents = []
all_links = []
all_labels = []
for i,url_info in enumerate(categories):
    for page_num in xrange(1,13):
        if page_num % 10 == 0: print url_info[0], page_num
        pager_query = '?page=%d' % page_num
        url = url_info[0] + pager_query
        links,contents = get_links_and_contents(url)
        all_links.extend(links)
        all_contents.extend(contents)
        all_labels.extend([i] * len(links))    

## 特徴ベクトルの作成

In [None]:
import MeCab

def get_words_list(text):
    """Return the lower-cased surface forms of the nouns MeCab finds in text.

    A node counts as a noun when the first comma-separated field of its
    feature string is '名詞'.
    """
    tagger = MeCab.Tagger('mecabrc')
    nouns = []
    current = tagger.parseToNode(text)
    # Walk the linked list of nodes produced by the tagger.
    while current:
        part_of_speech = current.feature.split(",")[0]
        if part_of_speech == '名詞':
            nouns.append(current.surface.lower())
        current = current.next
    return nouns

def get_words_matrix(texts_list):
    """Tokenise every text with get_words_list.

    texts_list : ['text1', 'text2',... ]

    Returns one word list per input text, in the same order.
    """
    words_matrix = []
    for text in texts_list:
        words_matrix.append(get_words_list(text))
    return words_matrix


from gensim import corpora, matutils
import numpy as np

res = get_words_matrix(all_contents)
dictionary = corpora.Dictionary(res)
dictionary.filter_extremes(no_below=4 , no_above=0.2)

# [ [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
#   [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
#]
bows = [dictionary.doc2bow(x) for x in res]


X = np.array([(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0]) 
                   for vec in bows])
y = np.array(all_labels)

print 'データ数', X.shape[0]
print '特徴ベクトルの次元数', X.shape[1]

## 訓練データとテストデータに分割

In [None]:
from sklearn.cross_validation import train_test_split
# Hold out roughly one third of the samples for evaluation. random_state is
# fixed so that the same split is produced on every run.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.333333, random_state=12345)
# Unseeded alternative, kept for reference:
# train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.333333)

## ナイーブベイズによる分類器

In [None]:
def calc_cat(y):
    """Estimate the prior P(cat) as the relative frequency of each label.

    Labels are looked up as 0 .. K-1, where K is the number of distinct
    values in y; entry i of the result is the share of samples equal to i.
    """
    n_labels = len(np.unique(y))
    total = float(len(y))
    return np.array([np.argwhere(y == label)[:, 0].size / total
                     for label in range(n_labels)])

def calc_each_word_bar_cat(X, y):
    """Estimate P(word | cat) from word counts with Laplace (add-one) smoothing.

    Args:
        X: 2-D array of word counts, one row per document and one column per
            word id.
        y: Label of every row of X. Labels are looked up as 0 .. K-1, where K
            is the number of distinct values in y.

    Returns:
        A float64 array of shape (K, X.shape[1]); row i holds
        (count of each word in class i + 1) / (all word counts in class i +
        vocabulary size), so every row sums to 1.
    """
    y = np.asarray(y)
    label_kinds = len(np.unique(y))
    vocab_size = X.shape[1]
    ans = np.empty((label_kinds, vocab_size))
    for i in range(label_kinds):
        # Accumulate in float64: the result array is float64 anyway, and a
        # float32 intermediate would truncate the probabilities.
        word_counts = X[y == i].sum(axis=0, dtype=np.float64)
        ans[i] = (word_counts + 1) / (word_counts.sum() + vocab_size)
    return ans

def calc_log_prob(X, y, word_cat_prob, cat_prob, doc):
    """Return the unnormalised log posterior of every category for one document.

    Every word with a non-zero entry in doc contributes log P(word | cat)
    once (the size of the entry is not used), and log P(cat) is added on top.

    Args:
        X: Not used; kept so existing calls keep working.
        y: Not used; kept so existing calls keep working.
        word_cat_prob: Array indexed as [category, word id] holding
            P(word | cat).
        cat_prob: Array with one P(cat) per category.
        doc: Feature vector of a single document, either flat (n_words,) or
            as one row (1, n_words).

    Returns:
        A 1-D array with one log score per category.
    """
    # Promote a flat vector to a single row so both layouts share the same
    # column lookup; an input that is already 2-D is left untouched.
    doc_word_index = np.where(np.atleast_2d(doc) != 0)[1]
    # P(word | cat) for each word contained in doc
    each_word_prob_in_doc = word_cat_prob[:, doc_word_index]
    each_log_word_cat_prob = np.log(each_word_prob_in_doc).sum(axis=1)
    return np.log(cat_prob) + each_log_word_cat_prob


## 性能評価 

In [None]:
cat_prob = calc_cat(train_y)
word_cat_prob = calc_each_word_bar_cat(train_X, train_y)
pred_y = []
docs = test_X
for i in range(len(docs)):
    log_prob = calc_log_prob(train_X,train_y,word_cat_prob,cat_prob, (docs[i])[np.newaxis,:])
    pred_y.append(np.argmax(log_prob))
pred_y = np.array(pred_y)

from sklearn.metrics import f1_score
print '---- my prediction -----'
print f1_score(test_y, pred_y, average='macro')
print 