## テキストとラベルの抽出

In [1]:
import pickle
import re
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup
from prettyprint import pp

def get_categories(url):
    """Scrape the navigation-bar links of the page at ``url``.

    Parameters
    ----------
    url : str
        Address of the page whose ``body > nav > ul > li > a`` anchors are read.

    Returns
    -------
    list of (href, text) tuples
        One pair per navigation anchor, in document order, without the
        first and the last anchor of the navigation list.
    """
    # urllib2 responses are not context managers, hence ``closing``: the
    # connection is released even when parsing raises.
    with closing(urllib2.urlopen(url)) as html:
        soup = BeautifulSoup(html, "lxml")
    # [1:-1] drops the first and last nav entries
    # (presumably non-category links -- confirm against the live page).
    a_list = soup.select("body > nav > ul > li > a")[1:-1]
    return [(a.get("href"), a.string) for a in a_list]
    
# (href, name) pairs of the navigation entries found on the top page;
# the position of a pair in this list is later used as its numeric label.
categories = get_categories("https://gunosy.com/")

def get_content(page_url):
    """Download an article page and return its text as UTF-8 encoded bytes.

    Parameters
    ----------
    page_url : str
        Address of the article page.

    Returns
    -------
    str
        Concatenated text of every element matched by
        ``body div.main.article_main > div.article.gtm-click``, encoded as
        UTF-8. Empty when nothing matches.
    """
    # Close the HTTP response deterministically; the soup is fully built
    # before the ``with`` block exits.
    with closing(urllib2.urlopen(page_url)) as html:
        soup = BeautifulSoup(html, "lxml")
    content_tags = soup.select("body div.main.article_main > div.article.gtm-click")
    return ''.join(tag.get_text() for tag in content_tags).encode('utf-8')
    
def get_links_and_contents(category_url):
    """Collect the article links of one listing page and download each article.

    Parameters
    ----------
    category_url : str
        Address of a category listing page (including any ``?page=`` query).

    Returns
    -------
    (links, contents) : tuple of two lists
        ``links[k]`` is the ``href`` of the k-th article anchor on the page and
        ``contents[k]`` is the result of ``get_content(links[k])``.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with closing(urllib2.urlopen(category_url)) as html:
        soup = BeautifulSoup(html, "lxml")
    a_list = soup.select("body > div > div > div.main > div.article_list.gtm-click > div.list_content > div.list_text > div.list_title > a")
    links = [a.get("href") for a in a_list]
    # One extra HTTP request per article.
    contents = [get_content(link) for link in links]
    return links, contents

all_contents = []
all_links = []
all_labels = []
for i,url_info in enumerate(categories):
    for page_num in xrange(1,6):
        if page_num % 5 == 0: print url_info[0], page_num
        pager_query = '?page=%d' % page_num
        url = url_info[0] + pager_query
        links,contents = get_links_and_contents(url)
        all_links.extend(links)
        all_contents.extend(contents)
        all_labels.extend([i] * len(links))    

https://gunosy.com/categories/1 5
https://gunosy.com/categories/2 5
https://gunosy.com/categories/3 5
https://gunosy.com/categories/4 5
https://gunosy.com/categories/5 5
https://gunosy.com/categories/6 5
https://gunosy.com/categories/7 5
https://gunosy.com/categories/8 5


## 特徴ベクトルの作成

In [2]:
import MeCab

def get_words_list(text, tagger=None):
    """Return the lower-cased surface form of every noun found in ``text``.

    Parameters
    ----------
    text : str
        Text to analyse; handed unchanged to ``tagger.parseToNode``.
    tagger : object, optional
        Any object exposing ``parseToNode(text)``. When omitted, one
        ``MeCab.Tagger('mecabrc')`` is created on first use and reused by
        every later call instead of being rebuilt for each document.

    Returns
    -------
    list
        Noun surfaces in order of appearance; empty when there are none.
    """
    if tagger is None:
        # Cache the default tagger on the function object so repeated calls
        # (one per article) share a single instance.
        tagger = getattr(get_words_list, '_shared_tagger', None)
        if tagger is None:
            tagger = MeCab.Tagger('mecabrc')
            get_words_list._shared_tagger = tagger

    words_list = []
    node = tagger.parseToNode(text)
    while node:
        # keep the node only when the first comma-separated field of its
        # feature string is '名詞' (noun)
        if node.feature.split(",")[0] == '名詞':
            words_list.append(node.surface.lower())
        node = node.next
    return words_list

def get_words_matrix(texts_list):
    """Tokenise every text with ``get_words_list``.

    texts_list : ['text1', 'text2',... ]
    returns    : one word list per text, in the same order as the input
    """
    words_matrix = []
    for text in texts_list:
        words_matrix.append(get_words_list(text))
    return words_matrix


from gensim import corpora, matutils
import numpy as np

res = get_words_matrix(all_contents)
dictionary = corpora.Dictionary(res)
dictionary.filter_extremes(no_below=4 , no_above=0.2)

# [ [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
#   [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...] ,
#]
bows = [dictionary.doc2bow(x) for x in res]

X = np.array([(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0]) 
                   for vec in bows])
y = np.array(all_labels)

print 'データ数', X.shape[0]
print '特徴ベクトルの次元数', X.shape[1]

データ数 800
特徴ベクトルの次元数 4741


## ナイーブベイズによる分類器

In [3]:
def calc_cat(y): # calculate P(cat)
    """Estimate the prior P(cat) as the relative frequency of each label.

    y is a 1-D array of integer labels. Entry ``k`` of the result is the
    fraction of elements equal to ``k``, for ``k`` in
    ``range(number of distinct labels)``; labels are therefore expected to
    be numbered 0, 1, 2, ... without gaps.
    """
    n_docs = float(len(y))
    n_labels = len(np.unique(y))
    return np.array([np.count_nonzero(y == label) / n_docs
                     for label in range(n_labels)])

def calc_each_word_bar_cat(X, y): # calculate P(word_id1 | cat)
    """Estimate P(word | cat) with add-one (Laplace) smoothing.

    X : 2-D array, one row per document and one column per word id.
    y : 1-D array of labels, one per row of X (expected to be 0, 1, 2, ...).

    Returns an array of shape (number of distinct labels, X.shape[1]) whose
    row ``k`` is ``(word counts in class k + 1) / (total count in class k +
    vocabulary size)``.
    """
    n_labels = len(np.unique(y))
    vocab_size = X.shape[1]
    ans = np.empty((n_labels, vocab_size))
    for label in range(n_labels):
        # rows of X that belong to this class
        rows = X[np.flatnonzero(y == label), :]
        # laplace smoothing: +1 per word in the numerator, +vocab_size below
        smoothed_counts = (rows.sum(axis=0) + 1).astype(np.float32)
        ans[label] = smoothed_counts / (rows.sum() + vocab_size)
    return ans

def calc_log_prob(word_cat_prob, cat_prob, doc): # doc : 1 feature vector
    """Return the unnormalised log-posterior of every category for one document.

    Parameters
    ----------
    word_cat_prob : array, shape (n_categories, n_words)
        P(word | cat), e.g. the output of ``calc_each_word_bar_cat``.
    cat_prob : array, shape (n_categories,)
        P(cat), e.g. the output of ``calc_cat``.
    doc : array-like, shape (n_words,) or (1, n_words)
        Feature vector of the document. Only the presence of a word matters:
        each non-zero entry contributes its log-probability once, whatever
        its count.

    Returns
    -------
    array, shape (n_categories,)
        ``log P(cat) + sum of log P(word | cat)`` over the words in ``doc``.
    """
    # Accept a flat vector as well as a single-row matrix; the column index
    # of each non-zero entry is the word id.
    doc = np.atleast_2d(doc)
    doc_word_ids = np.nonzero(doc)[1]
    each_word_prob_in_doc = word_cat_prob[:, doc_word_ids]  # P(word | cat) of the words present in doc
    each_log_word_cat_prob = np.log(each_word_prob_in_doc).sum(axis=1)
    return np.log(cat_prob) + each_log_word_cat_prob


## 性能評価を定義

In [4]:
def evaluate_model(train_X,test_X,train_y,test_y):
    """Fit the naive Bayes tables on the training split and score the test split.

    The class priors and per-class word probabilities are estimated from
    (train_X, train_y); every row of test_X is assigned the category with
    the highest log-probability. Returns the macro-averaged F1 score of
    those predictions against test_y.

    ``f1_score`` is looked up when the function is called, so it has to be
    imported before the first call.
    """
    cat_prob = calc_cat(train_y)
    word_cat_prob = calc_each_word_bar_cat(train_X, train_y)
    # each test row is passed as a (1, n_features) matrix
    pred_y = np.array([
        np.argmax(calc_log_prob(word_cat_prob, cat_prob, vec[np.newaxis, :]))
        for vec in test_X
    ])
    return f1_score(test_y, pred_y, average='macro')
    
#evaluate_model(train_X,test_X,train_y,test_y)

## Cross validation で評価

In [6]:
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

# train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
# evaluate_model(train_X,test_X,train_y,test_y)


skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state = 6789)
scores = []
for train_index, test_index in skf:
    #print train_index, test_index
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]  
    score = evaluate_model(train_X,test_X,train_y,test_y)
    scores.append(score)
scores = np.array(scores)
print scores
print '結果' ,np.mean(scores)

[ 0.8737455   0.89922887  0.8276687   0.8756959   0.81797754]
結果 0.858863300311


## 入力フォームでURLを受け取り→カテゴリ名予測

### 事前に保持

In [9]:
# Fit the probability tables on the complete data set.
cat_prob = calc_cat(y)
word_cat_prob = calc_each_word_bar_cat(X,y)
# `dictionary` (built in the feature-vector cell) is kept as it is.
# dict converting a label y into its category name (UTF-8 encoded)
cvt_y_to_category_name = {
    label: category[1].encode('utf-8')
    for label, category in enumerate(categories)
}
pp(cvt_y_to_category_name)

SAVE_FLAG = True
if SAVE_FLAG:
    import pickle
    def save_as_pickle(file_path, obj):
        with open(file_path, 'w') as f:
            pickle.dump(obj, f)    
    save_as_pickle('dictionay.dump', dictionary)
    save_as_pickle('cat_prob.dump', cat_prob)
    save_as_pickle('word_cat_prob.dump', word_cat_prob)
    save_as_pickle('cvt_y_to_category_name.dump', cvt_y_to_category_name)
    print '------ Save completed. ------'

{
    "0": "エンタメ", 
    "1": "スポーツ", 
    "2": "おもしろ", 
    "3": "国内", 
    "4": "海外", 
    "5": "コラム", 
    "6": "IT・科学", 
    "7": "グルメ"
}
------ Save completed. ------


### カテゴリ名予測

In [10]:
def get_obj_from_pickle(file_path):
    """Load and return the object pickled in the file at ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that were produced by a trusted source.
    """
    # pickle streams are binary data: read them in 'rb' mode.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def predict_y_of_input_url(input_url, dictionary,word_cat_prob, cat_prob):
    """Predict the numeric category label of the article at ``input_url``.

    The page text is fetched with ``get_content``, reduced to words with
    ``get_words_list`` and turned into a bag-of-words vector with
    ``dictionary``. The return value is the index of the category with the
    highest log-probability under ``word_cat_prob`` and ``cat_prob``.
    """
    words = get_words_list(get_content(input_url))
    bow = dictionary.doc2bow(words)  # [(w_id1, w_id1_cnt), (w_id2, w_id2_cnt),...]
    # corpus2dense gives a (num_terms, 1) column for the single document;
    # transposing it yields the (1, num_terms) row calc_log_prob is given.
    test_X = matutils.corpus2dense([bow], num_terms=len(dictionary)).T
    return np.argmax(calc_log_prob(word_cat_prob, cat_prob, test_X))
    

# deserialize
dir_path = './'
c_prob = get_obj_from_pickle(dir_path+'cat_prob.dump')
w_c_prob = get_obj_from_pickle(dir_path+'word_cat_prob.dump')
cvt_y_to_c_name = get_obj_from_pickle(dir_path+'cvt_y_to_category_name.dump')

import time
st = time.time()

print '-----　入力を受け取ります ------'
input_url = 'https://gunosy.com/articles/at0SJ' # 例
print '入力されたURL :', input_url
pred_y_of_input_url = predict_y_of_input_url(input_url, dictionary,w_c_prob, c_prob)
pred_category_name_of_input_url = cvt_y_to_c_name[pred_y_of_input_url] #数字ラベルを変換
print '----------　予測結果 ------------'
print '予測カテゴリ :', pred_category_name_of_input_url
print
print time.time()-st, '[sec]'

-----　入力を受け取ります ------
入力されたURL : https://gunosy.com/articles/at0SJ
----------　予測結果 ------------
予測カテゴリ : おもしろ

0.844357013702 [sec]
