In [5]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from janome.tokenizer import Tokenizer
import re


# Part-of-speech categories (first field of janome's part_of_speech) that are
# left out of both the surface and the reading halves of the feature.
_EXCLUDED_POS = (u'助動詞',)

# Katakana spelling of each ASCII letter. Every value keeps its trailing space
# so that consecutive letters stay separated once they are concatenated.
_ALPHA_KANA = {
    "a": "エー ", "b": "ビー ", "c": "シー ", "d": "ディー ", "e": "イー ",
    "f": "エフ ", "g": "ジー ", "h": "エイチ ", "i": "アイ ", "j": "ジェー ",
    "k": "ケー ", "l": "エル ", "m": "エム ", "n": "エヌ ", "o": "オー ",
    "p": "ピー ", "q": "キュー ", "r": "アール ", "s": "エス ", "t": "ティー ",
    "u": "ユー ", "v": "ブイ ", "w": "ダブリュー ", "x": "エックス ",
    "y": "ワイ ", "z": "ゼット ",
}

# Katakana reading of each ASCII digit (same trailing-space convention).
_DIGIT_KANA = {
    "0": "ゼロ ", "1": "イチ ", "2": "ニ ", "3": "サン ", "4": "ヨン ",
    "5": "ゴ ", "6": "ロク ", "7": "ナナ ", "8": "ハチ ", "9": "キュー ",
}

# Shared tokenizer instance; created on first use by _get_tokenizer().
_tokenizer = None


def _get_tokenizer():
    """Return one shared Tokenizer, constructing it only on the first call."""
    global _tokenizer
    if _tokenizer is None:
        _tokenizer = Tokenizer()
    return _tokenizer


def _spell_out(word, table):
    """Replace every character of `word` found in `table`; keep the others as-is."""
    return "".join(table.get(ch, ch) for ch in word)


def wakati_reading(text):
    """Build the classifier feature string for one utterance.

    The text has apostrophes removed and is lower-cased, then tokenized.
    Tokens whose top-level part of speech is in `_EXCLUDED_POS` are dropped.
    The result is the space-separated token surfaces, one space, and then the
    space-separated token readings.

    Reading of a token:
      * the tokenizer's reading when it is not "*";
      * otherwise, a base form made only of a-z is spelled letter by letter
        in katakana;
      * otherwise, a base form made only of 0-9 is read digit by digit;
      * otherwise the token contributes no reading.

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        "<surfaces> <readings>". An input with no kept tokens yields " ".
    """
    normalized = text.replace("'", "").lower()

    # tokenize() may hand back a one-shot generator (janome >= 0.4). Collect
    # the kept tokens into a list so both halves of the feature see them;
    # iterating the generator twice would leave the readings empty.
    tokens = [
        token for token in _get_tokenizer().tokenize(normalized)
        if token.part_of_speech.split(',')[0] not in _EXCLUDED_POS
    ]

    # Word-segmented surface forms.
    surfaces = " ".join(token.surface for token in tokens).strip()

    # Readings.
    readings = []
    for token in tokens:
        if token.reading != "*":
            readings.append(token.reading)
        elif re.match('^[a-z]+$', token.base_form):
            # Not stripped: the trailing space of the last letter is kept so
            # the output is identical to what earlier versions produced.
            readings.append(_spell_out(token.base_form, _ALPHA_KANA))
        elif re.match('^[0-9]+$', token.base_form):
            readings.append(_spell_out(token.base_form, _DIGIT_KANA).strip())

    return surfaces + " " + " ".join(readings).strip()

def main():
    """Train the intent classifier from the raw CSV and save the fitted pipeline.

    Steps:
      1. Read ./data/intent_data_jp_ja.csv as (text, intent) columns.
      2. Encode intents as integer labels with a LabelEncoder.
      3. Turn each text into a surface+reading feature string via
         wakati_reading().
      4. Hold out 20% of the rows (random_state=12), fit a TF-IDF +
         gradient-boosting pipeline on the rest, and print the accuracy on
         the held-out rows.
      5. Dump the fitted pipeline to ./model/ja_jp_v6.pkl.

    Returns None.
    """
    data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

    # NOTE(review): the fitted encoder is not saved anywhere in this function,
    # so the saved model predicts integer labels only — confirm that consumers
    # can rebuild the label -> intent mapping.
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(data['intent'])

    features = data['text'].apply(wakati_reading)

    train_X, val_X, train_Y, val_Y = train_test_split(features, labels,
                                                      test_size=.2,
                                                      random_state=12)

    # Token = run of ASCII alphanumerics, '-', kana, the long-vowel mark, or kanji.
    # Character-class ranges are compared by Unicode code point, so the kanji
    # range must be the CJK block (一-龥). The previous JIS-order range 亜-黑
    # silently dropped common kanji such as 一, 上, 中, 事 and 黒.
    # The backslash is doubled so the literal contains no invalid escape.
    token_pattern = u'[A-Za-z0-9\\-ぁ-ヶ一-龥ー]{1,}'

    clf_final = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000,
                                  sublinear_tf=True, token_pattern=token_pattern)),
        ('clf', GradientBoostingClassifier(
            random_state=1337,
            verbose=0,
            n_estimators=20,
            learning_rate=0.1,
            loss='deviance',
            max_depth=3
        )),
    ])

    clf_final = clf_final.fit(train_X, train_Y)

    # The hold-out split was previously created but never looked at.
    print('validation accuracy: %.4f' % clf_final.score(val_X, val_Y))

    joblib.dump(clf_final, './model/ja_jp_v6.pkl')
    
# Run the training when this cell/script is executed directly; skipped when
# the code is imported as a module.
if __name__ == '__main__':
    main()

In [3]:
import pandas as pd
from sklearn import preprocessing
from janome.tokenizer import Tokenizer
import re

# Part-of-speech categories (first field of janome's part_of_speech) that are
# left out of both the surface and the reading halves of the feature.
_EXCLUDED_POS = (u'助動詞',)

# Katakana spelling of each ASCII letter. Every value keeps its trailing space
# so that consecutive letters stay separated once they are concatenated.
_ALPHA_KANA = {
    "a": "エー ", "b": "ビー ", "c": "シー ", "d": "ディー ", "e": "イー ",
    "f": "エフ ", "g": "ジー ", "h": "エイチ ", "i": "アイ ", "j": "ジェー ",
    "k": "ケー ", "l": "エル ", "m": "エム ", "n": "エヌ ", "o": "オー ",
    "p": "ピー ", "q": "キュー ", "r": "アール ", "s": "エス ", "t": "ティー ",
    "u": "ユー ", "v": "ブイ ", "w": "ダブリュー ", "x": "エックス ",
    "y": "ワイ ", "z": "ゼット ",
}

# Katakana reading of each ASCII digit (same trailing-space convention).
_DIGIT_KANA = {
    "0": "ゼロ ", "1": "イチ ", "2": "ニ ", "3": "サン ", "4": "ヨン ",
    "5": "ゴ ", "6": "ロク ", "7": "ナナ ", "8": "ハチ ", "9": "キュー ",
}

# Shared tokenizer instance; created on first use by _get_tokenizer().
_tokenizer = None


def _get_tokenizer():
    """Return one shared Tokenizer, constructing it only on the first call."""
    global _tokenizer
    if _tokenizer is None:
        _tokenizer = Tokenizer()
    return _tokenizer


def _spell_out(word, table):
    """Replace every character of `word` found in `table`; keep the others as-is."""
    return "".join(table.get(ch, ch) for ch in word)


def wakati_reading(text):
    """Build the classifier feature string for one utterance.

    The text has apostrophes removed and is lower-cased, then tokenized.
    Tokens whose top-level part of speech is in `_EXCLUDED_POS` are dropped.
    The result is the space-separated token surfaces, one space, and then the
    space-separated token readings.

    Reading of a token:
      * the tokenizer's reading when it is not "*";
      * otherwise, a base form made only of a-z is spelled letter by letter
        in katakana;
      * otherwise, a base form made only of 0-9 is read digit by digit;
      * otherwise the token contributes no reading.

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        "<surfaces> <readings>". An input with no kept tokens yields " ".
    """
    normalized = text.replace("'", "").lower()

    # tokenize() may hand back a one-shot generator (janome >= 0.4). Collect
    # the kept tokens into a list so both halves of the feature see them;
    # iterating the generator twice would leave the readings empty.
    tokens = [
        token for token in _get_tokenizer().tokenize(normalized)
        if token.part_of_speech.split(',')[0] not in _EXCLUDED_POS
    ]

    # Word-segmented surface forms.
    surfaces = " ".join(token.surface for token in tokens).strip()

    # Readings.
    readings = []
    for token in tokens:
        if token.reading != "*":
            readings.append(token.reading)
        elif re.match('^[a-z]+$', token.base_form):
            # Not stripped: the trailing space of the last letter is kept so
            # the output is identical to what earlier versions produced.
            readings.append(_spell_out(token.base_form, _ALPHA_KANA))
        elif re.match('^[0-9]+$', token.base_form):
            readings.append(_spell_out(token.base_form, _DIGIT_KANA).strip())

    return surfaces + " " + " ".join(readings).strip()

# Pre-compute the training features once and store them as (label, feature)
# rows, so that training can start from the CSV without re-tokenizing.
le = preprocessing.LabelEncoder()
data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

# Integer-encode the intents and derive the surface+reading feature string.
data['label'] = le.fit_transform(data['intent'])
data['feature'] = data['text'].apply(wakati_reading)

# Keep only the two derived columns, label first; the file is written without
# a header row or index column.
data = data[['label', 'feature']]
data.to_csv('./data/feature_data_jp_ja.csv', header=False, index=False, encoding="utf-8")

In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

def main():
    """Train the intent classifier from the pre-computed feature CSV and save it.

    Steps:
      1. Read ./data/feature_data_jp_ja.csv as (label, feature) columns.
      2. Hold out 20% of the rows (random_state=12), fit a TF-IDF +
         gradient-boosting pipeline on the rest, and print the accuracy on
         the held-out rows.
      3. Dump the fitted pipeline to ./model/ja_jp_v7.pkl.

    Returns None.
    """
    data = pd.read_csv('./data/feature_data_jp_ja.csv', sep=',', names=['label', 'feature'])

    train_X, val_X, train_Y, val_Y = train_test_split(data['feature'], data['label'],
                                                      test_size=.2,
                                                      random_state=12)

    # Token = run of ASCII alphanumerics, '-', kana, the long-vowel mark, or kanji.
    # Character-class ranges are compared by Unicode code point, so the kanji
    # range must be the CJK block (一-龥). The previous JIS-order range 亜-黑
    # silently dropped common kanji such as 一, 上, 中, 事 and 黒.
    # The backslash is doubled so the literal contains no invalid escape.
    token_pattern = u'[A-Za-z0-9\\-ぁ-ヶ一-龥ー]{1,}'

    clf_final = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000,
                                  sublinear_tf=True, token_pattern=token_pattern)),
        ('clf', GradientBoostingClassifier(
            random_state=1337,
            verbose=0,
            n_estimators=20,
            learning_rate=0.1,
            loss='deviance',
            max_depth=3
        )),
    ])

    clf_final = clf_final.fit(train_X, train_Y)

    # The hold-out split was previously created but never looked at.
    print('validation accuracy: %.4f' % clf_final.score(val_X, val_Y))

    joblib.dump(clf_final, './model/ja_jp_v7.pkl')
    
# Run the training when this cell/script is executed directly; skipped when
# the code is imported as a module.
if __name__ == '__main__':
    main()