In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score, classification_report , roc_auc_score , precision_recall_curve , average_precision_score, auc , roc_curve
import scipy.stats as st
from janome.tokenizer import Tokenizer
import re
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

In [4]:
data.shape

(4215, 2)

In [5]:
data.head(3)

Unnamed: 0,text,intent
0,ラジオ日本聞きたい,JORF
1,ラジオ日本を聞かせて,JORF
2,ラジオ日本を再生,JORF


In [6]:
le = preprocessing.LabelEncoder()

data['label'] = le.fit_transform(data['intent'])

In [7]:
data.shape

(4215, 3)

In [8]:
data

Unnamed: 0,text,intent,label
0,ラジオ日本聞きたい,JORF,5
1,ラジオ日本を聞かせて,JORF,5
2,ラジオ日本を再生,JORF,5
3,ラジオ日本再生,JORF,5
4,ラジオ日本を再生して,JORF,5
5,ラジオ日本を再生,JORF,5
6,ラジオ日本かけて,JORF,5
7,ラジオ日本して,JORF,5
8,ラジオ日本にまわして,JORF,5
9,ラジオ日本に変更して,JORF,5


In [9]:
data.label.unique()

array([ 5,  0,  7, 16,  3, 18, 19, 20, 13,  9,  6, 10, 11,  4,  2,  1, 12,
       15, 14,  8, 17], dtype=int64)

In [10]:
data = data.drop(['intent'], axis=1)

In [11]:
data.head(3)

Unnamed: 0,text,label
0,ラジオ日本聞きたい,5
1,ラジオ日本を聞かせて,5
2,ラジオ日本を再生,5


In [12]:
j_tokenizer = Tokenizer()

def wakati_reading(text):
    tokens = j_tokenizer.tokenize(text.replace("'", "").lower())
    
    exclude_pos = [u'助動詞']
    
    #分かち書き
    tokens_w_space = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
        
        if partOfSpeech not in exclude_pos:
            tokens_w_space = tokens_w_space + " " + token.surface

    tokens_w_space = tokens_w_space.strip()
    
    #読み方
    tokens_reading = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
 
        if partOfSpeech not in exclude_pos:
            if token.reading != "*":
                tokens_reading = tokens_reading + " " + token.reading
            elif re.match('^[a-z]+$', token.base_form):
                alpha_reading = ""
                alpha_reading = token.base_form.replace("a", "エー ")
                alpha_reading = alpha_reading.replace("b", "ビー ")
                alpha_reading = alpha_reading.replace("c", "シー ")
                alpha_reading = alpha_reading.replace("d", "ディー ")
                alpha_reading = alpha_reading.replace("e", "イー ")
                alpha_reading = alpha_reading.replace("f", "エフ ")
                alpha_reading = alpha_reading.replace("g", "ジー ")
                alpha_reading = alpha_reading.replace("h", "エイチ ")
                alpha_reading = alpha_reading.replace("i", "アイ ")
                alpha_reading = alpha_reading.replace("j", "ジェー ")
                alpha_reading = alpha_reading.replace("k", "ケー ")
                alpha_reading = alpha_reading.replace("l", "エル ")
                alpha_reading = alpha_reading.replace("m", "エム ")
                alpha_reading = alpha_reading.replace("n", "エヌ ")
                alpha_reading = alpha_reading.replace("o", "オー ")
                alpha_reading = alpha_reading.replace("p", "ピー ")
                alpha_reading = alpha_reading.replace("q", "キュー ")
                alpha_reading = alpha_reading.replace("r", "アール ")
                alpha_reading = alpha_reading.replace("s", "エス ")
                alpha_reading = alpha_reading.replace("t", "ティー ")
                alpha_reading = alpha_reading.replace("u", "ユー ")
                alpha_reading = alpha_reading.replace("v", "ブイ ")
                alpha_reading = alpha_reading.replace("w", "ダブリュー ")
                alpha_reading = alpha_reading.replace("x", "エックス ")
                alpha_reading = alpha_reading.replace("y", "ワイ ")
                alpha_reading = alpha_reading.replace("z", "ゼット ")

                tokens_reading = tokens_reading + " " + alpha_reading
            elif re.match('^[0-9]+$', token.base_form):
                numeric_reading = ""
                numeric_reading = token.base_form.replace("0", "ゼロ ")
                numeric_reading = numeric_reading.replace("1", "イチ ")
                numeric_reading = numeric_reading.replace("2", "ニ ")
                numeric_reading = numeric_reading.replace("3", "サン ")
                numeric_reading = numeric_reading.replace("4", "ヨン ")
                numeric_reading = numeric_reading.replace("5", "ゴ ")
                numeric_reading = numeric_reading.replace("6", "ロク ")
                numeric_reading = numeric_reading.replace("7", "ナナ ")
                numeric_reading = numeric_reading.replace("8", "ハチ ")
                numeric_reading = numeric_reading.replace("9", "キュー ")

                tokens_reading = tokens_reading + " " + numeric_reading.strip()

    tokens_reading = tokens_reading.strip()
    
    feature = tokens_w_space + " " + tokens_reading
    
    return feature

In [13]:
#Tokenize, remove stop words / pos, and add pronunciation
data['feature'] = data['text'].apply(lambda x: wakati_reading(x))
data = data.drop(['text'], axis=1)

data.head(3)

Unnamed: 0,label,feature
0,5,ラジオ 日本 聞き ラジオ ニッポン キキ
1,5,ラジオ 日本 を 聞か せ て ラジオ ニッポン ヲ キカ セ テ
2,5,ラジオ 日本 を 再生 ラジオ ニッポン ヲ サイセイ


In [14]:
train = data.drop(['label'], axis=1)
train_label = data['label']

In [15]:
#Split to training and validation
train_X, val_X, train_Y, val_Y = train_test_split(train, train_label,
                                                  test_size = .2,
                                                  random_state=12)

In [16]:
train_X.head(3)

Unnamed: 0,feature
3661,終了 し て いい よ シュウリョウ シ テ イイ ヨ
381,bay sm に 変え て ください ビー エー ワイ エス エム ニ カエ テ クダサイ
243,bayfm を 聞き ビー エー ワイ エフ エム ヲ キキ


In [17]:
train_Y.head(3)

3661    12
381      0
243      0
Name: label, dtype: int64

In [18]:
val_X.head(3)

Unnamed: 0,feature
2069,文化放送 を 再生 し て 欲しい ん が ブンカホウソウ ヲ サイセイ シ テ ホシイ ン ガ
1318,nhk ワン を 再生 し て ください エヌ エイチ ケー ワン ヲ サイセイ シ ...
2127,ブンカホウソウ を かけ て ください ヲ カケ テ クダサイ


In [19]:
val_Y.head(3)

2069     9
1318    18
2127     9
Name: label, dtype: int64

In [20]:
#Apply tfidf with a custom Japanese token pattern
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=5000,
                    sublinear_tf=True, token_pattern=u'[A-Za-z0-9\-ぁ-ヶ亜-黑ー]{1,}')

train_X_tf =  tf.fit_transform(train_X['feature'])
val_X_tf =  tf.transform(val_X['feature'])

In [21]:
train_X.shape

(3372, 1)

In [22]:
train_X_tf.shape

(3372, 5000)

In [23]:
val_X_tf.shape

(843, 5000)

Model development

In [24]:
kfolds = StratifiedKFold(3)

gb_model = GradientBoostingClassifier()

parameters = {'learning_rate': st.uniform(0.01, 0.1), #so called `eta` value
              'max_depth': st.randint(3, 7),
              'n_estimators': st.randint(5, 50),
              'subsample': st.beta(10, 1),
              'random_state': [1337]}

#y = label_binarize(train_Y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

clf_cv = RandomizedSearchCV(gb_model, param_distributions=parameters, n_jobs=-1, 
                   cv=kfolds.split(train_X_tf, train_Y),
                   n_iter = 5,
                   scoring='f1_weighted',
                   verbose=1)

clf_cv.fit(train_X_tf, train_Y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.2min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000026B7A616620>,
          error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026B00B0B278>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026B00A0FEF0>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026B00A0FFD0>, 'random_state': [1337], 'learning_rat

In [25]:
clf_cv.best_score_

0.9912364120143865

In [26]:
best_parameters, score, _ = max(clf_cv.grid_scores_, key=lambda x: x[1])

print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

Raw AUC score: 0.991236412014
learning_rate: 0.07857631252683478
max_depth: 4
n_estimators: 38
random_state: 1337
subsample: 0.85690600047481891




In [27]:
#Apply the model to the validation set
_predictions = clf_cv.predict(val_X_tf)
_probas = clf_cv.predict_proba(val_X_tf)

target_map = {u'0': 0, u'1': 1, u'2': 2, u'3': 3, u'4': 4, u'5': 5, u'6': 6, u'7': 7, u'8': 8, u'9': 9, u'10': 10, u'11': 11, u'12': 12, u'13': 13, u'14': 14, u'15': 15, u'16': 16, u'17': 17, u'18': 18, u'19': 19, u'20': 20}

predictions = pd.Series(data=_predictions, index=val_X.index, name='predicted_value')
cols = [
    u'probability_of_%s' % label
    for (_, label) in sorted([(int(label_id), label) for (label, label_id) in target_map.items()])
]
probabilities = pd.DataFrame(data=_probas, index=val_X.index, columns=cols)

# Build scored dataset
results_val = val_X.join(predictions, how='left')
results_val = results_val.join(probabilities, how='left')
results_val = results_val.join(val_Y, how='left')

In [28]:
results_val.head(2)

Unnamed: 0,feature,predicted_value,probability_of_0,probability_of_1,probability_of_2,probability_of_3,probability_of_4,probability_of_5,probability_of_6,probability_of_7,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
2069,文化放送 を 再生 し て 欲しい ん が ブンカホウソウ ヲ サイセイ シ テ ホシイ ン ガ,9,0.000234,0.000235,0.000235,0.000236,0.000235,0.00033,0.000234,0.000235,...,0.001491,0.000235,0.000228,0.000227,0.000235,0.002359,0.000235,0.000235,0.000433,9
1318,nhk ワン を 再生 し て ください エヌ エイチ ケー ワン ヲ サイセイ シ ...,18,0.000223,0.000224,0.000224,0.000225,0.000224,0.000314,0.000223,0.000223,...,0.001419,0.000223,0.000217,0.000216,0.000224,0.002245,0.980926,0.000224,0.01154,18


In [29]:
#Metrics
print(clf_cv.score(val_X_tf, val_Y))
print(recall_score(val_Y, clf_cv.predict(val_X_tf), average='weighted'))
print(precision_score(val_Y, clf_cv.predict(val_X_tf), average='weighted'))

0.99643486556
0.996441281139
0.996524526024


Create a pipeline and save the model as a pickle file

In [30]:
#Decrease n_estimators for performance
clf_final = Pipeline([('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=5000,
                    sublinear_tf=True, token_pattern=u'[A-Za-z0-9\-ぁ-ヶ亜-黑ー]{1,}')),
                           ('clf', GradientBoostingClassifier(
                                random_state = 1337,
                                verbose = 0,
                                n_estimators = 20,
                                learning_rate = 0.079,
                                loss = 'deviance',
                                subsample = 0.86,
                                max_depth = 4
                               ))])

In [31]:
clf_final = clf_final.fit(train_X['feature'], train_Y)

predicted = clf_final.predict(val_X['feature'])
np.mean(predicted == val_Y)

0.99644128113879005

In [32]:
#Saving the best model
joblib.dump(clf_final, './model/ja_jp_v7.pkl')

['./model/ja_jp_v8.pkl']