In [53]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score, classification_report , roc_auc_score , precision_recall_curve , average_precision_score, auc , roc_curve
#from imblearn.over_sampling import SMOTE
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import scipy.stats as st
from janome.tokenizer import Tokenizer
import re
import pickle
from sklearn.pipeline import Pipeline

In [31]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [32]:
data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

In [33]:
data.shape

(4207, 2)

In [34]:
data.head(3)

Unnamed: 0,text,intent
0,ラジオ日本聞きたい,JORF
1,ラジオ日本を聞かせて,JORF
2,ラジオ日本を再生,JORF


In [35]:
le = preprocessing.LabelEncoder()

data['label'] = le.fit_transform(data['intent'])

In [36]:
data.shape

(4207, 3)

In [37]:
data

Unnamed: 0,text,intent,label
0,ラジオ日本聞きたい,JORF,5
1,ラジオ日本を聞かせて,JORF,5
2,ラジオ日本を再生,JORF,5
3,ラジオ日本再生,JORF,5
4,ラジオ日本を再生して,JORF,5
5,ラジオ日本を再生,JORF,5
6,ラジオ日本かけて,JORF,5
7,ラジオ日本して,JORF,5
8,ラジオ日本にまわして,JORF,5
9,ラジオ日本に変更して,JORF,5


In [38]:
data.label.unique()

array([ 5,  0,  7, 16,  3, 18, 19, 20, 13,  9,  6, 10, 11,  4,  2,  1, 12,
       15, 14,  8, 17], dtype=int64)

In [39]:
data = data.drop(['intent'], axis=1)

In [40]:
data.head(3)

Unnamed: 0,text,label
0,ラジオ日本聞きたい,5
1,ラジオ日本を聞かせて,5
2,ラジオ日本を再生,5


In [41]:
j_tokenizer = Tokenizer()

def wakati_reading(text):
    tokens = j_tokenizer.tokenize(text.replace("'", "").lower())
    
    exclude_pos = [u'助動詞']
    
    #分かち書き
    tokens_w_space = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
        
        if partOfSpeech not in exclude_pos:
            tokens_w_space = tokens_w_space + " " + token.surface

    tokens_w_space = tokens_w_space.strip()
    
    #読み方
    tokens_reading = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
 
        if partOfSpeech not in exclude_pos:
            if token.reading != "*":
                tokens_reading = tokens_reading + " " + token.reading
            elif re.match('^[a-z]+$', token.base_form):
                alpha_reading = ""
                alpha_reading = token.base_form.replace("a", "エー ")
                alpha_reading = alpha_reading.replace("b", "ビー ")
                alpha_reading = alpha_reading.replace("c", "シー ")
                alpha_reading = alpha_reading.replace("d", "ディー ")
                alpha_reading = alpha_reading.replace("e", "イー ")
                alpha_reading = alpha_reading.replace("f", "エフ ")
                alpha_reading = alpha_reading.replace("g", "ジー ")
                alpha_reading = alpha_reading.replace("h", "エイチ ")
                alpha_reading = alpha_reading.replace("i", "アイ ")
                alpha_reading = alpha_reading.replace("j", "ジェー ")
                alpha_reading = alpha_reading.replace("k", "ケー ")
                alpha_reading = alpha_reading.replace("l", "エル ")
                alpha_reading = alpha_reading.replace("m", "エム ")
                alpha_reading = alpha_reading.replace("n", "エヌ ")
                alpha_reading = alpha_reading.replace("o", "オー ")
                alpha_reading = alpha_reading.replace("p", "ピー ")
                alpha_reading = alpha_reading.replace("q", "キュー ")
                alpha_reading = alpha_reading.replace("r", "アール ")
                alpha_reading = alpha_reading.replace("s", "エス ")
                alpha_reading = alpha_reading.replace("t", "ティー ")
                alpha_reading = alpha_reading.replace("u", "ユー ")
                alpha_reading = alpha_reading.replace("v", "ブイ ")
                alpha_reading = alpha_reading.replace("w", "ダブリュー ")
                alpha_reading = alpha_reading.replace("x", "エックス ")
                alpha_reading = alpha_reading.replace("y", "ワイ ")
                alpha_reading = alpha_reading.replace("z", "ゼット ")

                tokens_reading = tokens_reading + " " + alpha_reading
            elif re.match('^[0-9]+$', token.base_form):
                numeric_reading = ""
                numeric_reading = token.base_form.replace("0", "ゼロ ")
                numeric_reading = numeric_reading.replace("1", "イチ ")
                numeric_reading = numeric_reading.replace("2", "ニ ")
                numeric_reading = numeric_reading.replace("3", "サン ")
                numeric_reading = numeric_reading.replace("4", "ヨン ")
                numeric_reading = numeric_reading.replace("5", "ゴ ")
                numeric_reading = numeric_reading.replace("6", "ロク ")
                numeric_reading = numeric_reading.replace("7", "ナナ ")
                numeric_reading = numeric_reading.replace("8", "ハチ ")
                numeric_reading = numeric_reading.replace("9", "キュー ")

                tokens_reading = tokens_reading + " " + numeric_reading.strip()

    tokens_reading = tokens_reading.strip()
    
    feature = tokens_w_space + " " + tokens_reading
    
    return feature

In [42]:
data['feature'] = data['text'].apply(lambda x: wakati_reading(x))
data = data.drop(['text'], axis=1)

data.head(3)

Unnamed: 0,label,feature
0,5,ラジオ 日本 聞き ラジオ ニッポン キキ
1,5,ラジオ 日本 を 聞か せ て ラジオ ニッポン ヲ キカ セ テ
2,5,ラジオ 日本 を 再生 ラジオ ニッポン ヲ サイセイ


In [44]:
train = data.drop(['label'], axis=1)
train_label = data['label']

In [45]:
#Modeling
train_X, val_X, train_Y, val_Y = train_test_split(train, train_label,
                                                  test_size = .2,
                                                  random_state=12)

In [46]:
train_X.head(3)

Unnamed: 0,feature
1053,ほう そう だい がく を 聞き ホウ ソウ ダイ ガク ヲ キキ
176,ラジオ に ほん を 再生 し て 欲しい の けど ラジオ ニ ホン ヲ サイセイ シ テ...
4111,松任谷 由実 の 動画 を youtube 再生 し て マツトウヤ ユミ ノ ドウ...


In [47]:
train_Y.head(3)

1053     3
176      5
4111    17
Name: label, dtype: int64

In [48]:
val_X.head(3)

Unnamed: 0,feature
913,放送大学 を 再生 し て ください ホウソウダイガク ヲ サイセイ シ テ クダサイ
1387,nhk r 2 が 聞き と 思い エヌ エイチ ケー アール ニ ガ キキ ト...
667,ナック ファイブ ご が 聞き と 思い ナック ファイブ ゴ ガ キキ ト オモイ


In [49]:
val_Y.head(3)

913      3
1387    19
667      7
Name: label, dtype: int64

In [50]:
# Text Features
from sklearn.feature_extraction.text import HashingVectorizer

text_features = {u'feature': 100}

for (feature_name, num_tokens) in text_features.items():
    n_components = num_tokens
    hashv = HashingVectorizer(n_features=num_tokens, token_pattern=u'[A-Za-z0-9\-ぁ-ヶ亜-黑ー]{1,}')
    train_transformed = hashv.fit_transform(train_X[feature_name])
    test_transformed = hashv.transform(val_X[feature_name])

   
    for i in range(0, n_components):
        train_X[feature_name + ":text:" + str(i)] = train_transformed[:,i].todense()
        val_X[feature_name + ":text:" + str(i)] = test_transformed[:,i].todense()
        
    train_X.drop(feature_name, axis=1, inplace=True)
    val_X.drop(feature_name, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [54]:
#save HashingVectorizer
with open("./model/hashing_vectorizer.pkl", 'wb') as handle:
    pickle.dump(hashv, handle)

In [22]:
train_X.head(3)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,feature:text:90,feature:text:91,feature:text:92,feature:text:93,feature:text:94,feature:text:95,feature:text:96,feature:text:97,feature:text:98,feature:text:99
1053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0
176,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,...,0.0,0.213201,0.0,0.0,0.213201,0.0,0.0,0.0,0.213201,0.0
4111,0.0,0.0,-0.176777,0.0,0.0,0.0,0.0,0.176777,-0.176777,-0.353553,...,-0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

clf = XGBClassifier(
                    seed = 1337,
                    n_estimators = 50,
                    learning_rate = 0.1,
                    max_depth = 3
                   )

In [24]:
#Training
clf.fit(train_X, train_Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1337, silent=True, subsample=1)

In [25]:
#Apply the model to the validation set
_predictions = clf.predict(val_X)
_probas = clf.predict_proba(val_X)

target_map = {u'0': 0, u'1': 1, u'2': 2, u'3': 3, u'4': 4, u'5': 5, u'6': 6, u'7': 7, u'8': 8, u'9': 9, u'10': 10, u'11': 11, u'12': 12, u'13': 13, u'14': 14, u'15': 15, u'16': 16, u'17': 17, u'18': 18, u'19': 19, u'20': 20}

predictions = pd.Series(data=_predictions, index=val_X.index, name='predicted_value')
cols = [
    u'probability_of_%s' % label
    for (_, label) in sorted([(int(label_id), label) for (label, label_id) in target_map.items()])
]
probabilities = pd.DataFrame(data=_probas, index=val_X.index, columns=cols)

# Build scored dataset
results_val = val_X.join(predictions, how='left')
results_val = results_val.join(probabilities, how='left')
results_val = results_val.join(val_Y, how='left')

In [26]:
results_val.head(2)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,...,0.001883,0.000941,0.00113,0.001071,0.000941,0.002352,0.000944,0.000936,0.000938,3
1387,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.235702,0.0,0.0,...,0.000722,0.000599,0.000684,0.000681,0.000598,0.004334,0.0006,0.980137,0.001747,19


In [27]:
print(clf.score(val_X, val_Y))
print(recall_score(val_Y, clf.predict(val_X), average='weighted'))
print(precision_score(val_Y, clf.predict(val_X), average='weighted'))

0.991686460808
0.991686460808
0.99241363843


In [280]:
print(clf.score(val_X, val_Y))
print(recall_score(val_Y, clf.predict(val_X), average='weighted'))
print(precision_score(val_Y, clf.predict(val_X), average='weighted'))

0.997624703088
0.997624703088
0.997921615202


In [281]:
results_val.head(200)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
913,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.235702,0.000000,0.000000,...,0.000111,0.000048,0.000056,0.000050,0.000047,0.000445,0.000050,0.000021,0.000028,3
1387,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.235702,0.235702,0.000000,0.000000,...,0.000017,0.000011,0.000008,0.000007,0.000012,0.000236,0.000064,0.999111,0.000138,19
667,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-0.267261,0.000000,0.000000,0.000000,...,0.000128,0.000060,0.000074,0.000042,0.000071,0.000811,0.000079,0.000051,0.000046,7
2266,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.002190,0.000106,0.000077,0.000082,0.000126,0.002556,0.000121,0.000049,0.000069,6
2206,0.000000,0.000000,0.000000,0.000000,0.0,-0.288675,0.000000,0.288675,0.000000,0.000000,...,0.000239,0.000025,0.000042,0.000030,0.000069,0.000149,0.000028,0.000012,0.000016,9
188,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.235702,0.000000,0.000000,...,0.000295,0.000058,0.000092,0.000044,0.000067,0.000088,0.000065,0.000056,0.000037,5
3798,0.316228,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.316228,0.000000,0.000000,...,0.001545,0.000010,0.996652,0.001375,0.000012,0.000026,0.000011,0.000006,0.000006,14
3406,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,-0.258199,...,0.000239,0.000021,0.000030,0.000019,0.000035,0.000040,0.000023,0.000018,0.000013,1
1746,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000016,0.000012,0.000009,0.000008,0.000014,0.000054,0.000365,0.000232,0.998023,20
869,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.250000,0.000000,0.000000,...,0.000026,0.000011,0.000012,0.000012,0.999571,0.000092,0.000011,0.000005,0.000006,16


In [313]:
kfolds = StratifiedKFold(3)

xgb_model = XGBClassifier(objective = 'multi:softmax', nthread=-1, reg_alpha=0, reg_lambda=1)

    
parameters = {'learning_rate': st.uniform(0.01, 0.1), #so called `eta` value
              'max_depth': st.randint(3, 7),
              'n_estimators': st.randint(5, 100),
              'min_child_weight': [1, 2, 3],
              'subsample': st.beta(10, 1),
              'colsample_bytree': st.beta(10, 1),
              'seed': [1337]}

#y = label_binarize(train_Y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

clf_cv = RandomizedSearchCV(xgb_model, param_distributions=parameters, n_jobs=-1, 
                   cv=kfolds.split(train_X, train_Y),
                   n_iter = 5,
                   scoring='f1_weighted',
                   verbose=1)

clf_cv.fit(train_X, train_Y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.3min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000026956882BA0>,
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026956884DD8>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026956884B70>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026956884358>, '... [1337], 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026956884898>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True

In [314]:
clf_cv.best_score_

0.98503322209890365

In [315]:
best_parameters, score, _ = max(clf_cv.grid_scores_, key=lambda x: x[1])

print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

Raw AUC score: 0.985033222099
colsample_bytree: 0.95428828503905105
learning_rate: 0.032570711135805813
max_depth: 4
min_child_weight: 3
n_estimators: 136
seed: 1337
subsample: 0.95867123158197731




In [316]:
#Apply the model to the validation set
_predictions = clf_cv.predict(val_X)
_probas = clf_cv.predict_proba(val_X)

target_map = {u'0': 0, u'1': 1, u'2': 2, u'3': 3, u'4': 4, u'5': 5, u'6': 6, u'7': 7, u'8': 8, u'9': 9, u'10': 10, u'11': 11, u'12': 12, u'13': 13, u'14': 14, u'15': 15, u'16': 16, u'17': 17, u'18': 18, u'19': 19, u'20': 20}

predictions = pd.Series(data=_predictions, index=val_X.index, name='predicted_value')
cols = [
    u'probability_of_%s' % label
    for (_, label) in sorted([(int(label_id), label) for (label, label_id) in target_map.items()])
]
probabilities = pd.DataFrame(data=_probas, index=val_X.index, columns=cols)

# Build scored dataset
results_val = val_X.join(predictions, how='left')
results_val = results_val.join(probabilities, how='left')
results_val = results_val.join(val_Y, how='left')

In [317]:
results_val.head(2)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,...,0.001487,0.0009,0.001098,0.00113,0.000907,0.001689,0.000907,0.000909,0.000906,3
1387,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.235702,0.0,0.0,...,0.001236,0.001018,0.001015,0.001038,0.001026,0.003096,0.001068,0.972685,0.001588,19


In [318]:
print(clf_cv.score(val_X, val_Y))
print(recall_score(val_Y, clf_cv.predict(val_X), average='weighted'))
print(precision_score(val_Y, clf_cv.predict(val_X), average='weighted'))

0.992889581741
0.992874109264
0.993247035473


In [62]:
#Rows and columns
results_val.shape

(842, 123)

In [None]:
%matplotlib inline

precision, recall, _ = precision_recall_curve(results_test[['Label']], results_test[['probability_of_Y']])

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(
          average_precision))
plt.show();

In [29]:
#Saving the best model
joblib.dump(clf, './model/ja_jp_v1.pkl')

['./model/ja_jp_v1.pkl']