In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score, classification_report , roc_auc_score , precision_recall_curve , average_precision_score, auc , roc_curve
#from imblearn.over_sampling import SMOTE
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import scipy.stats as st
from janome.tokenizer import Tokenizer
import re



In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

In [4]:
data.shape

(4207, 2)

In [5]:
data.head(3)

Unnamed: 0,text,intent
0,ラジオ日本聞きたい,JORF
1,ラジオ日本を聞かせて,JORF
2,ラジオ日本を再生,JORF


In [6]:
le = preprocessing.LabelEncoder()

data['label'] = le.fit_transform(data['intent'])

In [7]:
data.shape

(4207, 3)

In [46]:
data

Unnamed: 0,label,feature
0,5,ラジオ 日本 聞き ラジオ ニッポン キキ
1,5,ラジオ 日本 聞か せ ラジオ ニッポン キカ セ
2,5,ラジオ 日本 再生 ラジオ ニッポン サイセイ
3,5,ラジオ 日本 再生 ラジオ ニッポン サイセイ
4,5,ラジオ 日本 再生 し ラジオ ニッポン サイセイ シ
5,5,ラジオ 日本 再生 ラジオ ニッポン サイセイ
6,5,ラジオ 日本 かけ ラジオ ニッポン カケ
7,5,ラジオ 日本 し ラジオ ニッポン シ
8,5,ラジオ 日本 まわし ラジオ ニッポン マワシ
9,5,ラジオ 日本 変更 し ラジオ ニッポン ヘンコウ シ


In [45]:
data.label.unique()

array([ 5,  0,  7, 16,  3, 18, 19, 20, 13,  9,  6, 10, 11,  4,  2,  1, 12,
       15, 14,  8, 17], dtype=int64)

In [9]:
data = data.drop(['intent'], axis=1)

In [10]:
data.head(3)

Unnamed: 0,text,label
0,ラジオ日本聞きたい,5
1,ラジオ日本を聞かせて,5
2,ラジオ日本を再生,5


In [11]:
j_tokenizer = Tokenizer()

def wakati_reading(text):
    tokens = j_tokenizer.tokenize(text.replace("'", "").lower())
    
    exclude_pos = [u'助詞',u'助動詞']
    
    #分かち書き
    tokens_w_space = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
        
        if partOfSpeech not in exclude_pos:
            tokens_w_space = tokens_w_space + " " + token.surface

    tokens_w_space = tokens_w_space.strip()
    
    #読み方
    tokens_reading = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
 
        if partOfSpeech not in exclude_pos:
            if token.reading != "*":
                tokens_reading = tokens_reading + " " + token.reading
            elif re.match('^[a-z]+$', token.base_form):
                alpha_reading = ""
                alpha_reading = token.base_form.replace("a", "エー ")
                alpha_reading = alpha_reading.replace("b", "ビー ")
                alpha_reading = alpha_reading.replace("c", "シー ")
                alpha_reading = alpha_reading.replace("d", "ディー ")
                alpha_reading = alpha_reading.replace("e", "イー ")
                alpha_reading = alpha_reading.replace("f", "エフ ")
                alpha_reading = alpha_reading.replace("g", "ジー ")
                alpha_reading = alpha_reading.replace("h", "エイチ ")
                alpha_reading = alpha_reading.replace("i", "アイ ")
                alpha_reading = alpha_reading.replace("j", "ジェー ")
                alpha_reading = alpha_reading.replace("k", "ケー ")
                alpha_reading = alpha_reading.replace("l", "エル ")
                alpha_reading = alpha_reading.replace("m", "エム ")
                alpha_reading = alpha_reading.replace("n", "エヌ ")
                alpha_reading = alpha_reading.replace("o", "オー ")
                alpha_reading = alpha_reading.replace("p", "ピー ")
                alpha_reading = alpha_reading.replace("q", "キュー ")
                alpha_reading = alpha_reading.replace("r", "アール ")
                alpha_reading = alpha_reading.replace("s", "エス ")
                alpha_reading = alpha_reading.replace("t", "ティー ")
                alpha_reading = alpha_reading.replace("u", "ユー ")
                alpha_reading = alpha_reading.replace("v", "ブイ ")
                alpha_reading = alpha_reading.replace("w", "ダブリュー ")
                alpha_reading = alpha_reading.replace("x", "エックス ")
                alpha_reading = alpha_reading.replace("y", "ワイ ")
                alpha_reading = alpha_reading.replace("z", "ゼット ")

                tokens_reading = tokens_reading + " " + alpha_reading
            elif re.match('^[0-9]+$', token.base_form):
                numeric_reading = ""
                numeric_reading = token.base_form.replace("0", "ゼロ ")
                numeric_reading = numeric_reading.replace("1", "イチ ")
                numeric_reading = numeric_reading.replace("2", "ニ ")
                numeric_reading = numeric_reading.replace("3", "サン ")
                numeric_reading = numeric_reading.replace("4", "ヨン ")
                numeric_reading = numeric_reading.replace("5", "ゴ ")
                numeric_reading = numeric_reading.replace("6", "ロク ")
                numeric_reading = numeric_reading.replace("7", "ナナ ")
                numeric_reading = numeric_reading.replace("8", "ハチ ")
                numeric_reading = numeric_reading.replace("9", "キュー ")

                tokens_reading = tokens_reading + " " + numeric_reading.strip()

    tokens_reading = tokens_reading.strip()
    
    feature = tokens_w_space + " " + tokens_reading
    
    return feature

In [12]:
data['feature'] = data['text'].apply(lambda x: wakati_reading(x))
data = data.drop(['text'], axis=1)

data.head(3)

Unnamed: 0,label,feature
0,5,ラジオ 日本 聞き ラジオ ニッポン キキ
1,5,ラジオ 日本 聞か せ ラジオ ニッポン キカ セ
2,5,ラジオ 日本 再生 ラジオ ニッポン サイセイ


In [13]:
#train, test, train_label, test_label, = train_test_split(data, 
#                                                   data['label'], 
#                                                   test_size = .2, 
#                                                   random_state=12)

In [23]:
train = data.drop(['label'], axis=1)
train_label = data['label']

In [24]:
#Modeling
train_X, val_X, train_Y, val_Y = train_test_split(train, train_label,
                                                  test_size = .2,
                                                  random_state=12)

In [25]:
train_X.head(3)

Unnamed: 0,feature
1053,ほう そう だい がく 聞き ホウ ソウ ダイ ガク キキ
176,ラジオ ほん 再生 し 欲しい の ラジオ ホン サイセイ シ ホシイ ノ
4111,松任谷 由実 動画 youtube 再生 し マツトウヤ ユミ ドウガ ワイ オー ...


In [26]:
train_Y.head(3)

1053     3
176      5
4111    17
Name: label, dtype: int64

In [27]:
val_X.head(3)

Unnamed: 0,feature
913,放送大学 再生 し ください ホウソウダイガク サイセイ シ クダサイ
1387,nhk r 2 聞き 思い エヌ エイチ ケー アール ニ キキ オモイ
667,ナック ファイブ ご 聞き 思い ナック ファイブ ゴ キキ オモイ


In [28]:
val_Y.head(3)

913      3
1387    19
667      7
Name: label, dtype: int64

In [29]:
# Text Features
from sklearn.feature_extraction.text import HashingVectorizer

text_features = {u'feature': 100}

for (feature_name, num_tokens) in text_features.items():
    n_components = num_tokens
    hashv = HashingVectorizer(n_features=num_tokens, token_pattern=u'[A-Za-z0-9\-ぁ-ヶ亜-黑ー]{1,}')
    train_transformed = hashv.fit_transform(train_X[feature_name])
    test_transformed = hashv.transform(val_X[feature_name])

   
    for i in range(0, n_components):
        train_X[feature_name + ":text:" + str(i)] = train_transformed[:,i].todense()
        val_X[feature_name + ":text:" + str(i)] = test_transformed[:,i].todense()
        
    train_X.drop(feature_name, axis=1, inplace=True)
    val_X.drop(feature_name, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [31]:
train_X.head(3)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,feature:text:90,feature:text:91,feature:text:92,feature:text:93,feature:text:94,feature:text:95,feature:text:96,feature:text:97,feature:text:98,feature:text:99
1053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.288675,0.0,0.0
176,0.0,0.288675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.288675,0.0,0.0,0.0,0.0,0.0
4111,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,-0.213201,-0.426401,...,-0.426401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
from sklearn.ensemble import GradientBoostingClassifier

clf = XGBClassifier(
                    seed = 1337,
                    n_estimators = 200,
                    learning_rate = 0.1,
                    max_depth = 3
                   )

In [33]:
#Training
clf.fit(train_X, train_Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1337, silent=True, subsample=1)

In [48]:
#Apply the model to the validation set
_predictions = clf.predict(val_X)
_probas = clf.predict_proba(val_X)

target_map = {u'0': 0, u'1': 1, u'2': 2, u'3': 3, u'4': 4, u'5': 5, u'6': 6, u'7': 7, u'8': 8, u'9': 9, u'10': 10, u'11': 11, u'12': 12, u'13': 13, u'14': 14, u'15': 15, u'16': 16, u'17': 17, u'18': 18, u'19': 19, u'20': 20}

predictions = pd.Series(data=_predictions, index=val_X.index, name='predicted_value')
cols = [
    u'probability_of_%s' % label
    for (_, label) in sorted([(int(label_id), label) for (label, label_id) in target_map.items()])
]
probabilities = pd.DataFrame(data=_probas, index=val_X.index, columns=cols)

# Build scored dataset
results_val = val_X.join(predictions, how='left')
results_val = results_val.join(probabilities, how='left')
results_val = results_val.join(val_Y, how='left')

In [49]:
results_val.head(2)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000474,5.8e-05,7.3e-05,6.9e-05,4.8e-05,0.00212,4.1e-05,3.2e-05,3.1e-05,3
1387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,...,0.00014,2.6e-05,2.3e-05,3.4e-05,3e-05,0.00851,0.00018,0.989856,0.000358,19


In [50]:
print(clf.score(val_X, val_Y))
print(recall_score(val_Y, clf.predict(val_X), average='weighted'))
print(precision_score(val_Y, clf.predict(val_X), average='weighted'))

0.992874109264
0.992874109264
0.993631547732


In [51]:
results_val.head(200)

Unnamed: 0,feature:text:0,feature:text:1,feature:text:2,feature:text:3,feature:text:4,feature:text:5,feature:text:6,feature:text:7,feature:text:8,feature:text:9,...,probability_of_12,probability_of_13,probability_of_14,probability_of_15,probability_of_16,probability_of_17,probability_of_18,probability_of_19,probability_of_20,label
913,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000474,0.000058,0.000073,0.000069,0.000048,0.002120,0.000041,0.000032,0.000031,3
1387,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.267261,0.000000,0.000000,...,0.000140,0.000026,0.000023,0.000034,0.000030,0.008510,0.000180,0.989856,0.000358,19
667,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-0.534522,0.000000,0.000000,0.000000,...,0.000268,0.000043,0.000167,0.000056,0.000050,0.000468,0.000042,0.000035,0.000033,7
2266,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.004600,0.000103,0.000094,0.000160,0.000118,0.000321,0.000101,0.000080,0.000077,6
2206,0.000000,0.000000,0.000000,0.000000,0.0,-0.353553,0.000000,0.000000,0.000000,0.000000,...,0.001197,0.000031,0.000027,0.000048,0.000035,0.000712,0.000030,0.000024,0.000023,9
188,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.058467,0.000737,0.000657,0.001143,0.000842,0.003098,0.000722,0.000572,0.000553,5
3798,0.353553,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.001490,0.000009,0.992976,0.005066,0.000010,0.000064,0.000008,0.000007,0.000006,14
3406,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,-0.301511,...,0.000073,0.000022,0.000020,0.000029,0.000026,0.000673,0.000022,0.000019,0.000017,1
1746,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000015,0.000009,0.000009,0.000012,0.000011,0.000015,0.000643,0.000195,0.997486,20
869,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000070,0.000010,0.000016,0.000012,0.998865,0.000763,0.000007,0.000006,0.000005,16


In [21]:
kfolds = StratifiedKFold(3)

xgb_model = XGBClassifier(objective = 'multi:softprob', nthread=-1, reg_alpha=0, reg_lambda=1)

    
parameters = {'learning_rate': st.uniform(0.01, 0.1), #so called `eta` value
              'max_depth': st.randint(3, 7),
              'n_estimators': st.randint(100, 350),
              'min_child_weight': [1, 2, 3],
              'subsample': st.beta(10, 1),
              'colsample_bytree': st.beta(10, 1),
              'seed': [1337]}

y = label_binarize(train_label, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

clf = RandomizedSearchCV(xgb_model, param_distributions=parameters, n_jobs=-1, 
                   cv=kfolds.split(train, train_label),
                   n_iter = 5,
                   scoring='roc_auc',
                   verbose=1)

clf.fit(train, train_label)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...oto\\lib\\site-packages\\ipykernel\\__main__.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\runpy.py in _run_code(code=<code object <module> at 0x000001FDFAF84ED0, fil...lib\site-packages\ipykernel\__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\yfujimoto\AppData\Local\Continuum\Anaco...ges\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\yfujimoto\AppData\Local\Continuum\Anaco...yfujimoto\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...oto\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\y...to\\lib\\site-packages\\ipykernel\\kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...oto\\lib\\site-packages\\ipykernel\\__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x000001FDFAF84ED0, fil...lib\site-packages\ipykernel\__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\yfujimoto\AppData\Local\Continuum\Anaco...ges\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\yfujimoto\AppData\Local\Continuum\Anaco...yfujimoto\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...oto\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\y...to\\lib\\site-packages\\ipykernel\\kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\__main__.py in <module>()
      1 if __name__ == '__main__':
      2     from ipykernel import kernelapp as app
----> 3     app.launch_new_instance()

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\zmq\eventloop\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\tornado\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 12, 14, 18, 6, 40, 349178, tzinfo=datetime.timezone.utc), 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'session': 'DBF5EE8FBEF44A0D9B8719C8F7A39DFF', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'DBF5EE8FBEF44A0D9B8719C8F7A39DFF']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 12, 14, 18, 6, 40, 349178, tzinfo=datetime.timezone.utc), 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'session': 'DBF5EE8FBEF44A0D9B8719C8F7A39DFF', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'DBF5EE8FBEF44A0D9B8719C8F7A39DFF'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 12, 14, 18, 6, 40, 349178, tzinfo=datetime.timezone.utc), 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'session': 'DBF5EE8FBEF44A0D9B8719C8F7A39DFF', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '03CEF4A38B9C46AEA209609426D18A14', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)',), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)',)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='kfolds = StratifiedKFold(3)\n\nxgb_model = XGBClas...          verbose=1)\n\nclf.fit(train, train_label)', store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'all'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-21-84202bea9749>', interactivity='all', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])
   2830                 code = compiler(mod, cell_name, "single")
-> 2831                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x000001FD8BF0BC90, file "<ipython-input-21-84202bea9749>", line 22>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2832                     return True
   2833 
   2834             # Flush softspace
   2835             if softspace(sys.stdout, 0):

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x000001FD8BF0BC90, file "<ipython-input-21-84202bea9749>", line 22>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x000001FD8BF0BC90, file "<ipython-input-21-84202bea9749>", line 22>
        self.user_global_ns = {'Counter': <class 'collections.Counter'>, 'HashingVectorizer': <class 'sklearn.feature_extraction.text.HashingVectorizer'>, 'In': ['', 'import numpy as np\nimport pandas as pd\nimport sk...\nfrom janome.tokenizer import Tokenizer\nimport re', 'from IPython.core.interactiveshell import Intera...l\nInteractiveShell.ast_node_interactivity = "all"', "data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])", 'data.shape', 'data.head(3)', "le = preprocessing.LabelEncoder()\n\ndata['label'] = le.fit_transform(data['intent'])", 'data.shape', 'data.head(3)', "data = data.drop(['intent'], axis=1)", 'data.head(3)', 'j_tokenizer = Tokenizer()\n\ndef wakati_reading(te...ce + " " + tokens_reading\n    \n    return feature', "data['feature'] = data['text'].apply(lambda x: w...\ndata = data.drop(['text'], axis=1)\n\ndata.head(3)", 'train, test, train_label, test_label, = train_te...                                 random_state=12)', "train = train.drop(['label'], axis=1)\ntrain.head(3)", 'train_label.head(3)', "test = test.drop(['label'], axis=1)\ntest.head(3)", 'test_label.head(3)', '# Text Features\nfrom sklearn.feature_extraction....    test.drop(feature_name, axis=1, inplace=True)', 'train.head(3)', ...], 'InteractiveShell': <class 'IPython.core.interactiveshell.InteractiveShell'>, 'Out': {4: (4207, 2), 5:          text intent
0   ラジオ日本聞きたい   JORF
1  ラジオ日本を聞かせて   JORF
2    ラジオ日本を再生   JORF, 7: (4207, 3), 8:          text intent  label
0   ラジオ日本聞きたい   JORF...日本を聞かせて   JORF      5
2    ラジオ日本を再生   JORF      5, 10:          text  label
0   ラジオ日本聞きたい      5
1  ラジオ日本を聞かせて      5
2    ラジオ日本を再生      5, 12:    label                    feature
0      5    ...ラジオ ニッポン キカ セ
2      5    ラジオ 日本 再生 ラジオ ニッポン サイセイ, 14:                                                 ...松任谷 由実 動画   youtube   再生 し マツトウヤ ユミ ドウガ ワイ オー ..., 15: 1053     3
176      5
4111    17
Name: label, dtype: int64, 16:                                          feature...
667           ナック ファイブ ご 聞き 思い ナック ファイブ ゴ キキ オモイ, 17: 913      3
1387    19
667      7
Name: label, dtype: int64, ...}, 'PCA': <class 'sklearn.decomposition.pca.PCA'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, 'Tokenizer': <class 'janome.tokenizer.Tokenizer'>, 'XGBClassifier': <class 'xgboost.sklearn.XGBClassifier'>, ...}
        self.user_ns = {'Counter': <class 'collections.Counter'>, 'HashingVectorizer': <class 'sklearn.feature_extraction.text.HashingVectorizer'>, 'In': ['', 'import numpy as np\nimport pandas as pd\nimport sk...\nfrom janome.tokenizer import Tokenizer\nimport re', 'from IPython.core.interactiveshell import Intera...l\nInteractiveShell.ast_node_interactivity = "all"', "data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])", 'data.shape', 'data.head(3)', "le = preprocessing.LabelEncoder()\n\ndata['label'] = le.fit_transform(data['intent'])", 'data.shape', 'data.head(3)', "data = data.drop(['intent'], axis=1)", 'data.head(3)', 'j_tokenizer = Tokenizer()\n\ndef wakati_reading(te...ce + " " + tokens_reading\n    \n    return feature', "data['feature'] = data['text'].apply(lambda x: w...\ndata = data.drop(['text'], axis=1)\n\ndata.head(3)", 'train, test, train_label, test_label, = train_te...                                 random_state=12)', "train = train.drop(['label'], axis=1)\ntrain.head(3)", 'train_label.head(3)', "test = test.drop(['label'], axis=1)\ntest.head(3)", 'test_label.head(3)', '# Text Features\nfrom sklearn.feature_extraction....    test.drop(feature_name, axis=1, inplace=True)', 'train.head(3)', ...], 'InteractiveShell': <class 'IPython.core.interactiveshell.InteractiveShell'>, 'Out': {4: (4207, 2), 5:          text intent
0   ラジオ日本聞きたい   JORF
1  ラジオ日本を聞かせて   JORF
2    ラジオ日本を再生   JORF, 7: (4207, 3), 8:          text intent  label
0   ラジオ日本聞きたい   JORF...日本を聞かせて   JORF      5
2    ラジオ日本を再生   JORF      5, 10:          text  label
0   ラジオ日本聞きたい      5
1  ラジオ日本を聞かせて      5
2    ラジオ日本を再生      5, 12:    label                    feature
0      5    ...ラジオ ニッポン キカ セ
2      5    ラジオ 日本 再生 ラジオ ニッポン サイセイ, 14:                                                 ...松任谷 由実 動画   youtube   再生 し マツトウヤ ユミ ドウガ ワイ オー ..., 15: 1053     3
176      5
4111    17
Name: label, dtype: int64, 16:                                          feature...
667           ナック ファイブ ご 聞き 思い ナック ファイブ ゴ キキ オモイ, 17: 913      3
1387    19
667      7
Name: label, dtype: int64, ...}, 'PCA': <class 'sklearn.decomposition.pca.PCA'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, 'Tokenizer': <class 'janome.tokenizer.Tokenizer'>, 'XGBClassifier': <class 'xgboost.sklearn.XGBClassifier'>, ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
C:\Users\yfujimoto\Documents\Python Scripts\Japanese_Radio\<ipython-input-21-84202bea9749> in <module>()
     17                    cv=kfolds.split(train, train_label),
     18                    n_iter = 5,
     19                    scoring='roc_auc',
     20                    verbose=1)
     21 
---> 22 clf.fit(train, train_label)

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\model_selection\_search.py in fit(self=RandomizedSearchCV(cv=<generator object _BaseKFo...n_train_score=True, scoring='roc_auc', verbose=1), X=      feature:text:0  feature:text:1  feature:te...4               0.0  

[3365 rows x 1000 columns], y=1053     3
176      5
4111    17
3627    12
1662...414    19
Name: label, Length: 3365, dtype: int64, groups=None, **fit_params={})
    633                                   return_train_score=self.return_train_score,
    634                                   return_n_test_samples=True,
    635                                   return_times=True, return_parameters=False,
    636                                   error_score=self.error_score)
    637           for parameters, (train, test) in product(candidate_params,
--> 638                                                    cv.split(X, y, groups)))
        cv.split = <bound method _CVIterableWrapper.split of _CVIte... 2567]), array([1910, 2028, ..., 3363, 3364]))])>
        X =       feature:text:0  feature:text:1  feature:te...4               0.0  

[3365 rows x 1000 columns]
        y = 1053     3
176      5
4111    17
3627    12
1662...414    19
Name: label, Length: 3365, dtype: int64
        groups = None
    639 
    640         # if one choose to see train score, "out" will contain train score info
    641         if self.return_train_score:
    642             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Thu Dec 14 12:09:26 2017
PID: 4712Python 3.5.3: C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\python.exe
...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958),       feature:text:0  feature:text:1  feature:te...4               0.0  

[3365 rows x 1000 columns], 1053     3
176      5
4111    17
3627    12
1662...414    19
Name: label, Length: 3365, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([ 999, 1012, 1024, ..., 3362, 3363, 3364]), array([   0,    1,    2, ..., 1317, 1334, 1345]), 1, {'colsample_bytree': 0.99797383138681828, 'learning_rate': 0.056420607614068294, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 116, 'seed': 1337, 'subsample': 0.98474380408059958}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958),       feature:text:0  feature:text:1  feature:te...4               0.0  

[3365 rows x 1000 columns], 1053     3
176      5
4111    17
3627    12
1662...414    19
Name: label, Length: 3365, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([ 999, 1012, 1024, ..., 3362, 3363, 3364]), array([   0,    1,    2, ..., 1317, 1334, 1345]), 1, {'colsample_bytree': 0.99797383138681828, 'learning_rate': 0.056420607614068294, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 116, 'seed': 1337, 'subsample': 0.98474380408059958})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958), X=      feature:text:0  feature:text:1  feature:te...4               0.0  

[3365 rows x 1000 columns], y=1053     3
176      5
4111    17
3627    12
1662...414    19
Name: label, Length: 3365, dtype: int64, scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, train=array([ 999, 1012, 1024, ..., 3362, 3363, 3364]), test=array([   0,    1,    2, ..., 1317, 1334, 1345]), verbose=1, parameters={'colsample_bytree': 0.99797383138681828, 'learning_rate': 0.056420607614068294, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 116, 'seed': 1337, 'subsample': 0.98474380408059958}, fit_params={}, return_train_score=True, return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    462                              " make sure that it has been spelled correctly.)")
    463 
    464     else:
    465         fit_time = time.time() - start_time
    466         # _score will return dict if is_multimetric is True
--> 467         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        test_scores = {}
        estimator = XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958)
        X_test =       feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns]
        y_test = 1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64
        scorer = {'score': make_scorer(roc_auc_score, needs_threshold=True)}
        is_multimetric = True
    468         score_time = time.time() - start_time - fit_time
    469         if return_train_score:
    470             train_scores = _score(estimator, X_train, y_train, scorer,
    471                                   is_multimetric)

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958), X_test=      feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns], y_test=1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64, scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, is_multimetric=True)
    497 
    498     Will return a single float if is_multimetric is False and a dict of floats,
    499     if is_multimetric is True
    500     """
    501     if is_multimetric:
--> 502         return _multimetric_score(estimator, X_test, y_test, scorer)
        estimator = XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958)
        X_test =       feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns]
        y_test = 1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64
        scorer = {'score': make_scorer(roc_auc_score, needs_threshold=True)}
    503     else:
    504         if y_test is None:
    505             score = scorer(estimator, X_test)
    506         else:

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958), X_test=      feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns], y_test=1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64, scorers={'score': make_scorer(roc_auc_score, needs_threshold=True)})
    527 
    528     for name, scorer in scorers.items():
    529         if y_test is None:
    530             score = scorer(estimator, X_test)
    531         else:
--> 532             score = scorer(estimator, X_test, y_test)
        score = undefined
        scorer = make_scorer(roc_auc_score, needs_threshold=True)
        estimator = XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958)
        X_test =       feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns]
        y_test = 1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64
    533 
    534         if hasattr(score, 'item'):
    535             try:
    536                 # e.g. unwrap memmapped scalars

...........................................................................
C:\Users\yfujimoto\AppData\Local\Continuum\Anaconda3\envs\yfujimoto\lib\site-packages\sklearn\metrics\scorer.py in __call__(self=make_scorer(roc_auc_score, needs_threshold=True), clf=XGBClassifier(base_score=0.5, colsample_bylevel=...ilent=True,
       subsample=0.98474380408059958), X=      feature:text:0  feature:text:1  feature:te...                0.0  

[1128 rows x 1000 columns], y=1053     3
176      5
4111    17
3627    12
1662...64      5
Name: label, Length: 1128, dtype: int64, sample_weight=None)
    176         """
    177         super(_ThresholdScorer, self).__call__(clf, X, y,
    178                                                sample_weight=sample_weight)
    179         y_type = type_of_target(y)
    180         if y_type not in ("binary", "multilabel-indicator"):
--> 181             raise ValueError("{0} format is not supported".format(y_type))
        y_type = 'multiclass'
    182 
    183         if is_regressor(clf):
    184             y_pred = clf.predict(X)
    185         else:

ValueError: multiclass format is not supported
___________________________________________________________________________

In [None]:
clf.best_score_

In [None]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])

print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
#Apply the model to the test set
_predictions = clf.predict(test)
_probas = clf.predict_proba(test)

target_map = {u'N': 0, u'Y': 1}

predictions = pd.Series(data=_predictions, index=test.index, name='predicted_value')
cols = [
    u'probability_of_%s' % label
    for (_, label) in sorted([(int(label_id), label) for (label, label_id) in target_map.iteritems()])
]
probabilities = pd.DataFrame(data=_probas, index=test.index, columns=cols)

# Build scored dataset
results_test = test.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test_label, how='left')

In [None]:
results_test.head(2)

In [None]:
print(clf.score(test, test_label))
print(recall_score(test_label, clf.predict(test)))
print(precision_score(test_label, clf.predict(test)))

In [None]:
#Rows and columns
results_test.shape

In [None]:
%matplotlib inline

prob = clf.predict_proba(test)[:, 1]
fp, tp, thresholds = roc_curve(test_label , prob)

plt.figure(figsize = (8, 6))
plt.plot(fp, tp)
plt.title("ROC curve")
plt.xlabel( "False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show();

In [None]:
average_precision = average_precision_score(results_test[['Label']], results_test[['probability_of_Y']])

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

In [None]:
%matplotlib inline

precision, recall, _ = precision_recall_curve(results_test[['Label']], results_test[['probability_of_Y']])

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(
          average_precision))
plt.show();

In [None]:
#Saving the best model
joblib.dump(clf.best_estimator_, './Model/'+snapshot+'_XGBoost_300K_NoSMOTE_Over300KBILimit_EstimateParts_ImageScore_TextNERTop500_Binary_NoSub_GridCV_15Iter_v3.pkl')