-----------

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import sklearn

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import pickle
import joblib

import sys
import xgboost as xgb

In [2]:
def load_train():
    train = pd.read_csv('train.csv')
    labels = train.type.values
    lbl_enc = preprocessing.LabelEncoder()
    labels = lbl_enc.fit_transform(labels)
    train = train.drop('id', axis=1)
    train = train.drop('fiberID', axis=1)
    train = train.drop('type', axis=1)
    return train.values, labels.astype('int32')

In [3]:
def load_test():
    test = pd.read_csv('test.csv')
    test = test.drop('id', axis=1)
    test = test.drop('fiberID', axis=1)
    return test.values

In [4]:
def score(params):
    print("Training with params : ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

In [5]:
def optimize(trials):
    space = {
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.choice('max_depth', np.arange(3, 5, dtype=int)),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'num_class' : 19,
             'eval_metric': 'mlogloss',
             'objective': 'multi:softprob',
             'verbosity' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print(best)

In [6]:
def write_submission(preds, output):
    sample = pd.read_csv('sample_submission.csv')
    train = pd.read_csv('train.csv')
    labels = train.type.values
    labels = np.unique(labels)
    preds = pd.DataFrame(
        preds, index=sample.id.values, columns=labels)
    preds = preds[list(sample.columns[1:])]
    preds.to_csv(output, index_label='id')

In [182]:
X, y = load_train()

In [183]:
print("Splitting data into train and valid ...\n\n")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

Splitting data into train and valid ...




-------

In [184]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [185]:
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.fit_transform(X_test) 
pca = PCA(0.8)
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=0.8, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [186]:
pca_features = pca.transform(X_train)
pca_df = pd.DataFrame(data = pca_features, columns=["pca1", "pca2", "pca3", "pca4", "pca5", "pca6"])
X_train = pd.DataFrame(X_train)
X_train = pd.concat([X_train, pca_df], axis=1)

In [187]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,pca1,pca2,pca3,pca4,pca5,pca6
0,0.002588,0.006242,0.005500,0.006841,0.008338,0.002877,0.001180,0.004697,0.008155,0.005878,...,0.007987,0.006106,0.008560,0.011013,-0.029353,-0.005042,-0.000628,-0.001090,-0.000414,-0.000079
1,0.002728,0.009597,0.005636,0.005216,0.004657,0.003000,0.001551,0.001261,0.003066,-0.000174,...,0.006911,0.000093,0.000301,-0.000189,-0.015283,-0.004348,0.001640,0.000292,-0.006906,0.002217
2,0.002699,0.020642,0.029383,0.030829,0.028230,0.003083,0.003920,0.036825,0.030399,0.024335,...,0.025056,0.028007,0.031486,0.032167,-0.109840,-0.006332,-0.001155,0.005974,0.004185,-0.003611
3,0.002838,0.027081,0.027645,0.011578,0.003690,0.003121,0.005660,0.036125,0.012728,0.002555,...,0.033313,0.025011,0.012983,0.006894,-0.077178,-0.005925,0.002343,0.011704,-0.025431,0.001155
4,0.002617,0.003094,-0.003423,-0.001897,-0.003113,0.002890,0.000702,-0.006261,-0.002656,-0.004875,...,-0.001582,-0.009036,-0.008606,-0.010007,0.018983,-0.004104,0.005742,0.001067,-0.003685,0.003748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159987,0.002551,0.004081,0.003624,0.004893,0.004736,0.002462,-0.000190,-0.006120,-0.001673,-0.002697,...,-0.006439,-0.008963,-0.007613,-0.007555,0.017032,-0.004791,0.008234,-0.003932,0.007142,0.001750
159988,0.002586,0.006241,0.007811,0.010532,0.013100,0.002699,0.001200,0.008124,0.012177,0.011166,...,0.008080,0.008258,0.011779,0.015104,-0.039558,-0.005232,-0.000855,-0.000330,0.003721,-0.001144
159989,0.002818,0.023875,0.025129,0.011982,0.005798,0.003472,0.005701,0.025830,0.013857,0.004008,...,0.027876,0.021939,0.013062,0.008275,-0.076384,-0.004491,0.000364,0.008958,-0.025146,0.001604
159990,0.002828,0.026849,0.028505,0.019638,0.014473,0.003566,0.004931,0.032713,0.021769,0.013923,...,0.029706,0.024800,0.020226,0.017621,-0.094170,-0.004119,0.000574,0.005049,-0.012567,-0.001332


In [188]:
pca_features = pca.transform(X_test)
pca_df = pd.DataFrame(data = pca_features, columns=["pca1", "pca2", "pca3", "pca4", "pca5", "pca6"])
X_test = pd.DataFrame(X_test)
X_test = pd.concat([X_test, pca_df], axis=1)

-----

In [189]:
trials = Trials()

In [None]:
optimize(trials)

---------

In [92]:
xg_clf = xgb.XGBClassifier(eval_metric = 'mlogloss', objective = 'multi:softprob',colsample_bytree = 0.7, learning_rate = 0.05,
                max_depth = 6, alpha = 10, n_estimators = 200, sub_sample=0.7)

In [93]:
xg_clf.fit(X_train,y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, eval_metric='mlogloss',
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [94]:
preds = xg_clf.predict_proba(X_test)

In [96]:
log_loss(y_test, preds)

1.096285024003643

----------

In [97]:
sub = load_test()

In [98]:
preds_sub = xg_clf.predict_proba(sub)

In [144]:
write_submission(preds_sub, "result.csv")

-----

In [50]:
joblib.dump(xg_clf, 'xgb.pkl') 

['xgb.pkl']

In [51]:
xg_clf = joblib.load('xgb.pkl') 

-----

In [17]:
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')
param_test1 = {'max_depth':range(3,10,2),'min_child_weight':range(1,6,2)}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'multi:softprob', scale_pos_weight=1, seed=27, eval_metric = 'mlogloss'), 
                        param_grid = param_test1, scoring=scorer,iid=False, cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

  'precision', 'predicted', average, warn_for)


AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'

In [23]:
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 9, 'min_child_weight': 3}, 0.8735475931433669)

({'max_depth': 9, 'min_child_weight': 3}, 0.8735475931433669)

In [8]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
                                                  min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'multi:softprob', scale_pos_weight=1,seed=27, eval_metric = 'mlogloss'), 
                        param_grid = param_test3, scoring=scorer, iid=False, cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.best_params_, gsearch3.best_score_

  'precision', 'predicted', average, warn_for)


({'gamma': 0.1}, 0.8736117856698282)

({'gamma': 0.1}, 0.8736117856698282)

In [8]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
                                                  min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'multi:softprob', scale_pos_weight=1,seed=27, eval_metric = 'mlogloss'), 
                        param_grid = param_test4, scoring=scorer, iid=False, cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

  'precision', 'predicted', average, warn_for)


NameError: name 'gsearch3' is not defined

In [9]:
gsearch4.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.8, 'subsample': 0.7}, 0.8736593775727819)

In [10]:
param_test5 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
                                                  min_child_weight=3, gamma=0.1, subsample=0.7, colsample_bytree=0.8,
                                                  objective= 'multi:softprob', scale_pos_weight=1,seed=27, eval_metric = 'mlogloss'), 
                        param_grid = param_test5, scoring=scorer, iid=False, cv=5, verbose = 1)
gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 253.6min finished


({'reg_alpha': 1e-05}, 0.8735360324514202)

In [190]:
xg_clf = xgb.XGBClassifier( learning_rate =0.01, n_estimators=1000, max_depth=9, reg_alpha = 1e-5, 
                                                  min_child_weight=3, gamma=0.1, subsample=0.7, colsample_bytree=0.8,
                                                  objective= 'multi:softprob', scale_pos_weight=1,seed=27, eval_metric = 'mlogloss')

In [191]:
xg_clf.fit(X_train,y_train)
preds = xg_clf.predict_proba(X_test)
log_loss(y_test, preds)

1.9634585693419817

In [28]:
sub = load_test()
preds_sub = xg_clf.predict_proba(sub)
write_submission(preds_sub, "result.csv")

In [29]:
joblib.dump(xg_clf, 'xgb_tuned_2.pkl') 
xg_clf = joblib.load('xgb_tuned_2.pkl') 