In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss

In [3]:
# Load the pre-computed pickles from ../input/ in a fixed order:
# training features, test features, test listing ids, training labels.
# NOTE(review): unpickling executes arbitrary code -- only load files you trust.
train_X, test_X, test_listing, train_y = (
    pd.read_pickle('../input/' + pkl_name)
    for pkl_name in (
        'train_X_2017-02-16-20-51.pkl',
        'test_X_2017-02-16-20-51.pkl',
        'listing_id.pkl',
        'y_2017-02-16-20-51.pkl',
    )
)

# Sanity check: both feature matrices should have the same number of columns.
print(train_X.shape, test_X.shape)

((49352, 229), (74659, 229))


In [4]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, handed to ``xgb.DMatrix`` as-is.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is used as a validation
        set: train/test mlogloss is logged every round and training stops
        once the test metric has not improved for 20 rounds.
    feature_names : optional
        Currently unused; kept so existing callers do not break.
    seed_val : int
        Value for the booster's ``seed`` parameter.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model)
        Per-class probabilities for ``test_X`` (one column per class) and the
        trained ``xgb.Booster``.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        # NOTE(review): recent xgboost releases replace 'silent' with
        # 'verbosity'; left unchanged to match the version this was run on.
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    early_stopping = test_y is not None
    if early_stopping:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        # The last watchlist entry ('test') is the one that drives early stopping.
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    # With early stopping, xgb.train returns the booster from the *last*
    # round, which includes the rounds trained after the best validation
    # score. Older xgboost releases expose `best_ntree_limit` so predict() can
    # be restricted to the best iteration; if the attribute is absent we fall
    # back to the library's default predict() behaviour.
    best_ntree_limit = getattr(model, 'best_ntree_limit', None)
    if early_stopping and best_ntree_limit:
        pred_test_y = model.predict(xgtest, ntree_limit=best_ntree_limit)
    else:
        pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [5]:
# Quick validation estimate: build a shuffled 5-fold split but evaluate only
# the first fold to keep the run short. Loop over kf.split(...) instead of
# calling next() to score every fold.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
dev_index, val_index = next(kf.split(range(train_X.shape[0])))

# Row-slice features and labels into development / validation parts.
dev_X = train_X[dev_index,:]
val_X = train_X[val_index,:]
dev_y = train_y[dev_index]
val_y = train_y[val_index]

# Passing val_y turns on the watchlist + early stopping inside runXGB.
preds, model = runXGB(dev_X, dev_y, val_X, val_y)
cv_scores.append(log_loss(val_y, preds))
print(cv_scores)

[0]	train-mlogloss:1.03167	test-mlogloss:1.03287
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.977718	test-mlogloss:0.980138
[2]	train-mlogloss:0.930434	test-mlogloss:0.934114
[3]	train-mlogloss:0.891815	test-mlogloss:0.896335
[4]	train-mlogloss:0.855457	test-mlogloss:0.861022
[5]	train-mlogloss:0.824848	test-mlogloss:0.831517
[6]	train-mlogloss:0.798066	test-mlogloss:0.805897
[7]	train-mlogloss:0.774562	test-mlogloss:0.783518
[8]	train-mlogloss:0.753523	test-mlogloss:0.763608
[9]	train-mlogloss:0.73537	test-mlogloss:0.746205
[10]	train-mlogloss:0.718173	test-mlogloss:0.729961
[11]	train-mlogloss:0.703067	test-mlogloss:0.715917
[12]	train-mlogloss:0.690331	test-mlogloss:0.704011
[13]	train-mlogloss:0.677493	test-mlogloss:0.692171
[14]	train-mlogloss:0.667338	test-mlogloss:0.682857
[15]	train-mlogloss:0.656746	test-mlogloss:0.673127
[16]	train-mlogloss:0.647775	t

In [11]:
# Refit on the full training set and predict the test set. No validation
# labels are passed, so there is no early stopping and exactly
# int(200/.8) == 250 rounds are trained.
# NOTE(review): presumably ~200 rounds was the early-stopping point on an
# 80% fold, scaled up by 1/0.8 for the full data -- confirm.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=int(200/.8))

out_df = pd.DataFrame(preds)
# Column order must match the integer class encoding used to build train_y.
# NOTE(review): assumes 0=high, 1=medium, 2=low -- verify against the
# feature-engineering step that produced the pickles.
out_df.columns = ["high", "medium", "low"]

# Assign ids by position. If test_listing is a pandas Series/DataFrame that
# carries a non-default index, a direct assignment would align on index
# labels and silently yield NaN or misplaced ids, because out_df has a fresh
# 0..n-1 index. Flattening to a plain array keeps row i paired with id i.
out_df["listing_id"] = np.asarray(test_listing).ravel()
out_df.to_csv("xgb_starter_SRK_FE.csv", index=False)