In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn import preprocessing
from sklearn.cross_validation import KFold

from sklearn.metrics import mean_absolute_error

%matplotlib inline



In [2]:
# Read the training data from 'train.csv' (path is relative to the working directory).
train = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Columns stored with the generic "object" dtype are treated as categorical.
cat_feats = train.select_dtypes(include=["object"]).columns

# For each such column, fit a fresh LabelEncoder on its values and store the
# resulting integer codes in a new sibling column named "<column>_id".
for col_name in cat_feats:
    encoder = preprocessing.LabelEncoder()
    encoded_values = encoder.fit_transform(train[col_name].values)
    train[col_name + '_id'] = encoded_values

In [4]:
# Pick the model inputs by column name: every column whose name contains
# 'cont', followed by every column whose name contains '_id'.
all_columns = list(train.columns)
num_feats = list(filter(lambda name: 'cont' in name, all_columns))
id_feats  = list(filter(lambda name: '_id' in name, all_columns))

# Feature matrix and target vector as plain NumPy arrays.
feature_columns = num_feats + id_feats
X = train[feature_columns].values
y = train['loss'].values

In [5]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one place
# so they can be inspected or tweaked before the estimator is built.
model_params = dict(
    max_depth=12,              # maximum depth of each tree
    learning_rate=0.2,         # shrinkage applied to each boosting step
    n_estimators=20,           # number of boosting rounds
    silent=0,
    objective='reg:linear',
    nthread=-1,
    # gamma = 5290.,
    # min_child_weight = 4.2922,
    subsample=0.7,             # fraction of rows sampled per tree
    colsample_bytree=0.6,      # fraction of columns sampled per tree
    seed=2017,                 # fixed seed for reproducible runs
)

model = xgb.XGBRegressor(**model_params)

In [6]:
# 3-fold cross-validation of `model` on the raw (untransformed) target `y`.
# Each fold refits the model, logs MAE on the train and held-out parts after
# every boosting round, and prints the final held-out MAE.
#
# NOTE(review): KFold is called with the legacy sklearn.cross_validation
# signature (n, n_folds=...) and the object is iterated directly. That module
# was removed in scikit-learn 0.20; moving to sklearn.model_selection.KFold
# requires KFold(n_splits=nfolds, ...).split(X) here as well as a new import.
nfolds = 3
folds = KFold(len(y), n_folds=nfolds, shuffle = True, random_state = 2017)


for num_iter, (train_index, test_index) in enumerate(folds):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test   = X[test_index], y[test_index]

    # Reuse the fold slices built above for the evaluation sets: indexing
    # X/y with the index arrays again would copy every fold a second time.
    model.fit(X_train, y_train,
       eval_metric='mae',
       eval_set=[(X_train, y_train), (X_test, y_test)],
       verbose=True)

    y_pred = model.predict(X_test)
    # Clamp negative predictions to zero before scoring.
    y_pred[y_pred<0] = 0

    score = mean_absolute_error(y_test, y_pred)
    print("Fold{0}, score={1}".format(num_iter+1, score))

[0]	validation_0-mae:2444.59	validation_1-mae:2456.73
[1]	validation_0-mae:1994.44	validation_1-mae:2015.5
[2]	validation_0-mae:1679.35	validation_1-mae:1714.77
[3]	validation_0-mae:1467.29	validation_1-mae:1519.74
[4]	validation_0-mae:1331.48	validation_1-mae:1401.69
[5]	validation_0-mae:1238.07	validation_1-mae:1327.88
[6]	validation_0-mae:1179.27	validation_1-mae:1282.38
[7]	validation_0-mae:1137.55	validation_1-mae:1256.62
[8]	validation_0-mae:1104.23	validation_1-mae:1238.52
[9]	validation_0-mae:1085.17	validation_1-mae:1231.48
[10]	validation_0-mae:1070.75	validation_1-mae:1227.66
[11]	validation_0-mae:1058.24	validation_1-mae:1226.64
[12]	validation_0-mae:1050.15	validation_1-mae:1225.68
[13]	validation_0-mae:1040.02	validation_1-mae:1225.85
[14]	validation_0-mae:1029.32	validation_1-mae:1224.26
[15]	validation_0-mae:1021.08	validation_1-mae:1223.43
[16]	validation_0-mae:1013.28	validation_1-mae:1223.66
[17]	validation_0-mae:1004.45	validation_1-mae:1223.76
[18]	validation_0-mae

## Task

The cell above trains a model that uses `y` as the target variable.
Modify the code so that the model is trained on the logarithm of the target variable instead...


Some tips:
1. y_log_train = np.log(y_train)
2. model.fit(X_train, y_log_train, ...
3. y_log_pred = model.predict(X_test)
4. y_pred = np.exp(y_log_pred)


In [10]:
# 3-fold cross-validation of `model` trained on a log-transformed target.
# The model is fitted on log(1 + y); predictions are mapped back to the
# original scale before the held-out MAE is computed, so the printed fold
# scores are in the original units of `y` while the per-round 'mae' log
# lines emitted by fit() are on the log scale.
#
# NOTE(review): KFold is called with the legacy sklearn.cross_validation
# signature (n, n_folds=...) and the object is iterated directly. That module
# was removed in scikit-learn 0.20; moving to sklearn.model_selection.KFold
# requires KFold(n_splits=nfolds, ...).split(X) here as well as a new import.
nfolds = 3
folds = KFold(len(y), n_folds=nfolds, shuffle = True, random_state = 2017)

for num_iter, (train_index, test_index) in enumerate(folds):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test   = X[test_index], y[test_index]

    # log1p(v) == log(1 + v), computed without losing precision for small v;
    # the +1 shift keeps the transform finite when a target value is 0.
    y_log_train = np.log1p(y_train)
    y_log_test = np.log1p(y_test)

    model.fit(X_train, y_log_train,
       eval_metric='mae',
       eval_set=[(X_train, y_log_train), (X_test, y_log_test)],
       verbose=True)

    y_log_pred = model.predict(X_test)
    # expm1(p) == exp(p) - 1 is the exact inverse of log1p.
    y_pred = np.expm1(y_log_pred)
    # A negative log-scale prediction maps to a negative value; clamp to zero.
    y_pred[y_pred<0] = 0

    score = mean_absolute_error(y_test, y_pred)
    print("Fold{0}, score={1}".format(num_iter+1, score))

[0]	validation_0-mae:5.74862	validation_1-mae:5.75208
[1]	validation_0-mae:4.5996	validation_1-mae:4.60274
[2]	validation_0-mae:3.68053	validation_1-mae:3.68353
[3]	validation_0-mae:2.94528	validation_1-mae:2.94818
[4]	validation_0-mae:2.35766	validation_1-mae:2.36079
[5]	validation_0-mae:1.88847	validation_1-mae:1.89223
[6]	validation_0-mae:1.51523	validation_1-mae:1.51957
[7]	validation_0-mae:1.22132	validation_1-mae:1.22775
[8]	validation_0-mae:0.992724	validation_1-mae:1.00206
[9]	validation_0-mae:0.818921	validation_1-mae:0.832099
[10]	validation_0-mae:0.691232	validation_1-mae:0.7083
[11]	validation_0-mae:0.599814	validation_1-mae:0.621182
[12]	validation_0-mae:0.535346	validation_1-mae:0.560397
[13]	validation_0-mae:0.490003	validation_1-mae:0.518635
[14]	validation_0-mae:0.45736	validation_1-mae:0.490402
[15]	validation_0-mae:0.434806	validation_1-mae:0.471185
[16]	validation_0-mae:0.418197	validation_1-mae:0.458144
[17]	validation_0-mae:0.406375	validation_1-mae:0.449303
[18]	