# Allstate week 1

As a starting point, this week we will be running a starter script using a single XGBoost.

The purpose is to get familiar with the data, programming environment and XGBoost.


In [1]:
import xgboost as xgb
import pandas as pd
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_absolute_error
%matplotlib inline

In [2]:
def logregobj(labels, preds):
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def log_mae(y,yhat,shift):
    return mean_absolute_error(np.exp(y)-shift, np.exp(yhat)-shift)

log_mae_scorer = metrics.make_scorer(log_mae, greater_is_better = False)

def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
##Grid Search for the best model
    model = grid_search.GridSearchCV(estimator  = est,
                                     param_grid = param_grid,
                                     scoring    = log_mae_scorer,
                                     verbose    = 10,
                                     n_jobs  = n_jobs,
                                     iid        = True,
                                     refit    = refit,
                                     cv      = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model


def search_model_mae (train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
##Grid Search for the best model
    model = grid_search.GridSearchCV(estimator  = est,
                                     param_grid = param_grid,
                                     scoring    = 'neg_mean_absolute_error',
                                     verbose    = 10,
                                     n_jobs  = n_jobs,
                                     iid        = True,
                                     refit    = refit,
                                     cv      = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model

## Load Data

In [5]:
# Load data
start = time.time() 
train_data = pd.read_csv('../input/train.csv')
train_size=train_data.shape[0]
print ("Loading train data finished in %0.3fs" % (time.time() - start))        

test_data = pd.read_csv('../input/test.csv')
print ("Loading test data finished in %0.3fs" % (time.time() - start))        

Loading train data finished in 2.287s
Loading test data finished in 3.809s


In [6]:
train_data.head(5)

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


## Merge train and test

This will save our time on duplicating logics for train and test and will also ensure the transformations applied on train and test are the same.

In [7]:
full_data=pd.concat([train_data
                       ,test_data])
del( train_data, test_data)
print ("Full Data set created.")

Full Data set created.


## Group features

In this step we will group the features into different groups so we can preprocess them seperately afterward.

In [9]:
data_types = full_data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'id'
target_col = 'loss'
num_cols.remove('id')
num_cols.remove('loss')

print ("Categorical features:", cat_cols)
print ( "Numerica features:", num_cols)
print ( "ID: %s, target: %s" %( id_col, target_col))

('Categorical features:', ['cat1', 'cat10', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat11', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat2', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat3', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat4', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat5', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat6', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat7', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat8', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat9', 'cat90', 'c

## Categorical features 
### 1. Label Encoding (Factorizing)
### 2. One Hot Encoding (get dummies) - to be discussed in week 2
### 3. Leave-one-out Encoding - to be discussed in week 2

In [10]:
LBL = preprocessing.LabelEncoder()

for cat_col in cat_cols:
#     print ("Factorize feature %s" % (cat))
    full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
print ("Label Encoding categorical features is done.")


Label Encoding categorical features is done.


## Custom objective function and validation function

Instead of using the original data, validation function and objective function, we will be applying log function on the target and using custom functions.

This combination of approach has been proved to be effective for regression competitions using MAE metric.

In [11]:
full_cols = cat_cols + num_cols

offset = 200
param_grid = {'objective':[logregobj],
              'learning_rate':[0.02],
              'n_estimators':[1500],
              'max_depth': [9],
              'min_child_weight':[50],
              'subsample': [0.78],
              'colsample_bytree':[0.67],
              'gamma':[0.9],
              'nthread': [-1],
              'seed' : [1234]}

model = search_model(full_data[:train_size][full_cols].values,
                     np.log(full_data[:train_size].loss.values+200),
                     xgb.XGBRegressor(),
                     param_grid,
                     n_jobs = 1,
                     cv = 4,
                     refit = True)



Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9 
[CV]  colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9, score=-1132.879663 -10.2min
[CV] colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9 


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed: 10.2min


[CV]  colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9, score=-1139.928564 -45.6min
[CV] colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9 
[CV]  colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9, score=-1135.962090 -10.1min
[CV] colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function logregobj at 0x1163b56e0>, max_depth=9, gamma=0.9 
[CV]  colsample_bytree=0.67, learning_rate=0.02, nthread=-1, min_child_weight=50, n_estimators=1500, subsample=0.78, seed=1234, objective=<function lo

[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed: 107.8min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 107.8min finished


Best score: -1135.719
('Best parameters set:', {'colsample_bytree': 0.67, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 50, 'n_estimators': 1500, 'subsample': 0.78, 'seed': 1234, 'objective': <function logregobj at 0x1163b56e0>, 'max_depth': 9, 'gamma': 0.9})
('Scores:', [mean: -1135.71852, std: 2.66695, params: {'colsample_bytree': 0.67, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 50, 'n_estimators': 1500, 'subsample': 0.78, 'seed': 1234, 'objective': <function logregobj at 0x1163b56e0>, 'max_depth': 9, 'gamma': 0.9}])


## Predict and make submission

In [13]:
pred_y = np.exp(model.predict(full_data[train_size:][full_cols].values)) - offset

results = pd.DataFrame()
results['id'] = full_data[train_size:].id
results['loss'] = pred_y
results.to_csv("../output/sub_xgb_starter.csv", index=False)
print ("Submission created.")

Submission created.
