In [1]:
import sys
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")



In [2]:
#train_df.head()
#train_df.info()
#train_df.describe()
#train_df.describe(include=['O'])
#train_df.Dependents.unique()

In [3]:
train_df['Gender'].fillna(train_df['Gender'].mode()[0], inplace=True)
train_df['Married'].fillna(train_df['Married'].mode()[0], inplace=True)
train_df['Dependents'].fillna(train_df['Dependents'].mode()[0], inplace=True)
train_df['Self_Employed'].fillna(train_df['Self_Employed'].mode()[0], inplace=True)
train_df['LoanAmount'].fillna(train_df['LoanAmount'].mean(), inplace=True)
train_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mean(), inplace=True)
train_df['Credit_History'].fillna(train_df['Credit_History'].mean(), inplace=True)

In [4]:
#train_df.info()
y_train = train_df['Loan_Status']
#y_train.head()
y_train = y_train.apply({'Y':1, 'N':0}.get)
x_train = train_df.drop(['Loan_Status'], axis=1)

In [5]:
train_df1 = x_train.copy();
test_df1 = test_df.copy();

le = LabelEncoder()
train_df1['Loan_ID'] = le.fit_transform(train_df['Loan_ID'])
test_df1['Loan_ID'] = le.fit_transform(test_df['Loan_ID'])

train_df1 = pd.get_dummies(train_df1)
test_df1 = pd.get_dummies(test_df1)

#train_df1.info()
#train_df1.head()
x_train = train_df1
x_submit = test_df1

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x_train,y_train, test_size=.01)

In [7]:
dtrain = xgb.DMatrix(x_train.values, label=y_train, missing=np.nan)
dtest = xgb.DMatrix(x_test.values, label=y_test, missing=np.nan)

In [8]:
params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'silent': 0,
		 'max_depth': 10, 'eta': 0.1,
		 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 20,
		 'max_delta_step': 0, 'gamma': 0}

In [9]:
params['eval_metric'] = "mae"
num_round = 600

In [10]:
clf = xgb.train(params,
                dtrain,
                num_boost_round=num_round,
                evals=[(dtest, "Test")],
                early_stopping_rounds=10
               )

[0]	Test-mae:0.471325
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:0.445183
[2]	Test-mae:0.421263
[3]	Test-mae:0.3999
[4]	Test-mae:0.378042
[5]	Test-mae:0.361626
[6]	Test-mae:0.345161
[7]	Test-mae:0.330374
[8]	Test-mae:0.315483
[9]	Test-mae:0.303382
[10]	Test-mae:0.292919
[11]	Test-mae:0.28511
[12]	Test-mae:0.2776
[13]	Test-mae:0.266955
[14]	Test-mae:0.259912
[15]	Test-mae:0.253368
[16]	Test-mae:0.249095
[17]	Test-mae:0.24482
[18]	Test-mae:0.240738
[19]	Test-mae:0.236424
[20]	Test-mae:0.230433
[21]	Test-mae:0.227584
[22]	Test-mae:0.223287
[23]	Test-mae:0.21876
[24]	Test-mae:0.216452
[25]	Test-mae:0.213617
[26]	Test-mae:0.209145
[27]	Test-mae:0.208894
[28]	Test-mae:0.208551
[29]	Test-mae:0.208764
[30]	Test-mae:0.204848
[31]	Test-mae:0.200274
[32]	Test-mae:0.197465
[33]	Test-mae:0.197614
[34]	Test-mae:0.194913
[35]	Test-mae:0.195363
[36]	Test-mae:0.192843
[37]	Test-mae:0.191424
[38]	Test-mae:0.189541
[39]	Test-mae:0.190254
[40]	Test-mae:0.189813
[41]	Test-mae:0.18

In [11]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [12]:
min_mae = float("Inf")
best_param = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our paramseters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_param = (max_depth,min_child_weight)
print("Best param: {}, {}, MAE: {}".format(best_param[0], best_param[1], min_mae))

params['max_depth'] = best_param[0]
params['min_child_weight'] = best_param[1]

CV with max_depth=9, min_child_weight=5


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 0.301563 for 84 rounds
CV with max_depth=9, min_child_weight=6
	MAE 0.3067076 for 84 rounds
CV with max_depth=9, min_child_weight=7
	MAE 0.30812019999999996 for 84 rounds
CV with max_depth=10, min_child_weight=5
	MAE 0.301563 for 84 rounds
CV with max_depth=10, min_child_weight=6
	MAE 0.3067076 for 84 rounds
CV with max_depth=10, min_child_weight=7
	MAE 0.30812019999999996 for 84 rounds
CV with max_depth=11, min_child_weight=5
	MAE 0.301563 for 84 rounds
CV with max_depth=11, min_child_weight=6
	MAE 0.3067076 for 84 rounds
CV with max_depth=11, min_child_weight=7
	MAE 0.30812019999999996 for 84 rounds
Best param: 9, 5, MAE: 0.301563


In [13]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [14]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

params['subsample'] = best_params[0]
params['colsample_bytree'] = best_params[1]

CV with subsample=1.0, colsample=1.0


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 0.2865678 for 231 rounds
CV with subsample=1.0, colsample=0.9
	MAE 0.2905562 for 153 rounds
CV with subsample=1.0, colsample=0.8
	MAE 0.2916692 for 208 rounds
CV with subsample=1.0, colsample=0.7
	MAE 0.2956174 for 113 rounds
CV with subsample=0.9, colsample=1.0
	MAE 0.29426379999999996 for 129 rounds
CV with subsample=0.9, colsample=0.9
	MAE 0.2941296 for 168 rounds
CV with subsample=0.9, colsample=0.8
	MAE 0.29418540000000004 for 174 rounds
CV with subsample=0.9, colsample=0.7
	MAE 0.2923684 for 170 rounds
CV with subsample=0.8, colsample=1.0
	MAE 0.298458 for 84 rounds
CV with subsample=0.8, colsample=0.9
	MAE 0.30144380000000004 for 85 rounds
CV with subsample=0.8, colsample=0.8
	MAE 0.301563 for 84 rounds
CV with subsample=0.8, colsample=0.7
	MAE 0.3049014 for 87 rounds
CV with subsample=0.7, colsample=1.0
	MAE 0.3008408 for 115 rounds
CV with subsample=0.7, colsample=0.9
	MAE 0.3023186 for 99 rounds
CV with subsample=0.7, colsample=0.8
	MAE 0.30567479999999997 for 102 rounds

In [15]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

params['eta'] = best_params

CV with eta=0.3


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	MAE 0.288078 for 78 rounds

CV with eta=0.2
	MAE 0.2904692 for 85 rounds

CV with eta=0.1
	MAE 0.2865678 for 231 rounds

CV with eta=0.05
	MAE 0.29560200000000003 for 206 rounds

CV with eta=0.01
	MAE 0.3010408 for 599 rounds

CV with eta=0.005
	MAE 0.3128862 for 599 rounds

Best params: 0.1, MAE: 0.2865678


In [16]:
print(params)

{'max_depth': 9, 'colsample_bytree': 1.0, 'eval_metric': 'mae', 'subsample': 1.0, 'gamma': 0, 'max_delta_step': 0, 'eta': 0.1, 'objective': 'binary:logistic', 'silent': 0, 'booster': 'gbtree', 'min_child_weight': 5}


In [17]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:0.463721
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:0.431152
[2]	Test-mae:0.397101
[3]	Test-mae:0.370302
[4]	Test-mae:0.3473
[5]	Test-mae:0.323209
[6]	Test-mae:0.303317
[7]	Test-mae:0.283976
[8]	Test-mae:0.272195
[9]	Test-mae:0.256142
[10]	Test-mae:0.242917
[11]	Test-mae:0.232867
[12]	Test-mae:0.222785
[13]	Test-mae:0.214155
[14]	Test-mae:0.209537
[15]	Test-mae:0.201603
[16]	Test-mae:0.193128
[17]	Test-mae:0.189037
[18]	Test-mae:0.179885
[19]	Test-mae:0.175966
[20]	Test-mae:0.171197
[21]	Test-mae:0.16556
[22]	Test-mae:0.165196
[23]	Test-mae:0.163866
[24]	Test-mae:0.160963
[25]	Test-mae:0.157063
[26]	Test-mae:0.155724
[27]	Test-mae:0.154239
[28]	Test-mae:0.150547
[29]	Test-mae:0.151657
[30]	Test-mae:0.151251
[31]	Test-mae:0.147675
[32]	Test-mae:0.146777
[33]	Test-mae:0.144766
[34]	Test-mae:0.142767
[35]	Test-mae:0.138879
[36]	Test-mae:0.139982
[37]	Test-mae:0.140021
[38]	Test-mae:0.140594
[39]	Test-mae:0.142393
[40]	Test-mae:0.139162
[41]	Test-mae:

In [18]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:0.463721
[1]	Test-mae:0.431152
[2]	Test-mae:0.397101
[3]	Test-mae:0.370302
[4]	Test-mae:0.3473
[5]	Test-mae:0.323209
[6]	Test-mae:0.303317
[7]	Test-mae:0.283976
[8]	Test-mae:0.272195
[9]	Test-mae:0.256142
[10]	Test-mae:0.242917
[11]	Test-mae:0.232867
[12]	Test-mae:0.222785
[13]	Test-mae:0.214155
[14]	Test-mae:0.209537
[15]	Test-mae:0.201603
[16]	Test-mae:0.193128
[17]	Test-mae:0.189037
[18]	Test-mae:0.179885
[19]	Test-mae:0.175966
[20]	Test-mae:0.171197
[21]	Test-mae:0.16556
[22]	Test-mae:0.165196
[23]	Test-mae:0.163866
[24]	Test-mae:0.160963
[25]	Test-mae:0.157063
[26]	Test-mae:0.155724
[27]	Test-mae:0.154239
[28]	Test-mae:0.150547
[29]	Test-mae:0.151657
[30]	Test-mae:0.151251
[31]	Test-mae:0.147675
[32]	Test-mae:0.146777
[33]	Test-mae:0.144766
[34]	Test-mae:0.142767
[35]	Test-mae:0.138879
[36]	Test-mae:0.139982
[37]	Test-mae:0.140021
[38]	Test-mae:0.140594
[39]	Test-mae:0.142393
[40]	Test-mae:0.139162
[41]	Test-mae:0.140789
[42]	Test-mae:0.13872
[43]	Test-mae:0.138262
[4

In [19]:
mean_absolute_error(best_model.predict(dtest), y_test)

0.11081457350935255

In [20]:
best_model.save_model("my_model.model")

loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")

dtest = xgb.DMatrix(x_submit.values, missing=np.nan)

test_preds = load_model.predict(dtest)

In [21]:
seeds = [1122, 2244, 3366, 4488, 5500]
#seeds = [1122]
test_preds = np.zeros((len(x_submit), len(seeds)))

for run in range(len(seeds)):
	sys.stdout.write("\rXGB RUN:{}/{}".format(run+1, len(seeds)))
	sys.stdout.flush()
	params['seed'] = seeds[run]
	clf = xgb.train(params,
                    dtrain,
                    num_boost_round=num_round
                   )
	dtest = xgb.DMatrix(x_submit.values, missing=np.nan)
	test_preds[:, run] = clf.predict(dtest)
    
test_preds = np.mean(test_preds, axis=1)

XGB RUN:5/5

In [22]:
test_df['Loan_Status'] = test_preds
test_df['Loan_Status'] = test_df['Loan_Status'] > 0.5
test_df['Loan_Status'] = test_df['Loan_Status'].apply({True:'Y', False:'N'}.get)
test_df.to_csv("../output/final_solution.csv", index=False)