In [None]:
import pandas as pd
import xgboost as xgb
import csv
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import preprocessing
from sklearn import cross_validation

In [None]:
# Feature tables come from the "prep1" files.
train = pd.read_csv("./input/train_prep1.csv")
test = pd.read_csv("./input/test_prep1.csv")
# The raw test file is loaded separately; below it is used as the source of submission IDs.
test_forIDs = pd.read_csv("./input/test.csv")

# Pull out the label and the identifiers before they are stripped from the feature frames.
train_target = train["TARGET"]
train_ids = train["ID"]
test_ids = test_forIDs["ID"]

# Keep feature columns only.
train = train.drop(labels=["ID", "TARGET"], axis=1)
test = test.drop(labels=["ID"], axis=1)

In [None]:
# Wrap the feature arrays in xgboost DMatrix objects; only the training matrix carries labels.
xgtest = xgb.DMatrix(test.values)
xgtrain = xgb.DMatrix(train.values, label=train_target)

In [None]:
# Booster settings for the native xgboost API calls below (xgb.cv / xgb.train).
xgboost_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.09,  # alternatives left by the author: 0.06, 0.01
    'min_child_weight': 240,
    'subsample': 0.5,
    'colsample_bytree': 0.9,
    'max_depth': 30,
    'lambda': 0.1,
}

In [None]:
# Quick 5-fold cross-validation over 10 boosting rounds; the returned result is the cell output.
xgb.cv(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=10,
    nfold=5,
)

### RUN BASE XGB MODEL

In [None]:
# Train the booster on the full training matrix for a fixed number of rounds
# (no evaluation set and no early stopping are passed).
boost_round = 2000
clf = xgb.train(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=boost_round,
    maximize=False,
    verbose_eval=True,
)

In [None]:
print('Predict...')
# The booster was trained without early stopping, so there is no meaningful "best"
# iteration to truncate at. `clf.best_iteration` is either missing or a 0-based index
# (which, used as a tree count, silently drops the last tree), and `ntree_limit` is not
# accepted by recent xgboost releases. Predicting with the default uses every tree.
test_preds = clf.predict(xgtest)
# Write the submission file: one predicted probability per test ID.
pd.DataFrame({"ID": test_ids, "TARGET": test_preds}).to_csv('xgb_submission1.csv',index=False)

### RUN BASE XGB MODEL WITHOUT PREPROCESSING

In [None]:
# Running xgboost on the raw files, without removing highly correlated features.
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

train_ids = train.ID
# Take the IDs from the frame loaded just above (the same ./input/test.csv file)
# instead of `test_forIDs`, which only exists if an earlier cell has been run.
test_ids = test.ID
train_target = train.TARGET

# Keep feature columns only.
train = train.drop(["ID","TARGET"],axis=1)
test = test.drop(["ID"],axis=1)

In [None]:
# Rebuild the DMatrix objects from the current frames; labels are attached to the training matrix only.
xgtest = xgb.DMatrix(test.values)
xgtrain = xgb.DMatrix(train.values, label=train_target)

In [None]:
# Booster settings for the native xgboost API calls below (xgb.cv / xgb.train).
xgboost_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.09,  # alternatives left by the author: 0.06, 0.01
    'min_child_weight': 240,
    'subsample': 0.5,
    'colsample_bytree': 0.9,
    'max_depth': 30,
    'lambda': 0.1,
}

In [None]:
# Quick 5-fold cross-validation over 10 boosting rounds; the returned result is the cell output.
xgb.cv(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=10,
    nfold=5,
)

In [None]:
# Train on the full training matrix for a fixed number of rounds (no early stopping).
boost_round = 2000
clf = xgb.train(xgboost_params,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)

print('Predict...')
# Without early stopping there is no meaningful "best" iteration to truncate at:
# `clf.best_iteration` is either missing or a 0-based index (which, used as a tree
# count, drops the last tree), and `ntree_limit` is not accepted by recent xgboost
# releases. Predicting with the default uses every tree.
test_preds = clf.predict(xgtest)
# Write the submission file: one predicted probability per test ID.
pd.DataFrame({"ID": test_ids, "TARGET": test_preds}).to_csv('xgb_submission2_noPrep.csv',index=False)

### RUN BASE XGB MODEL removing zero SD parameters

In [None]:
# Reload the raw files; the zero-standard-deviation columns are removed in the next cell.
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

train_ids = train.ID
# Take the IDs from the frame loaded just above (the same ./input/test.csv file)
# instead of `test_forIDs`, which only exists if an earlier cell has been run.
test_ids = test.ID
train_target = train.TARGET

# Keep feature columns only.
train = train.drop(["ID","TARGET"],axis=1)
test = test.drop(["ID"],axis=1)

In [None]:
# Remove features that hold a single value in the training set (i.e. zero standard deviation).
# Distinct values are counted instead of testing `np.std(...) == 0`: for a constant float
# column the computed standard deviation can be a tiny non-zero rounding residue, so the
# exact comparison would keep it. `nunique` ignores NaN, so all-NaN columns are dropped too.
rmList = [param for param in train.columns if train[param].nunique() <= 1]

# Drop the same columns from both frames so train and test stay aligned.
train = train.drop(rmList,axis=1)
test = test.drop(rmList,axis=1)

In [None]:
# Rebuild the DMatrix objects from the reduced frames; labels are attached to the training matrix only.
xgtest = xgb.DMatrix(test.values)
xgtrain = xgb.DMatrix(train.values, label=train_target)

In [None]:
# Booster settings for the native xgboost API calls below (xgb.cv / xgb.train).
xgboost_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.09,  # alternatives left by the author: 0.06, 0.01
    'min_child_weight': 240,
    'subsample': 0.5,
    'colsample_bytree': 0.9,
    'max_depth': 30,
    'lambda': 0.1,
}

In [None]:
# Quick 5-fold cross-validation over 10 boosting rounds; the returned result is the cell output.
xgb.cv(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=10,
    nfold=5,
)

In [None]:
# Train on the full training matrix for a fixed number of rounds (no early stopping).
boost_round = 2000
clf = xgb.train(xgboost_params,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)

print('Predict...')
# Without early stopping there is no meaningful "best" iteration to truncate at:
# `clf.best_iteration` is either missing or a 0-based index (which, used as a tree
# count, drops the last tree), and `ntree_limit` is not accepted by recent xgboost
# releases. Predicting with the default uses every tree.
test_preds = clf.predict(xgtest)
# Write the submission file: one predicted probability per test ID.
pd.DataFrame({"ID": test_ids, "TARGET": test_preds}).to_csv('xgb_submission3_noZeroSD.csv',index=False)

## Run extra tree classifier

In [None]:
# ExtraTrees base model, wrapped in 5-fold isotonic probability calibration.
extc = ExtraTreesClassifier(
    n_estimators=700,
    criterion='entropy',
    max_features=50,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=5,
)
calibrated_clf = CalibratedClassifierCV(extc, method='isotonic', cv=5)

In [None]:
# Fit the calibrated ExtraTrees wrapper on the current training features and labels.
calibrated_clf.fit(X=train, y=train_target)

In [None]:
print('Predict...')
# Predict with the fitted calibrated wrapper. CalibratedClassifierCV fits clones of its
# base estimator, so `extc` itself was never fitted and `extc.predict_proba` would fail.
test_pred = calibrated_clf.predict_proba(test)
# 5-fold ROC-AUC of the uncalibrated base estimator (cross_val_score fits its own clones).
scores = cross_validation.cross_val_score(extc,train,train_target,cv = 5, scoring = "roc_auc",verbose=1,n_jobs= 5)

In [None]:
# Submission file: column 1 of the predict_proba output is written as TARGET for each test ID.
pd.DataFrame(
    {
        "ID": test_ids,
        "TARGET": test_pred[:, 1],
    }
).to_csv('extratree_calib_1.csv', index=False)

## Run Calibrated XGB on reduced parameters

In [None]:
# Booster settings (native xgboost parameter names) for the calibrated XGB model in the next cell.
xgboost_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.09,  # alternatives left by the author: 0.06, 0.01
    'min_child_weight': 240,
    'subsample': 0.5,
    'colsample_bytree': 0.9,
    'max_depth': 30,
    'lambda': 0.1,
}

In [None]:
boost_round = 2000

# The scikit-learn wrapper is configured through keyword arguments, not the native-API
# params dict: passed positionally, the dict is bound to the first constructor parameter
# or rejected outright. The native names are mapped onto the wrapper's names
# (eta -> learning_rate, lambda -> reg_lambda), and the number of boosting rounds becomes
# n_estimators. "booster" and "eval_metric" are not forwarded: gbtree is the documented
# default booster, and no evaluation set is used in this fit.
clf = xgb.XGBClassifier(
    objective=xgboost_params["objective"],
    learning_rate=xgboost_params["eta"],
    n_estimators=boost_round,
    max_depth=xgboost_params["max_depth"],
    min_child_weight=xgboost_params["min_child_weight"],
    subsample=xgboost_params["subsample"],
    colsample_bytree=xgboost_params["colsample_bytree"],
    reg_lambda=xgboost_params["lambda"],
)
calibrated_clf = CalibratedClassifierCV(clf,method='isotonic', cv=5)
# The third positional parameter of CalibratedClassifierCV.fit is sample_weight, so the
# round count must not be passed here; it is already set on the classifier above.
calibrated_clf.fit(train,train_target)

In [None]:
# Class-probability predictions for the test features from the calibrated model.
test_pred = calibrated_clf.predict_proba(X=test)

In [None]:
# Submission file: column 1 of the predict_proba output is written as TARGET for each test ID.
pd.DataFrame(
    {
        "ID": test_ids,
        "TARGET": test_pred[:, 1],
    }
).to_csv('xgb_calib_1.csv', index=False)

In [None]:
# IPython help lookup for xgb.XGBClassifier (interactive-only syntax, not valid plain Python).
# NOTE(review): looks like a leftover from development — confirm whether this cell is still needed.
?xgb.XGBClassifier