In [18]:
import warnings
warnings.filterwarnings('ignore')

In [19]:
import xgboost as xgb
from xgboost import XGBClassifier

#import lightgbm as gbm
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from collections import Counter
from sklearn import cross_validation, metrics 
import matplotlib.pylab as plt


# use r preceding windows filepath when \ returns error
train = pd.read_csv(r'C:\Users\user\Desktop\Kaggle\Titanic\train.csv')
test = pd.read_csv(r'C:\Users\user\Desktop\Kaggle\Titanic\test.csv')

In [20]:
#Improved Age Interpolation based on Pclass, Parch, Sibsp

index_NaN_age = list(train["Age"][train["Age"].isnull()].index)

for i in index_NaN_age :
    age_med = train["Age"].median()
    age_pred = train["Age"][((train['SibSp'] == train.iloc[i]["SibSp"]) & (train['Parch'] == train.iloc[i]["Parch"]) & (train['Pclass'] == train.iloc[i]["Pclass"]))].median()
    if not np.isnan(age_pred) :
        train['Age'].iloc[i] = age_pred
    else :
        train['Age'].iloc[i] = age_med
        
# Filling missing value of Age in test

index_NaN_age = list(test["Age"][test["Age"].isnull()].index)

for i in index_NaN_age :
    age_med = test["Age"].median()
    age_pred = test["Age"][((test['SibSp'] == test.iloc[i]["SibSp"]) & (test['Parch'] == test.iloc[i]["Parch"]) & (test['Pclass'] == test.iloc[i]["Pclass"]))].median()
    if not np.isnan(age_pred) :
        test['Age'].iloc[i] = age_pred
    else :
        test['Age'].iloc[i] = age_med

#Add title variable
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in train["Name"]]
train["Title"] = pd.Series(dataset_title)
train["Title"].head()

dataset_title = [i.split(",")[1].split(".")[0].strip() for i in test["Name"]]
test["Title"] = pd.Series(dataset_title)
test["Title"].head()

# Convert to categorical values Title train
train["Title"] = train["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train["Title"] = train["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
train["Title"] = train["Title"].astype(int)

# Convert to categorical values Title test
test["Title"] = test["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test["Title"] = test["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
test["Title"] = test["Title"].astype(int)

# Drop Name variable
train.drop(labels = ["Name"], axis = 1, inplace = True)
test.drop(labels = ["Name"], axis = 1, inplace = True)

# Create a family size descriptor from SibSp and Parch

train["Fsize"] = train["SibSp"] + train["Parch"] + 1
# Create new feature of family size
train['Single'] = train['Fsize'].map(lambda s: 1 if s == 1 else 0)
train['SmallF'] = train['Fsize'].map(lambda s: 1 if s == 2  else 0)
train['MedF']   = train['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
train['LargeF'] = train['Fsize'].map(lambda s: 1 if s >= 5 else 0)

test["Fsize"] = test["SibSp"] + test["Parch"] + 1
test['Single'] = test['Fsize'].map(lambda s: 1 if s == 1 else 0)
test['SmallF'] = test['Fsize'].map(lambda s: 1 if s == 2  else 0)
test['MedF']   = test['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
test['LargeF'] = test['Fsize'].map(lambda s: 1 if s >= 5 else 0)


# Create the column Child and assign to 'NaN'
train["Child"] = float('NaN')
test["Child"] = float('NaN')

# Assign 1 to passengers < 20, 0 to those >= 20*******************************
age_var = 9
train["Child"][train["Age"] < age_var] = 1
train["Child"][train["Age"] >= age_var] = 0

test["Child"][test["Age"] < age_var] = 1
test["Child"][test["Age"] >= age_var] = 0

# Convert male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1

# Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna("S")
# Embarked to int
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

test.Fare[152] = test.Fare.median()

In [21]:
from sklearn import preprocessing
for f in train.columns: 
    if train[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder() 
        lbl.fit(list(train[f].values)) 
        train[f] = lbl.transform(list(train[f].values))
        
for f in test.columns: 
    if test[f].dtype=='object': 
       lbl = preprocessing.LabelEncoder() 
       lbl.fit(list(test[f].values)) 
       test[f] = lbl.transform(list(test[f].values))

In [22]:
kfold = StratifiedKFold(n_splits=10)

#Scoring Function**********************************************************************************
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv = 5, scoring=scoring)
    return np.mean(xval)

In [23]:
train_data = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked", "Child", 
                    "Title", "Fsize", "Single", "SmallF", "MedF", "LargeF"]]
train_features = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked", "Child", 
                    "Title", "Fsize", "Single", "SmallF", "MedF", "LargeF"]].values 


target = train["Survived"].values    

test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked", "Child",
                      "Title", "Fsize", "Single", "SmallF", "MedF", "LargeF"]].values   

test_data = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked", "Child",
                      "Title", "Fsize", "Single", "SmallF", "MedF", "LargeF"]]

print(train_data.columns.shape)
print(train_features.shape)

(14,)
(891, 14)


In [24]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=target)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                            verbose_eval=True,metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], target,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    #print(xgb.cv.results)    
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(target, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))
                    
#    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#    feat_imp.plot(kind='bar', title='Feature Importances')
#    plt.ylabel('Feature Importance Score')

In [25]:
predictors = train_data.columns#[x for x in train_data.columns]# if x not in [ target]] #[train_features]#

xgb1 = XGBClassifier(
                     learning_rate =0.1,
                     n_estimators=206,
                     reg_alpha =0.02,
                     reg_lambda =0.0,                    
                     max_depth=9,
                     min_child_weight=4,
                     gamma=0.1,
                     subsample=0.6,
                     colsample_bytree=0.5,
                     objective= 'binary:logistic',
                     nthread=3,
                     scale_pos_weight=1,
                     seed=97263)

modelfit(xgb1, train_data, predictors)

[0]	train-auc:0.875244+0.00633504	test-auc:0.856761+0.0136696
[1]	train-auc:0.884573+0.00982084	test-auc:0.85641+0.0129018
[2]	train-auc:0.894673+0.00824553	test-auc:0.867478+0.0235756
[3]	train-auc:0.899356+0.00675123	test-auc:0.872204+0.020904
[4]	train-auc:0.905007+0.00333085	test-auc:0.874442+0.0220905
[5]	train-auc:0.906378+0.00258794	test-auc:0.875595+0.0248951
[6]	train-auc:0.908232+0.00237912	test-auc:0.876592+0.0235064
[7]	train-auc:0.910062+0.00253393	test-auc:0.876549+0.0252005
[8]	train-auc:0.911967+0.00329363	test-auc:0.880104+0.0241729
[9]	train-auc:0.912025+0.00400464	test-auc:0.880494+0.0213869
[10]	train-auc:0.913011+0.00457314	test-auc:0.881848+0.0213153
[11]	train-auc:0.913523+0.00454911	test-auc:0.882138+0.021344
[12]	train-auc:0.91359+0.00447936	test-auc:0.881364+0.021507
[13]	train-auc:0.914074+0.00452832	test-auc:0.881039+0.0207107
[14]	train-auc:0.913345+0.00528393	test-auc:0.880233+0.0206195
[15]	train-auc:0.914828+0.00485776	test-auc:0.881406+0.020063
[16]	tra

[131]	train-auc:0.950683+0.00356028	test-auc:0.886707+0.0222558
[132]	train-auc:0.95075+0.00354234	test-auc:0.887646+0.0221876
[133]	train-auc:0.9509+0.00377836	test-auc:0.887007+0.0225294
[134]	train-auc:0.950814+0.00368231	test-auc:0.886339+0.0223258
[135]	train-auc:0.951013+0.00364128	test-auc:0.88648+0.0225967
[136]	train-auc:0.951294+0.00350436	test-auc:0.886388+0.023181
[137]	train-auc:0.951461+0.00367054	test-auc:0.886163+0.0228412
[138]	train-auc:0.951735+0.0036818	test-auc:0.885364+0.0226422
[139]	train-auc:0.951774+0.00364077	test-auc:0.886189+0.0227747
[140]	train-auc:0.951836+0.0036848	test-auc:0.886321+0.0234906
[141]	train-auc:0.952154+0.00371638	test-auc:0.886191+0.0229666
[142]	train-auc:0.952189+0.00376923	test-auc:0.886231+0.0235866
[143]	train-auc:0.952284+0.00362515	test-auc:0.8864+0.0234238
[144]	train-auc:0.952428+0.00360193	test-auc:0.886041+0.0232621
[145]	train-auc:0.952532+0.00367822	test-auc:0.885338+0.0233628
[146]	train-auc:0.952544+0.00343519	test-auc:0.88

In [30]:
run_gs = True
if run_gs:
    XGB_Param = XGBClassifier()
    
   
    
    param_grid_1 = { 'learning_rate' : [0.1], 
                    'reg_alpha':[0.02],
                     'reg_lambda':[0.0],
                    'nthread':[3],
                    'n_estimators':range(100,2000,50) ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[97263]}
    
    
    
    param_grid_1X = { 'learning_rate' : [0.09,0.1,0.11], 
                    'reg_alpha':[0.01,0.02,0.03],
                     'reg_lambda':[0.0,0.1],
                    'nthread':[3],
                    'n_estimators':[206] ,  #206
                    'min_child_weight':[3,4,5],
                     'max_depth':[8,9,10],
                     'gamma':[0,0.1,0.2],
                     'subsample':[0.5,0.6,0.7],
                     'colsample_bytree':[0.4,0.5,0.6],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[0.9,1], 
                      'seed':[97263]}

    
    param_grid_13 = { 'learning_rate' : [0.1], 
                    'reg_alpha':[0.02],
                     'reg_lambda':[0.0],
                    'nthread':[3],
                    'n_estimators':[206] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':range(0,100000,321)}


    param_grid_12 = { 'learning_rate' : [0.1], 
                    'reg_alpha':[0.02],
                     'reg_lambda':[0.0],
                    'nthread':[3],
                    'n_estimators':[207] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    param_grid_11 = { 'reg_alpha':[i/100.0 for i in range(0,5)],
                     'reg_lambda':[i/10.0 for i in range(0,5)],
                    'nthread':[3],
                    'n_estimators':[207] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    param_grid_10 = { 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
                     'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100],
                    'nthread':[3],
                    'n_estimators':[207] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    
    
    param_grid_9 = { 'nthread':[3],
                    'n_estimators':[207] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    param_grid_8 = { 'nthread':[3],
                    'n_estimators':[207] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[0.1],
                     'subsample':[i/10.0 for i in range(2,9)],
                     'colsample_bytree':[i/10.0 for i in range(2,9)],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    
    param_grid_7 = { 'nthread':[3],
                    'n_estimators':[208] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[i/20.0 for i in range(1,3)],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    param_grid_6 = { 'nthread':[3],
                    'n_estimators':[208] ,
                    'min_child_weight':[4],
                     'max_depth':[9],
                     'gamma':[i/10.0 for i in range(0,5)],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    
    param_grid_5 = { 'nthread':[3],
                    'n_estimators':[208] ,
                    'min_child_weight':range(0,10,2),
                     'max_depth':range(5,13,2),
                     'gamma':[0.25],
                     'subsample':[0.6],
                     'colsample_bytree':[0.5],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}
    
    param_grid_4 = { 'nthread':[3],
                     'min_child_weight':[2,3,4],
                     'max_depth':[9,11,13,15,17],
                     'gamma':[0,0.25,0.5],
                     'subsample':[0.4,0.5,0.6],
                     'colsample_bytree':[0.5, 0.95, 1],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[1], 
                      'seed':[5432]}



    param_grid_2 = { 'nthread':[3],
                     'min_child_weight':[1,3,5],
                     'max_depth':[3,5,7,9],
                     'gamma':[0,1],
                     'subsample':[0.5,0.8,1],
                     'colsample_bytree':[0.5, 0.95, 1],
                     'objective': ['binary:logistic'],                       
                     'scale_pos_weight':[0,1], 
                      'seed':[5432]}
    
    param_grid_3 = { 'learning_rate' : [0.01], 
                      'n_estimators':[400,600,800,1000],
                       'max_depth':(4,15),
                        'min_child_weight':[5],
                       'gamma':[0],
                       'subsample':[0.8],
                       'colsample_bytree':[0.95],
                       'reg_alpha':[1e-5],
                       'objective': ['binary:logistic'], 
                      'nthread':[4], 
                      'scale_pos_weight':[1], 
                      'seed':[5432]}


    XGB_model_1 = GridSearchCV(XGB_Param, param_grid=param_grid_1, cv=kfold, scoring="roc_auc", n_jobs=-3, verbose=1)
    XGB_model_1.fit(train_data, target)
    print(XGB_model_1.best_score_)
    print(XGB_model_1.best_params_)

else: 
    param_grid_1 =  {  'learning_rate' : 0.1, 
                       'reg_alpha':0.02,
                      'reg_lambda':0.0,
                      'n_estimators':5000, #205
                      'max_depth':9,
                      'min_child_weight':4, 
                      'gamma':0.1, 
                      'subsample':0.6, 
                      'colsample_bytree':0.5,
                      'objective': 'binary:logistic', 
                      'nthread':3, 
                      'scale_pos_weight':1, 
                      'seed':97263}
    
   
    XGB_model_1 = XGBClassifier(**param_grid_1)
    XGB_model_1.fit(train_data, target)

cv_score = compute_score(XGB_model_1, train_data, target, scoring='accuracy')
print("cross_val_score=", cv_score)
cv_score = compute_score(XGB_model_1, train_data, target, scoring='roc_auc')
print("cross_val_score roc auc=", cv_score)



Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  7.9min finished


0.886912952441
{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 4, 'n_estimators': 150, 'nthread': 3, 'objective': 'binary:logistic', 'reg_alpha': 0.02, 'reg_lambda': 0.0, 'scale_pos_weight': 1, 'seed': 97263, 'subsample': 0.6}
Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  6.9min finished


Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  6.4min finished


Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  7.1min finished


Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  8.2min finished


Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  8.9min finished


cross_val_score= 0.8417846809
Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-3)]: Done 380 out of 380 | elapsed:  8.3min finished


Fitting 10 folds for each of 38 candidates, totalling 380 fits


[Parallel(n_jobs=-3)]: Done  46 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-3)]: Done 196 tasks      | elapsed:  2.8min


KeyboardInterrupt: 

In [None]:
   
final_pickle = False
if final_pickle:
    import pickle
    pickle_name = 'XGB_1_Pickle.sav'
    XGB_pickle = open(pickle_name, 'rb')
    XGB_pickled_model = pickle.load(XGB_pickle)
    print("pickled model",XGB_pickled_model)
    
    XGB_cv_score = compute_score(XGB_pickled_model, train_data, target, scoring='accuracy')
    print("Final pickle_score=", XGB_cv_score)
    final_submit = XGB_pickled_model.predict(test_data)

else:
    final_submit = XGB_model_1.predict(test_data)
    
    XGB_cv_score = compute_score(XGB_model_1, train_data, target, scoring='accuracy')
    print("Final cv_score=", XGB_cv_score)
    cv_score = compute_score(XGB_model_1, train_data, target, scoring='roc_auc')
    print("cross_val_score roc auc=", cv_score)
    
        
    

#Final array*************************************************************************************
PassengerId =np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(final_submit, PassengerId, columns = ["Survived"])

my_solution.to_csv("solution_XGB_3.csv", index_label = ["PassengerId"])

In [2]:
#Sound******************************
import winsound
duration = 500  # millisecond
freq = 500
freq_2 = 450  # Hz
winsound.Beep(freq, duration)
winsound.Beep(freq_2, duration)
winsound.Beep(freq, duration)
winsound.Beep(freq_2, duration)
winsound.Beep(freq, duration)
winsound.Beep(freq_2, duration)
winsound.Beep(freq, duration)
winsound.Beep(freq, duration)
winsound.Beep(freq_2, 1000)
winsound.Beep(freq, duration)
winsound.Beep(freq, duration)
winsound.Beep(freq_2, 1000)

