# Importing Required Packages

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


You should consider upgrading via the 'C:\Users\Yashveer\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pickle

import warnings
warnings.filterwarnings("ignore")

sns.set()

# Reading Data

In [5]:
train_features_backup = pd.read_csv("../Data/train_features_df.csv")
test_features_backup = pd.read_csv("../Data/test_features_df.csv")

train_features_df = train_features_backup.copy()
train_features_df = train_features_df.drop(["Age_label_enc", "Fare_label_enc"], axis = 1)
train_features_df["Indices"] = train_features_df.index

test_features_df = test_features_backup.copy()
test_features_df = test_features_df.drop(["Age_label_enc", "Fare_mean_enc"], axis = 1)

In [6]:
train_df = pd.read_csv("../Data/train.csv")
test_df = pd.read_csv("../Data/test.csv")

train_preprocessed_df = pd.read_csv("../Data/preprocessed_train_df.csv")
test_preprocessed_df = pd.read_csv("../Data/preprocessed_test_df.csv")

# Train Test Split

In [7]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Base XGBoost Model

In [8]:
xgb.XGBClassifier(objective = "binary:logistic", random_state = 42).fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [36]:
base_xgb = xgb.XGBClassifier(
                    objective = "binary:logistic", 
                    random_state = 42
                ).fit(X_train[:, :-1], y_train, early_stopping_rounds=5, eval_set=[(X_test[:, :-1], y_test)])

[0]	validation_0-error:0.18994
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.18436
[2]	validation_0-error:0.18994
[3]	validation_0-error:0.17877
[4]	validation_0-error:0.17877
[5]	validation_0-error:0.17877
[6]	validation_0-error:0.17877
[7]	validation_0-error:0.17318
[8]	validation_0-error:0.17877
[9]	validation_0-error:0.16760
[10]	validation_0-error:0.16760
[11]	validation_0-error:0.17318
[12]	validation_0-error:0.16760
[13]	validation_0-error:0.16201
[14]	validation_0-error:0.16760
[15]	validation_0-error:0.16201
[16]	validation_0-error:0.16201
[17]	validation_0-error:0.16760
[18]	validation_0-error:0.16201
Stopping. Best iteration:
[13]	validation_0-error:0.16201



## Classification Accuracy

In [37]:
np.where(base_xgb.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8379888268156425

## F1 Score

In [38]:
f1_score(y_true = y_test, y_pred = base_xgb.predict(X_test[:, :-1]))

0.7972027972027972

## Classification Summary

In [39]:
preds = base_xgb.predict(X_test[:, :-1])

f1_df = pd.DataFrame(index = ["Predictions:0", "Predictions:1"], columns = ["Actuals:0", "Actuals:1"])

f1_df.at["Predictions:0", "Actuals:0"] = np.where((preds == 0) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:0", "Actuals:1"] = np.where((preds == 0) & (y_test == 1), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:0"] = np.where((preds == 1) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:1"] = np.where((preds == 1) & (y_test == 1), 1, 0).sum()

f1_df

Unnamed: 0,Actuals:0,Actuals:1
Predictions:0,93,17
Predictions:1,12,57


## Error Analysis

In [40]:
preds = base_xgb.predict(X_test[:, :-1])

p0_a1 = X_test[:, -1][(preds == 0) & (y_test == 1)]
p1_a0 = X_test[:, -1][(preds == 1) & (y_test == 0)]


print(base_xgb.classes_)


preds_proba = base_xgb.predict_proba(X_test[:, :-1])

error_df = pd.concat([train_df.loc[p0_a1], train_df.loc[p1_a0]], axis = 0)
error_df["Prediction_Proba"] = preds_proba[:, 1][((preds == 0) & (y_test == 1)) | ((preds == 1) & (y_test == 0))]

error_df

[0 1]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction_Proba
447.0,448,1,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,113794,26.55,,S,0.529738
192.0,193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.8542,,S,0.108202
673.0,674,1,2,"Wilhelms, Mr. Charles",male,31.0,0,0,244270,13.0,,S,0.437607
141.0,142,1,3,"Nysten, Miss. Anna Sofia",female,22.0,0,0,347081,7.75,,S,0.145879
204.0,205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S,0.437607
23.0,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,0.809651
712.0,713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S,0.089335
338.0,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S,0.102173
286.0,287,1,3,"de Mulder, Mr. Theodore",male,30.0,0,0,345774,9.5,,S,0.956771
209.0,210,1,1,"Blank, Mr. Henry",male,40.0,0,0,112277,31.0,A31,C,0.893663


# Base XGBoost with class balancing

In [41]:
class_weight = pd.Series(y_train).value_counts()[0] / pd.Series(y_train).value_counts()[1]

In [42]:
xgb.XGBClassifier(objective = "binary:logistic", random_state = 42, scale_pos_weight = class_weight)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=42, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1.6567164179104477,
              subsample=None, tree_method=None, validate_parameters=None,
              verbosity=None)

In [47]:
balanced_base_xgb = xgb.XGBClassifier(
                        objective = "binary:logistic",
                        random_state = 42, 
                        scale_pos_weight = class_weight
                    ).fit(X_train[:, :-1], y_train, early_stopping_rounds=5, eval_set=[(X_test[:, :-1], y_test)])

[0]	validation_0-error:0.17318
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.17877
[2]	validation_0-error:0.16201
[3]	validation_0-error:0.17877
[4]	validation_0-error:0.17877
[5]	validation_0-error:0.16760
[6]	validation_0-error:0.17318
[7]	validation_0-error:0.16760
Stopping. Best iteration:
[2]	validation_0-error:0.16201



In [48]:
np.where(balanced_base_xgb.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8379888268156425

In [49]:
f1_score(y_pred = balanced_base_xgb.predict(X_test[:, :-1]), y_true = y_test)

0.802721088435374

## Creating Base Submission (Balanced and Unbalanced)

## Unbalanced

In [52]:
X, y = train_features_df.values, train_df["Survived"].values

full_base_xgb = xgb.XGBClassifier(
                        objective = "binary:logistic", 
                        random_state = 42
                    ).fit(X[:, :-1], y, early_stopping_rounds = 5, 
                          eval_metric = "error", eval_set = [(X[:, :-1], y)])

base_sub7 = pd.read_csv("../Data/gender_submission.csv")
base_sub7["Survived"] = full_base_xgb.predict(test_features_df.values)
base_sub7.head()

[0]	validation_0-error:0.14254
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.13468
[2]	validation_0-error:0.12009
[3]	validation_0-error:0.11560
[4]	validation_0-error:0.11336
[5]	validation_0-error:0.10887
[6]	validation_0-error:0.10999
[7]	validation_0-error:0.11336
[8]	validation_0-error:0.11223
[9]	validation_0-error:0.11448
[10]	validation_0-error:0.10887
Stopping. Best iteration:
[5]	validation_0-error:0.10887



Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [53]:
base_sub7.to_csv("sub7_base_xgb.csv", index = False)

## Balanced

In [55]:
X, y = train_features_df.values, train_df["Survived"].values

class_weight = pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]

full_bal_base_xgb = xgb.XGBClassifier(
                        objective = "binary:logistic",
                        scale_pos_weight = class_weight,
                        random_state = 42
                    ).fit(X[:, :-1], y, early_stopping_rounds = 5, 
                          eval_metric = "error", eval_set = [(X[:, :-1], y)])

bal_base_sub7 = pd.read_csv("../Data/gender_submission.csv")
bal_base_sub7["Survived"] = full_bal_base_xgb.predict(test_features_df.values)
bal_base_sub7.head()

[0]	validation_0-error:0.14029
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.14141
[2]	validation_0-error:0.13917
[3]	validation_0-error:0.13580
[4]	validation_0-error:0.12009
[5]	validation_0-error:0.11223
[6]	validation_0-error:0.10774
[7]	validation_0-error:0.10887
[8]	validation_0-error:0.10550
[9]	validation_0-error:0.10438
[10]	validation_0-error:0.10438
[11]	validation_0-error:0.10213
[12]	validation_0-error:0.09540
[13]	validation_0-error:0.09540
[14]	validation_0-error:0.09203
[15]	validation_0-error:0.09315
[16]	validation_0-error:0.08754
[17]	validation_0-error:0.08417
[18]	validation_0-error:0.08305
[19]	validation_0-error:0.08081
[20]	validation_0-error:0.07744
[21]	validation_0-error:0.07520
[22]	validation_0-error:0.07632
[23]	validation_0-error:0.07520
[24]	validation_0-error:0.06846
[25]	validation_0-error:0.06846
[26]	validation_0-error:0.06958
[27]	validation_0-error:0.06734
[28]	validation_0-error:0.06622
[29]	validation_0

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1


In [56]:
bal_base_sub7.to_csv("sub7_bal_base_xgb.csv", index = False)

# Optimizing XGBoot

## Data

In [57]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Base Model

In [59]:
base_xgb = xgb.XGBClassifier(objective = "binary:logistic", random_state = 42)

## Parameter Grid

In [104]:
list(np.arange(1, 2, 1/100))

100

In [112]:
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, 
#               colsample_bytree=0.7, gamma=0, gpu_id=-1, importance_type='gain', 
#               interaction_constraints='', learning_rate=0.05, max_delta_step=0, max_depth=3, 
#               min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, 
#               num_parallel_tree=1, objective='binary:logistic', random_state=42, reg_alpha=0, reg_lambda=0, 
#               scale_pos_weight=1, subsample=0.7, tree_method='exact', validate_parameters=1, verbosity=None)

# Accuracy - 83.79% (same as base model)

param_grid = [
    {
        "max_depth" : [3, 4, 5], # 3
        "subsample" : [0.5, 0.7, 0.9, 1.0], # 0.7
        "colsample_bytree" : [0.5, 0.7, 1.0], # 0.7
        "gamma" : [0, 0.25, 1.0], # 0
        "reg_lambda" : [0, 1.0, 10.0], # 0
        "learning_rate": [0.1, 0.05, 0.01] # 0.05
    }
]

## Grid Search

### Grid Object

In [113]:
xgb_grid = GridSearchCV(base_xgb, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
# rf_grid = RandomizedSearchCV(base_rf, 
#                              param_distributions = param_grid, 
#                              cv = 3, verbose=True, n_jobs=-1, n_iter = 40)

### Fitting All Models in the Grid

In [114]:
xgb_grid_fit = xgb_grid.fit(X_train[:, :-1], y_train)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 1680 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 2916 out of 2916 | elapsed:   48.3s finished


### Best Estimator

In [115]:
xgb_grid_fit.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=1, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [116]:
int_best_xgb = xgb_grid_fit.best_estimator_

**Classification Accuracy**

In [117]:
np.where(int_best_xgb.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8379888268156425

**F1 Score**

In [118]:
f1_score(y_true = y_test, y_pred = int_best_xgb.predict(X_test[:, :-1]))

0.8053691275167786

### Exporting Best Internal Model

In [83]:
filename = "best_int_xgb.sav"
pickle.dump(int_best_xgb, open(filename, 'wb'))

In [84]:
filename = "best_int_xgb.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [85]:
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=1, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Final Submission File

## Fitting Model on Full Training data

In [86]:
X, y = train_features_df.values, train_df["Survived"].values

final_xgb = xgb_grid.fit(X[:, :-1], y)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 1680 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 2916 out of 2916 | elapsed:   52.8s finished


In [93]:
final_xgb.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=1, subsample=0.5,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [94]:
best_ext_xgb = final_xgb.best_estimator_

## Predictions

In [95]:
final_sub7 = pd.read_csv("../Data/gender_submission.csv")
final_sub7["Survived"] = best_ext_xgb.predict(test_features_df.values)
final_sub7.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [96]:
final_sub7.to_csv("sub7_final_xgb.csv", index = False)

## Exporting Model

In [97]:
filename = "best_ext_xgb.sav"
pickle.dump(best_ext_xgb, open(filename, 'wb'))

In [98]:
filename = "best_ext_xgb.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [99]:
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=1, subsample=0.5,
              tree_method='exact', validate_parameters=1, verbosity=None)