In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten

import tensorflow as tf

## Pre-processing

In [81]:
pd.set_option('display.max_columns', None)

data = pd.read_csv('preprocessed_test.csv')
ids = data['PassengerId']
data = data.drop(['PassengerId'], axis=1)

for x in data: #categorize columns  
    data[x] = data[x].astype('category')
targets = data['Survived']
inputs = data.drop(['Survived', 'Deck', 'Embarked'], axis=1)
#OneHotEncoding
inputs = pd.get_dummies(inputs, drop_first=True)
features = inputs.columns.values

#LabelEncoding
to_encode = ['Title','Sex', 'Age_Group', 'Fare_Group', 'Embarked', 'Deck']

for f in to_encode:
    le = LabelEncoder()
    le.fit(inputs[f])
    inputs[f] = le.transform(inputs[f])

In [82]:
features

array(['Pclass_2', 'Pclass_3', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Sex_male', 'Age_Group_16-19', 'Age_Group_20-23',
       'Age_Group_24-29', 'Age_Group_30-39', 'Age_Group_40-54',
       'Age_Group_60+', 'Family_Size_1', 'Family_Size_2',
       'Fare_Group_10.5-13', 'Fare_Group_13-25', 'Fare_Group_25-38',
       'Fare_Group_38+', 'Fare_Group_7-8', 'Fare_Group_8-10.5'],
      dtype=object)

In [83]:
inputs = np.array(inputs, dtype='float32')
targets = np.array(targets, dtype='int')
inputs.shape

(418, 21)

In [39]:
X, X_final, Y, y_final = train_test_split(inputs, targets, train_size = .85, random_state=42)

In [40]:
X_train, X_test, y_train, y_test= train_test_split(X, Y, train_size = .85, random_state=42)

## XGBC

#### BaseModel

In [69]:
#Basemodel  
xgbm = xgb.XGBClassifier(n_estimators=30, max_dept=2)
xgbm.fit(X_train, y_train)
print(xgbm.score(X_train, y_train))## overfitting 
print(xgbm.score(X_test, y_test))

0.8553654743390358
0.8508771929824561


In [70]:
fi = pd.DataFrame(xgbm.feature_importances_, columns=['Importance'])
fi['Features'] = features
fi = fi.sort_values('Importance', ascending=False)
fi

Unnamed: 0,Importance,Features
4,0.492627,Title_Mr
1,0.111863,Pclass_3
14,0.071615,Family_Size_2
6,0.056787,Sex_male
17,0.041718,Fare_Group_25-38
2,0.039958,Title_Master
20,0.027272,Fare_Group_8-10.5
18,0.025987,Fare_Group_38+
9,0.014578,Age_Group_24-29
10,0.013639,Age_Group_30-39


##### find best # of features

In [71]:
train_scores = []
test_scores = []

for i in range(1, len(features)+1):
    focus = fi.head(n=i).index
    X_train_reduced = X_train[:, focus]
    X_test_reduced = X_test[:, focus]
    xgbm = xgb.XGBClassifier(n_estimators=50)
    xgbm.fit(X_train_reduced, y_train)
    train_scores.append(xgbm.score(X_train_reduced, y_train))
    test_scores.append(xgbm.score(X_test_reduced, y_test))

In [72]:
results = pd.DataFrame(train_scores, columns=['train'])
results['test'] = test_scores
results

Unnamed: 0,train,test
0,0.786936,0.754386
1,0.793157,0.798246
2,0.821151,0.842105
3,0.821151,0.842105
4,0.824261,0.815789
5,0.832037,0.842105
6,0.842924,0.859649
7,0.842924,0.859649
8,0.849145,0.842105
9,0.849145,0.850877


In [73]:
#7 
focus = fi.head(n=8).index

X_train_reduced = X_train[:, focus]
X_test_reduced = X_test[:, focus]
X_train_reduced.shape

(643, 8)

##### GridSearch for estimators + depth

In [74]:
#Reduced Features 
xgbm = xgb.XGBClassifier(n_estimators=30, max_depth=2)
xgbm.fit(X_train_reduced, y_train)
print(xgbm.score(X_train_reduced, y_train))
print(xgbm.score(X_test_reduced, y_test))

0.8413685847589425
0.868421052631579


In [67]:
model = xgb.XGBClassifier()
n_estimators = [5, 10, 20, 30, 40, 50, 80]
max_depth = [1, 2, 4, 6, 8]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
kfold = KFold(n_splits=5, random_state=80, shuffle=True)
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(X_test_reduced, y_test)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed:    1.5s finished


In [68]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
    #print("%f (%f) with: %r" % (mean, stdev, param)) 
#Best: 0.833202 using {'max_depth':2, 'n_estimators': 30}

Best: 0.833202 using {'max_depth': 2, 'n_estimators': 30}


In [77]:
#final model
model = xgb.XGBClassifier(n_estimators=30, max_depth=2)
model.fit(X_train_reduced, y_train)
print(model.score(X_train_reduced, y_train))
print(model.score(X_test_reduced, y_test))
print(model.score(X_final[:,focus], y_final))

0.8413685847589425
0.868421052631579
0.835820895522388


In [80]:
#final model
model = xgb.XGBClassifier(n_estimators=30, max_depth=2)
model.fit(inputs[:,focus], targets)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=30, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [84]:
predictions = model.predict(inputs[:, focus])
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [85]:

predfinal = pd.DataFrame(predictions, columns = ['Survived'])
predfinal['PassengerId'] = ids.values
predfinal = predfinal[['PassengerId', 'Survived']]
predfinal

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [86]:
predfinal.to_csv('xgboost-8 - v1.csv', index = False)