# Titanic Dataset - XGBoost

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# ReadIn the training data
titanic_train = pd.read_csv("titanic_train.csv")
print (titanic_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


## Variable Notes

>pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

>age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

>sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancÃ©s were ignored)

>parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [3]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Separate out the target/ label from the dataset

In [5]:
print(f'All column names: {titanic_train.columns}')
X_train = titanic_train.copy()
y_train = X_train.pop('Survived')
print(f'Training data column names: {X_train.columns}')
print(f'Training label: {y_train.name}')

All column names: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Training data column names: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Training label: Survived


## Helper Functions

In [6]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    # print (big_string)
    return np.nan

def replace_titles(x):
    title=x['salut']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

## Data Preprocessing - Training Data

In [7]:
# Split Name and extract the salutation

X_train['salut'] = X_train['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0].str.strip()
print("Unique values from salut - training dataset:\n", X_train['salut'].unique(), "\n")

print ("salut Before:")
print (X_train['salut'].value_counts(), "\n")

# X_train.drop(['firstname', 'last_name', 'lastname', 'lastname1'], axis=1, inplace=True)
print (X_train.columns, "\n")

X_train['salut']=X_train.apply(replace_titles, axis=1)
print ("salut After:")
print (X_train['salut'].value_counts())

Age_salut = pd.crosstab(X_train.Age, X_train.salut)
Age_salut.tail(10)

Unique values from salut - training dataset:
 ['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer'] 

salut Before:
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Ms                1
the Countess      1
Jonkheer          1
Sir               1
Capt              1
Don               1
Lady              1
Mme               1
Name: salut, dtype: int64 

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'salut'],
      dtype='object') 

salut After:
Mr        531
Miss      185
Mrs       135
Master     40
Name: salut, dtype: int64


salut,Master,Miss,Mr,Mrs
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
62.0,0,0,3,1
63.0,0,1,0,1
64.0,0,0,2,0
65.0,0,0,3,0
66.0,0,0,1,0
70.0,0,0,2,0
70.5,0,0,1,0
71.0,0,0,2,0
74.0,0,0,1,0
80.0,0,0,1,0


## Imputing data to fill in the missing values

In [8]:
# Imputing Age - We are using the 'salut' feature to group the respondent to impute the age
print ("Null values for Age before imputation: ", X_train['Age'].isnull().sum())
X_train['Age'] = X_train.groupby('salut').Age.transform(lambda x: x.fillna(x.mean()))
print ("Null values for Age after imputation: ", X_train['Age'].isnull().sum())

Null values for Age before imputation:  177
Null values for Age after imputation:  0


In [9]:
# Imputing Cabin - This cannot be imputed as there is no logic and hence we fill the NAs with 'Null' string
print("Null values for Cabin before imputation: ", X_train['Cabin'].isnull().sum())

print("Value Counts of Cabin - Before")
print (X_train['Cabin'].value_counts(dropna = False))

X_train['Cabin'] = X_train['Cabin'].fillna('Null')

print("Value Counts of Cabin - After")
print (X_train['Cabin'].value_counts(dropna = False))

Null values for Cabin before imputation:  687
Value Counts of Cabin - Before
NaN            687
G6               4
C23 C25 C27      4
B96 B98          4
F2               3
              ... 
A6               1
B78              1
B80              1
A16              1
A19              1
Name: Cabin, Length: 148, dtype: int64
Value Counts of Cabin - After
Null           687
B96 B98          4
C23 C25 C27      4
G6               4
F2               3
              ... 
C99              1
A6               1
B78              1
B80              1
A19              1
Name: Cabin, Length: 148, dtype: int64


In [10]:
# Imputing the whole dataset just in case there are any furhter missing values
X_train = X_train.fillna(method='ffill').fillna(method='bfill')
print("Null values after imputation: ")
print(X_train.isnull().sum())

Null values after imputation: 
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
salut          0
dtype: int64


## Feature Engineering

In [11]:
# Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Null']
X_train['Deck']=X_train['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

X_train['Deck'].value_counts()

Null    687
C        59
B        47
E        33
D        33
A        15
F        12
G         4
T         1
Name: Deck, dtype: int64

In [12]:
# Family Size and Fare per Passenger
X_train['FamilySize'] = X_train['SibSp'] + X_train['Parch'] + 1
# X_train['FarePerPassenger'] = X_train['Fare']/(X_train['FamilySize'])

print(X_train['FamilySize'].value_counts())
# print()
# print(X_train['FarePerPassenger'].value_counts())

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64


In [13]:
# X_train.to_csv("Titanic_Train_Processed.csv", index = False)

## Converting Categorical and String features into Numeric

In [14]:
# Drop features which are unique across respondents as they are not useful
X_train.drop(['Name', 'PassengerId'], axis=1, inplace=True)

# One Hot Encoding - To convert categorical to binary data
X_train_dummies = pd.get_dummies(X_train, columns=['Pclass', 'Sex', 'Cabin', 'Embarked', 'salut', 'Ticket', 'Deck'])

print ("Shape of training dataset after One Hot Encoding: ", X_train_dummies.shape)
print (X_train_dummies.head())

Shape of training dataset after One Hot Encoding:  (891, 855)
    Age  SibSp  Parch     Fare  FamilySize  Pclass_1  Pclass_2  Pclass_3  \
0  22.0      1      0   7.2500           2         0         0         1   
1  38.0      1      0  71.2833           2         1         0         0   
2  26.0      0      0   7.9250           1         0         0         1   
3  35.0      1      0  53.1000           2         1         0         0   
4  35.0      0      0   8.0500           1         0         0         1   

   Sex_female  Sex_male  ...  Ticket_WE/P 5735  Deck_A  Deck_B  Deck_C  \
0           0         1  ...                 0       0       0       0   
1           1         0  ...                 0       0       0       1   
2           1         0  ...                 0       0       0       0   
3           1         0  ...                 0       0       0       1   
4           0         1  ...                 0       0       0       0   

   Deck_D  Deck_E  Deck_F  Deck_G  D

## Test Data - ReadIn, Preprocess, Imputing, Feature Engineering and One Hot Encoding

In [15]:
# ReadIn the test data

titanic_test = pd.read_csv("titanic_test.csv")

In [16]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [17]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
titanic_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [19]:
X_test = titanic_test.copy()

# Split Name and extract the salutation
X_test['salut'] = X_test['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0].str.strip()
print("Unique values from salut - test dataset:\n", X_test['salut'].unique())

print ("salut Before:")
print (X_test['salut'].value_counts())

print (X_test.columns)

X_test['salut']=X_test.apply(replace_titles, axis=1)

print ("salut After:")
print (X_test['salut'].value_counts())

Age_salut_test = pd.crosstab(X_test.Age, X_test.salut)
print(Age_salut_test.head(6))
print(Age_salut_test.tail(6))

Unique values from salut - test dataset:
 ['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']
salut Before:
Mr        240
Miss       78
Mrs        72
Master     21
Rev         2
Col         2
Ms          1
Dr          1
Dona        1
Name: salut, dtype: int64
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'salut'],
      dtype='object')
salut After:
Mr        244
Miss       79
Mrs        74
Master     21
Name: salut, dtype: int64
salut  Master  Miss  Mr  Mrs
Age                         
0.17        0     1   0    0
0.33        1     0   0    0
0.75        1     0   0    0
0.83        1     0   0    0
0.92        0     1   0    0
1.00        0     3   0    0
salut  Master  Miss  Mr  Mrs
Age                         
61.0        0     0   2    0
62.0        0     0   1    0
63.0        0     0   1    1
64.0        0     0   1    2
67.0        0     0   1    0
76.0        0     0   0    1


In [20]:
# Imputing missing values - Test Data
print ("Null values for Age before imputation: ", X_test['Age'].isnull().sum())
X_test['Age'] = X_test.groupby('salut').Age.transform(lambda x: x.fillna(x.mean()))
print ("Null values for Age after imputation: ", X_test['Age'].isnull().sum(), "\n")

print ("Null values for Fare before imputation: ", X_test['Fare'].isnull().sum())
X_test['Fare'] = X_test.groupby('Pclass').Fare.transform(lambda x: x.fillna(x.median()))
print ("Null values for Fare after imputation: ", X_test['Fare'].isnull().sum(), "\n")

print("Null values for Cabin before imputation: ", X_train['Cabin'].isnull().sum())
X_test['Cabin'] = X_test['Cabin'].fillna('Null')
print("Null values for Cabin after imputation: ", X_train['Cabin'].isnull().sum(), "\n")

# Imputing the whole dataset just in case there are any furhter missing values
X_test = X_test.fillna(method='ffill').fillna(method='bfill')

print("Null values after imputation: ")
print(X_test.isnull().sum())

Null values for Age before imputation:  86
Null values for Age after imputation:  0 

Null values for Fare before imputation:  1
Null values for Fare after imputation:  0 

Null values for Cabin before imputation:  0
Null values for Cabin after imputation:  0 

Null values after imputation: 
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
salut          0
dtype: int64


In [21]:
## Feature Engineering
# Deck
X_test['Deck']=X_test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
print(X_test['Deck'].value_counts(), "\n")

# Family Size and Fare per Passenger
X_test['FamilySize'] = X_test['SibSp'] + X_test['Parch'] + 1
# X_test['FarePerPassenger'] = X_test['Fare']/(X_test['FamilySize'] + 1)

print(X_test['FamilySize'].value_counts(), "\n")
# print()
# print(X_test['FarePerPassenger'].value_counts(), "\n")

Null    327
C        35
B        18
D        13
E        11
A         7
F         6
G         1
Name: Deck, dtype: int64 

1     253
2      74
3      57
4      14
5       7
11      4
7       4
6       3
8       2
Name: FamilySize, dtype: int64 



In [22]:
## Converting Categorical and String features into Numeric

# Drop features which are unique across respondents as they are not useful
X_test.drop(['PassengerId'], axis=1, inplace=True)

# One Hot Encoding - To convert categorical to binary data
X_test_dummies = pd.get_dummies(X_test, columns=['Pclass', 'Sex', 'Cabin', 'Embarked', 'salut', 'Ticket', 'Deck'])
print ("Shape of test dataset after One Hot Encoding: ", X_test_dummies.shape)
print (X_test_dummies.head())

Shape of test dataset after One Hot Encoding:  (418, 466)
                                           Name   Age  SibSp  Parch     Fare  \
0                              Kelly, Mr. James  34.5      0      0   7.8292   
1              Wilkes, Mrs. James (Ellen Needs)  47.0      1      0   7.0000   
2                     Myles, Mr. Thomas Francis  62.0      0      0   9.6875   
3                              Wirz, Mr. Albert  27.0      0      0   8.6625   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  22.0      1      1  12.2875   

   FamilySize  Pclass_1  Pclass_2  Pclass_3  Sex_female  ...  \
0           1         0         0         1           0  ...   
1           2         0         0         1           1  ...   
2           1         0         1         0           0  ...   
3           1         0         0         1           0  ...   
4           3         0         0         1           1  ...   

   Ticket_W./C. 6608  Ticket_W.E.P. 5734  Deck_A  Deck_B  Deck_C  Deck_D  \


In [23]:
print ("Shape of training dataset after One Hot Encoding: ", X_train_dummies.shape)
print ("Shape of test dataset after One Hot Encoding: ", X_test_dummies.shape)

Shape of training dataset after One Hot Encoding:  (891, 855)
Shape of test dataset after One Hot Encoding:  (418, 466)


In [24]:
# Align the Train and Test datset for One Hot Encoding 
X_train_final, X_test_final = X_train_dummies.align(X_test_dummies, join='left', axis=1)
print (X_train_final.shape)
print (X_test_final.shape)

for col in (col for col in X_test_final.columns if X_test_final[col].isnull().any()):
    X_test_final[col] = 0

print(X_test_final.isnull().sum())

(891, 855)
(418, 855)
Age           0
SibSp         0
Parch         0
Fare          0
FamilySize    0
             ..
Deck_E        0
Deck_F        0
Deck_G        0
Deck_Null     0
Deck_T        0
Length: 855, dtype: int64


## Model Training

### Scaling the data

In [25]:
X_train_final_description = X_train_final.describe().T
X_test_final_description = X_test_final.describe().T

X_train_final_description.to_csv("X_train_final_description.csv")
X_test_final_description.to_csv("X_test_final_description.csv")

In [26]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
X_train_standard_scaled = standard_scaler.fit_transform(X_train_final)
# X_train_standard_scaled.mean(axis=0)
X_test_standard_scaled = standard_scaler.fit_transform(X_test_final)

In [27]:
# split the data into train and evaluation data
from sklearn.model_selection import train_test_split

# X, val_X, y, val_y = train_test_split(X_train_final, y_train, train_size=0.7, test_size=0.3, random_state=123, stratify=y_train)

# Applying scaled data
X, val_X, y, val_y = train_test_split(X_train_standard_scaled, y_train, train_size=0.7, test_size=0.3, random_state=123, stratify=y_train)

print (X.shape)
print (val_X.shape)
print('All:', np.bincount(y_train) / float(len(y_train)) * 100.0)
print('Training:', np.bincount(y) / float(len(y)) * 100.0)
print('Test:', np.bincount(val_y) / float(len(val_y)) * 100.0)

(623, 855)
(268, 855)
All: [61.61616162 38.38383838]
Training: [61.63723917 38.36276083]
Test: [61.56716418 38.43283582]


### XGBoost

In [70]:
import xgboost as xgb
from xgboost import XGBClassifier
print(xgb.__version__)
from sklearn.preprocessing import Binarizer

1.0.2


In [50]:
model_XGB_sklearn = XGBClassifier(n_estimators=400, learning_rate=0.05, early_stopping_rounds=5, max_depth=3, min_child_weight=1, 
                          gamma=1, subsample=1, colsample_bytree=1, booster='gbtree', random_state=41)
model_XGB_sklearn.fit(X, y)

print("Model Parameters:")
print(model_XGB_sklearn.get_params(), "\n")

predictions = model_XGB_sklearn.predict(val_X)
predict_proba = model_XGB_sklearn.predict_proba(val_X)

print ("Count for validation data - actual: ", np.bincount(val_y))
print ("Count for validation data - prediction: ", np.bincount(predictions), "\n")

Model Parameters:
{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 1, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 400, 'n_jobs': 0, 'num_parallel_tree': 1, 'random_state': 41, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': None, 'validate_parameters': False, 'verbosity': None, 'early_stopping_rounds': 5} 

Count for validation data - actual:  [165 103]
Count for validation data - prediction:  [177  91] 



In [74]:
params = {
#         'n_estimators': 400,
        'learning_rate': 0.05,
#         'early_stopping_rounds': 5,
        'max_depth': 3,
        'min_child_weight': 1,
        'gamma': 1,
        'subsample': 1,
        'colsample_bytree': 1,
        'random_state': 41
        }
steps = 10

D_train = xgb.DMatrix(X, label=y)
D_test = xgb.DMatrix(val_X, label=val_y)

model_XGB = xgb.train(params, D_train, steps)

print("Model Parameters:")
# print(model_XGB.get_params(), "\n")

predictions = model_XGB.predict(D_test)
# print(predictions.shape)
# print(predictions.dtype)
print(predictions[0])
best_preds = np.asarray([np.argmax(line) for line in predictions])
print(best_preds)

# predict_proba = model_XGB.predict_proba(D_test)

print ("Count for validation data - actual: ", np.bincount(val_y))
print ("Count for validation data - prediction: ", np.bincount(best_preds), "\n")

Model Parameters:
0.33819258
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0]
Count for validation data - actual:  [165 103]
Count for validation data - prediction:  [268] 



### Model Evaluation

In [30]:
# Score
print ("Score for training data - XGBoost: ", model_XGB.score(X, y))
score_val_dataset = model_XGB.score(val_X, val_y)
print ("Score for validation data - XGBoost: ", score_val_dataset, "\n")

print("Predictions: ", predictions[0:6], "\n")
print("Prediction Probabilities:\n", predict_proba[0:6])

Score for training data - XGBoost:  0.9181380417335474
Score for validation data - XGBoost:  0.832089552238806 

Predictions:  [0 1 0 0 0 0] 

Prediction Probabilities:
 [[0.85375005 0.14624994]
 [0.00858903 0.991411  ]
 [0.86770856 0.13229144]
 [0.91932213 0.08067787]
 [0.98871756 0.01128246]
 [0.7196296  0.2803704 ]]


In [31]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, classification_report

print ("f1_score for validation data - XGBoost: ", f1_score(val_y, model_XGB.predict(val_X)))
print ("f1_score (average=None) for validation data - XGBoost: ", f1_score(val_y, model_XGB.predict(val_X), average=None), "\n")

print ("precision_recall_fscore for validation data - XGBoost: ", precision_recall_fscore_support(val_y, model_XGB.predict(val_X)), "\n")

print("Accuracy for model XGBoost: %.2f" % (accuracy_score(val_y, model_XGB.predict(val_X)) * 100))

print("Classification Report - Training Data")
print(classification_report(y, model_XGB.predict(X)))

print("Classification Report - Validation Data")
print(classification_report(val_y, model_XGB.predict(val_X)))

f1_score for validation data - XGBoost:  0.7715736040609138
f1_score (average=None) for validation data - XGBoost:  [0.86725664 0.7715736 ] 

precision_recall_fscore for validation data - XGBoost:  (array([0.84482759, 0.80851064]), array([0.89090909, 0.73786408]), array([0.86725664, 0.7715736 ]), array([165, 103], dtype=int64)) 

Accuracy for model XGBoost: 83.21
Classification Report - Training Data
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       384
           1       0.92      0.86      0.89       239

    accuracy                           0.92       623
   macro avg       0.92      0.91      0.91       623
weighted avg       0.92      0.92      0.92       623

Classification Report - Validation Data
              precision    recall  f1-score   support

           0       0.84      0.89      0.87       165
           1       0.81      0.74      0.77       103

    accuracy                           0.83       268
   macro av

### Model Selection

In [None]:
# Applying cross validation on the enire training data
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, ShuffleSplit

# scores = cross_val_score(model_logistic_regression, X_train_final, y_train)
scores = cross_val_score(model_XGB, X_train_standard_scaled, y_train)
print("\nScore for training data with default CV: ", scores)
print(np.mean(scores))

scores = cross_val_score(model_XGB, X_train_standard_scaled, y_train, cv=3)
print("\nScore for training data with CV=3: ", scores)
print(np.mean(scores))

In [None]:
# Applying cross validation by splitting the training data into training data and validation data
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, ShuffleSplit

scores = cross_val_score(model_XGB, X, y)
print("Score for training data with default CV: ", scores)
print("Mean Score: ", np.mean(scores), "\n")

scores = cross_val_score(model_XGB, X, y, cv=3)
print("Score for training data with CV=3: ", scores)
print("Mean Score: ", np.mean(scores), "\n")

scores = cross_val_score(model_XGB, val_X, val_y, cv=3)
print("Score for validation data with CV=3: ", scores)
print("Mean Score: ", np.mean(scores))

In [None]:
cv = StratifiedKFold(n_splits=4)
scores = cross_val_score(model_XGB, val_X, val_y, cv=cv)
print("Score for validation data with StartifiedKFold, CV=5: ", scores)
print("Mean Score: ", np.mean(scores))

### GridSearch

In [None]:
%%time

from sklearn.model_selection import GridSearchCV

param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# n_estimators=400, learning_rate=0.05, early_stopping_rounds=5, max_depth=3, random_state=41
model_XGB_grid = GridSearchCV(XGBClassifier(n_estimators=400, learning_rate=0.05, early_stopping_rounds=5, random_state=41), param_grid=param_grid, cv=cv, verbose=3, n_jobs=-1)
model_XGB_grid.fit(X, y)

results_grid = model_XGB_grid.cv_results_

print ("Parameters: ", model_XGB_grid.get_params)

print("\nGridSearchCV best score - XGBoost: ", model_XGB_grid.best_score_)
print("\nGridSearchCV best params - XGBoost: ", model_XGB_grid.best_params_)
print("\nGridSearchCV best estimator - XGBoost: ", model_XGB_grid.best_estimator_)

print ("\nGridSearchCV Score for validation data - XGBoost: ", model_XGB_grid.score(val_X, val_y))

In [None]:
results_grid

In [None]:
print("Model Parameters:")
print(model_logistic_regression_grid.get_params(), "\n")

predictions_grid = model_logistic_regression_grid.predict(val_X)
predict_proba_grid = model_logistic_regression_grid.predict_proba(val_X)

print ("Count for validation data - actual: ", np.bincount(val_y))
print ("Count for validation data - prediction-grid: ", np.bincount(predictions_grid), "\n")

In [None]:
# Score
print ("Score for training data - Logistic Regression: ", model_logistic_regression_grid.score(X, y))
score_grid_val_dataset = model_logistic_regression_grid.score(val_X, val_y)
print ("Score for validation data - Logistic Regression: ", score_grid_val_dataset, "\n")

print("Predictions grid: ", predictions_grid[0:6], "\n")
print("Prediction Probabilities grid:\n", predict_proba_grid[0:6])

### GridSearch with Scoring

In [None]:
scoring = {'Accuracy': 'accuracy', 'F1 Score': 'f1'}
model_logistic_regression_grid_with_scoring = GridSearchCV(LogisticRegression(max_iter=200), param_grid=param_grid, cv=cv, verbose=3, scoring = scoring, refit=False)
model_logistic_regression_grid_with_scoring.fit(X, y)
results_grid_with_scoring = model_logistic_regression_grid_with_scoring.cv_results_

In [None]:
# sorted(sklearn.metrics.SCORERS.keys())
results_grid_with_scoring = model_logistic_regression_grid_with_scoring.cv_results_

In [None]:
results_grid_with_scoring