- Evaluate your model's performance with cross validation and using different metrics.

In [108]:
import numpy as np
import pandas as pd

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [110]:
titanic = pd.read_csv("train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [115]:
titanic['is_male'] = pd.get_dummies(titanic['Sex'] , drop_first = True)
titanic.drop(titanic[titanic['Age'].isnull()].index , axis=0 , inplace = True)

In [116]:
X = titanic[['is_male' , 'Age' , 'Fare', 'Pclass']]
Y = titanic['Survived']
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size = 0.2 , random_state = 456)

In [117]:
lr = LogisticRegression()
lr.fit(X_train , Y_train)
print("train accuracy : {}".format(lr.score(X_train , Y_train)))
print("test accuracy  : {}".format(lr.score(X_test , Y_test)))

train accuracy : 0.7968476357267951
test accuracy  : 0.7762237762237763


# CROSS VALIDATION

In [118]:
from sklearn.model_selection import KFold

In [119]:
score_list = []
kf = KFold(n_splits = 5 , shuffle = True , random_state = 1111)
model = LogisticRegression()
pieces = kf.split(X)

In [120]:
for i , (train_index , test_index) in enumerate(pieces):
    X_train , Y_train = X.iloc[train_index] , Y.iloc[train_index]
    X_test , Y_test = X.iloc[test_index] , Y.iloc[test_index]
    
    model.fit(X_train , Y_train)
    predict = model.predict(X_test)
    
    score = model.score(X_test , Y_test)
    score_list.append(score)

In [121]:
score_list

[0.7902097902097902,
 0.7762237762237763,
 0.8251748251748252,
 0.7902097902097902,
 0.8028169014084507]

In [122]:
print("Mean score : {}".format(np.mean(score_list)))

Mean score : 0.7969270166453265


In [123]:
from sklearn.model_selection import cross_validate , cross_val_score

In [124]:
log_reg = LogisticRegression()
cv = cross_validate(estimator = log_reg ,
                    X = X ,
                    y = Y ,
                    cv = 10 , 
                    return_train_score = True ,
                    scoring = ['accuracy' , 'precision' , 'recall'])

In [125]:
print("Train set mean accuracy  : {}".format(np.mean(cv['train_accuracy'])))
print("Train set mean precision : {}".format(np.mean(cv['train_precision'])))
print("Train set mean recall    : {}".format(np.mean(cv['train_recall'])))

Train set mean accuracy  : 0.7959852812216877
Train set mean precision : 0.7628911676288443
Train set mean recall    : 0.7222222222222222


In [126]:
print("Test set mean accuracy   : {}".format(np.mean(cv['test_accuracy'])))
print("Test set mean precision  : {}".format(np.mean(cv['test_precision'])))
print("Test set mean recall     : {}".format(np.mean(cv['test_recall'])))

Test set mean accuracy   : 0.7898474178403755
Test set mean precision  : 0.7646483587389784
Test set mean recall     : 0.706896551724138


- Determine the model with the most appropriate parameters by hyperparameter tuning.

# HYPERPARAMETER TUNING

In [127]:
log_reg = LogisticRegression()
log_reg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [128]:
from sklearn.model_selection import GridSearchCV

In [129]:
parameters = {'C' : [10 ** x for x in range(-5,5)] ,
              'multi_class' : ['ovr' , 'multinomial'] ,
              'penalty' : ['l1', 'l2'] , 
              'solver' : ['lbfgs', 'liblinear']}

In [130]:
grid_cv = GridSearchCV(estimator = log_reg ,
                       param_grid = parameters , 
                       cv = 10)
grid_cv.fit(X,Y)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000, 10000],
                         'multi_class': ['ovr', 'multinomial'],
                         'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'liblinear']})

In [131]:
print("Best parameters : {}".format(grid_cv.best_params_))
print("Best score      : {}".format(grid_cv.best_score_))

Best parameters : {'C': 0.1, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}
Best score      : 0.7955399061032864
