# Titanic Dataset - Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# ReadIn the training data
titanic_train = pd.read_csv("titanic_train.csv")
print (titanic_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


## Variable Notes

>pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

>age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

>sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancÃ©s were ignored)

>parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [3]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Separate out the target/ label from the dataset

In [5]:
print(f'All column names: {titanic_train.columns}')
X_train = titanic_train.copy()
y_train = X_train.pop('Survived')
print(f'Training data column names: {X_train.columns}')
print(f'Training label: {y_train.name}')

All column names: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Training data column names: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Training label: Survived


## Helper Functions

In [6]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    # print (big_string)
    return np.nan

def replace_titles(x):
    title=x['salut']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

## Data Preprocessing - Training Data

In [7]:
# Split Name and extract the salutation

X_train['salut'] = X_train['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0].str.strip()
print("Unique values from salut - training dataset:\n", X_train['salut'].unique(), "\n")

print ("salut Before:")
print (X_train['salut'].value_counts(), "\n")

# X_train.drop(['firstname', 'last_name', 'lastname', 'lastname1'], axis=1, inplace=True)
print (X_train.columns, "\n")

X_train['salut']=X_train.apply(replace_titles, axis=1)
print ("salut After:")
print (X_train['salut'].value_counts())

Age_salut = pd.crosstab(X_train.Age, X_train.salut)
Age_salut.tail(10)

Unique values from salut - training dataset:
 ['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer'] 

salut Before:
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Sir               1
Ms                1
Don               1
the Countess      1
Lady              1
Jonkheer          1
Capt              1
Mme               1
Name: salut, dtype: int64 

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'salut'],
      dtype='object') 

salut After:
Mr        531
Miss      185
Mrs       135
Master     40
Name: salut, dtype: int64


salut,Master,Miss,Mr,Mrs
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
62.0,0,0,3,1
63.0,0,1,0,1
64.0,0,0,2,0
65.0,0,0,3,0
66.0,0,0,1,0
70.0,0,0,2,0
70.5,0,0,1,0
71.0,0,0,2,0
74.0,0,0,1,0
80.0,0,0,1,0


## Imputing data to fill in the missing values

In [8]:
# Imputing Age - We are using the 'salut' feature to group the respondent to impute the age
print ("Null values for Age before imputation: ", X_train['Age'].isnull().sum())
X_train['Age'] = X_train.groupby('salut').Age.transform(lambda x: x.fillna(x.mean()))
print ("Null values for Age after imputation: ", X_train['Age'].isnull().sum())

Null values for Age before imputation:  177
Null values for Age after imputation:  0


In [9]:
# Imputing Cabin - This cannot be imputed as there is no logic and hence we fill the NAs with 'Null' string
print("Null values for Cabin before imputation: ", X_train['Cabin'].isnull().sum())

print("Value Counts of Cabin - Before")
print (X_train['Cabin'].value_counts(dropna = False))

X_train['Cabin'] = X_train['Cabin'].fillna('Null')

print("Value Counts of Cabin - After")
print (X_train['Cabin'].value_counts(dropna = False))

Null values for Cabin before imputation:  687
Value Counts of Cabin - Before
NaN            687
B96 B98          4
G6               4
C23 C25 C27      4
C22 C26          3
              ... 
E77              1
B71              1
D47              1
C91              1
E63              1
Name: Cabin, Length: 148, dtype: int64
Value Counts of Cabin - After
Null           687
B96 B98          4
C23 C25 C27      4
G6               4
D                3
              ... 
C148             1
D28              1
C70              1
E10              1
E49              1
Name: Cabin, Length: 148, dtype: int64


In [10]:
# Imputing the whole dataset just in case there are any furhter missing values
X_train = X_train.fillna(method='ffill').fillna(method='bfill')
print("Null values after imputation: ")
print(X_train.isnull().sum())

Null values after imputation: 
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
salut          0
dtype: int64


## Feature Engineering

In [11]:
# Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Null']
X_train['Deck']=X_train['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

X_train['Deck'].value_counts()

Null    687
C        59
B        47
D        33
E        33
A        15
F        12
G         4
T         1
Name: Deck, dtype: int64

In [12]:
# Family Size and Fare per Passenger
X_train['FamilySize'] = X_train['SibSp'] + X_train['Parch'] + 1
# X_train['FarePerPassenger'] = X_train['Fare']/(X_train['FamilySize'])

print(X_train['FamilySize'].value_counts())
# print()
# print(X_train['FarePerPassenger'].value_counts())

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64


In [13]:
# X_train.to_csv("Titanic_Train_Processed.csv", index = False)

## Converting Categorical and String features into Numeric

In [14]:
# Drop features which are unique across respondents as they are not useful
X_train.drop(['Name', 'PassengerId'], axis=1, inplace=True)

# One Hot Encoding - To convert categorical to binary data
X_train_dummies = pd.get_dummies(X_train, columns=['Pclass', 'Sex', 'Cabin', 'Embarked', 'salut', 'Ticket', 'Deck'])

print ("Shape of training dataset after One Hot Encoding: ", X_train_dummies.shape)
print (X_train_dummies.head())

Shape of training dataset after One Hot Encoding:  (891, 855)
    Age  SibSp  Parch     Fare  FamilySize  Pclass_1  Pclass_2  Pclass_3  \
0  22.0      1      0   7.2500           2         0         0         1   
1  38.0      1      0  71.2833           2         1         0         0   
2  26.0      0      0   7.9250           1         0         0         1   
3  35.0      1      0  53.1000           2         1         0         0   
4  35.0      0      0   8.0500           1         0         0         1   

   Sex_female  Sex_male  ...  Ticket_WE/P 5735  Deck_A  Deck_B  Deck_C  \
0           0         1  ...                 0       0       0       0   
1           1         0  ...                 0       0       0       1   
2           1         0  ...                 0       0       0       0   
3           1         0  ...                 0       0       0       1   
4           0         1  ...                 0       0       0       0   

   Deck_D  Deck_E  Deck_F  Deck_G  D

## Test Data - ReadIn, Preprocess, Imputing, Feature Engineering and One Hot Encoding

In [15]:
# ReadIn the test data

titanic_test = pd.read_csv("titanic_test.csv")

In [16]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [17]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
titanic_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [19]:
X_test = titanic_test.copy()

# Split Name and extract the salutation
X_test['salut'] = X_test['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0].str.strip()
print("Unique values from salut - test dataset:\n", X_test['salut'].unique())

print ("salut Before:")
print (X_test['salut'].value_counts())

print (X_test.columns)

X_test['salut']=X_test.apply(replace_titles, axis=1)

print ("salut After:")
print (X_test['salut'].value_counts())

Age_salut_test = pd.crosstab(X_test.Age, X_test.salut)
print(Age_salut_test.head(6))
print(Age_salut_test.tail(6))

Unique values from salut - test dataset:
 ['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']
salut Before:
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Ms          1
Dr          1
Name: salut, dtype: int64
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'salut'],
      dtype='object')
salut After:
Mr        244
Miss       79
Mrs        74
Master     21
Name: salut, dtype: int64
salut  Master  Miss  Mr  Mrs
Age                         
0.17        0     1   0    0
0.33        1     0   0    0
0.75        1     0   0    0
0.83        1     0   0    0
0.92        0     1   0    0
1.00        0     3   0    0
salut  Master  Miss  Mr  Mrs
Age                         
61.0        0     0   2    0
62.0        0     0   1    0
63.0        0     0   1    1
64.0        0     0   1    2
67.0        0     0   1    0
76.0        0     0   0    1


In [20]:
# Imputing missing values - Test Data
print ("Null values for Age before imputation: ", X_test['Age'].isnull().sum())
X_test['Age'] = X_test.groupby('salut').Age.transform(lambda x: x.fillna(x.mean()))
print ("Null values for Age after imputation: ", X_test['Age'].isnull().sum(), "\n")

print ("Null values for Fare before imputation: ", X_test['Fare'].isnull().sum())
X_test['Fare'] = X_test.groupby('Pclass').Fare.transform(lambda x: x.fillna(x.median()))
print ("Null values for Fare after imputation: ", X_test['Fare'].isnull().sum(), "\n")

print("Null values for Cabin before imputation: ", X_train['Cabin'].isnull().sum())
X_test['Cabin'] = X_test['Cabin'].fillna('Null')
print("Null values for Cabin after imputation: ", X_train['Cabin'].isnull().sum(), "\n")

# Imputing the whole dataset just in case there are any furhter missing values
X_test = X_test.fillna(method='ffill').fillna(method='bfill')

print("Null values after imputation: ")
print(X_test.isnull().sum())

Null values for Age before imputation:  86
Null values for Age after imputation:  0 

Null values for Fare before imputation:  1
Null values for Fare after imputation:  0 

Null values for Cabin before imputation:  0
Null values for Cabin after imputation:  0 

Null values after imputation: 
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
salut          0
dtype: int64


In [21]:
## Feature Engineering
# Deck
X_test['Deck']=X_test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
print(X_test['Deck'].value_counts(), "\n")

# Family Size and Fare per Passenger
X_test['FamilySize'] = X_test['SibSp'] + X_test['Parch'] + 1
# X_test['FarePerPassenger'] = X_test['Fare']/(X_test['FamilySize'] + 1)

print(X_test['FamilySize'].value_counts(), "\n")
# print()
# print(X_test['FarePerPassenger'].value_counts(), "\n")

Null    327
C        35
B        18
D        13
E        11
A         7
F         6
G         1
Name: Deck, dtype: int64 

1     253
2      74
3      57
4      14
5       7
11      4
7       4
6       3
8       2
Name: FamilySize, dtype: int64 



In [22]:
## Converting Categorical and String features into Numeric

# Drop features which are unique across respondents as they are not useful
X_test.drop(['PassengerId'], axis=1, inplace=True)

# One Hot Encoding - To convert categorical to binary data
X_test_dummies = pd.get_dummies(X_test, columns=['Pclass', 'Sex', 'Cabin', 'Embarked', 'salut', 'Ticket', 'Deck'])
print ("Shape of test dataset after One Hot Encoding: ", X_test_dummies.shape)
print (X_test_dummies.head())

Shape of test dataset after One Hot Encoding:  (418, 466)
                                           Name   Age  SibSp  Parch     Fare  \
0                              Kelly, Mr. James  34.5      0      0   7.8292   
1              Wilkes, Mrs. James (Ellen Needs)  47.0      1      0   7.0000   
2                     Myles, Mr. Thomas Francis  62.0      0      0   9.6875   
3                              Wirz, Mr. Albert  27.0      0      0   8.6625   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  22.0      1      1  12.2875   

   FamilySize  Pclass_1  Pclass_2  Pclass_3  Sex_female  ...  \
0           1         0         0         1           0  ...   
1           2         0         0         1           1  ...   
2           1         0         1         0           0  ...   
3           1         0         0         1           0  ...   
4           3         0         0         1           1  ...   

   Ticket_W./C. 6608  Ticket_W.E.P. 5734  Deck_A  Deck_B  Deck_C  Deck_D  \


In [23]:
print ("Shape of training dataset after One Hot Encoding: ", X_train_dummies.shape)
print ("Shape of test dataset after One Hot Encoding: ", X_test_dummies.shape)

Shape of training dataset after One Hot Encoding:  (891, 855)
Shape of test dataset after One Hot Encoding:  (418, 466)


In [24]:
# Align the Train and Test datset for One Hot Encoding 
X_train_final, X_test_final = X_train_dummies.align(X_test_dummies, join='left', axis=1)
print (X_train_final.shape)
print (X_test_final.shape)

for col in (col for col in X_test_final.columns if X_test_final[col].isnull().any()):
    X_test_final[col] = 0

print(X_test_final.isnull().sum())

(891, 855)
(418, 855)
Age           0
SibSp         0
Parch         0
Fare          0
FamilySize    0
             ..
Deck_E        0
Deck_F        0
Deck_G        0
Deck_Null     0
Deck_T        0
Length: 855, dtype: int64


## Model Training

### Scaling the data

In [25]:
X_train_final_description = X_train_final.describe().T
X_test_final_description = X_test_final.describe().T

X_train_final_description.to_csv("X_train_final_description.csv")
X_test_final_description.to_csv("X_test_final_description.csv")

In [26]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
X_train_standard_scaled = standard_scaler.fit_transform(X_train_final)
# X_train_standard_scaled.mean(axis=0)
X_test_standard_scaled = standard_scaler.fit_transform(X_test_final)

In [27]:
# split the data into train and evaluation data
from sklearn.model_selection import train_test_split

# X, val_X, y, val_y = train_test_split(X_train_final, y_train, train_size=0.7, test_size=0.3, random_state=123, stratify=y_train)

# Applying scaled data
X, val_X, y, val_y = train_test_split(X_train_standard_scaled, y_train, train_size=0.7, test_size=0.3, random_state=123, stratify=y_train)

print (X.shape)
print (val_X.shape)
print('All:', np.bincount(y_train) / float(len(y_train)) * 100.0)
print('Training:', np.bincount(y) / float(len(y)) * 100.0)
print('Test:', np.bincount(val_y) / float(len(val_y)) * 100.0)

(623, 855)
(268, 855)
All: [61.61616162 38.38383838]
Training: [61.63723917 38.36276083]
Test: [61.56716418 38.43283582]


### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

model_logistic_regression = LogisticRegression()
model_logistic_regression.fit(X, y)

print("Model Parameters:")
print(model_logistic_regression.get_params(), "\n")

predictions = model_logistic_regression.predict(val_X)
predict_proba = model_logistic_regression.predict_proba(val_X)

print ("Count for validation data - actual: ", np.bincount(val_y))
print ("Count for validation data - prediction: ", np.bincount(predictions), "\n")

Model Parameters:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} 

Count for validation data - actual:  [165 103]
Count for validation data - prediction:  [179  89] 



### Model Evaluation

In [29]:
# Score
print ("Score for training data - Logistic Regression: ", model_logistic_regression.score(X, y))
score_val_dataset = model_logistic_regression.score(val_X, val_y)
print ("Score for validation data - Logistic Regression: ", score_val_dataset, "\n")

print("Predictions: ", predictions[0:6], "\n")
print("Prediction Probabilities:\n", predict_proba[0:6])

Score for training data - Logistic Regression:  0.9951845906902087
Score for validation data - Logistic Regression:  0.835820895522388 

Predictions:  [0 1 0 0 0 1] 

Prediction Probabilities:
 [[0.9809391  0.0190609 ]
 [0.00633401 0.99366599]
 [0.9860503  0.0139497 ]
 [0.99454257 0.00545743]
 [0.99868178 0.00131822]
 [0.24443106 0.75556894]]


In [30]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score

print ("f1_score for validation data - Logistic Regression: ", f1_score(val_y, model_logistic_regression.predict(val_X)))
print ("f1_score (average=None) for validation data - Logistic Regression: ", f1_score(val_y, model_logistic_regression.predict(val_X), average=None), "\n")

print ("precision_recall_fscore for validation data - Logistic Regression: ", precision_recall_fscore_support(val_y, model_logistic_regression.predict(val_X)), "\n")

print("Accuracy for model Logistic Regression: %.2f" % (accuracy_score(val_y, model_logistic_regression.predict(val_X)) * 100))

f1_score for validation data - Logistic Regression:  0.7708333333333335
f1_score (average=None) for validation data - Logistic Regression:  [0.87209302 0.77083333] 

precision_recall_fscore for validation data - Logistic Regression:  (array([0.83798883, 0.83146067]), array([0.90909091, 0.7184466 ]), array([0.87209302, 0.77083333]), array([165, 103], dtype=int64)) 

Accuracy for model Logistic Regression: 83.58


### Model Selection

In [31]:
# Applying cross validation on the enire training data
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, ShuffleSplit

# scores = cross_val_score(model_logistic_regression, X_train_final, y_train)
scores = cross_val_score(model_logistic_regression, X_train_standard_scaled, y_train)
print("\nScore for training data with default CV: ", scores)
print(np.mean(scores))

scores = cross_val_score(model_logistic_regression, X_train_standard_scaled, y_train, cv=3)
print("\nScore for training data with CV=3: ", scores)
print(np.mean(scores))


Score for training data with default CV:  [0.86592179 0.8258427  0.83707865 0.84831461 0.87078652]
0.849588851923922

Score for training data with CV=3:  [0.82828283 0.84175084 0.85185185]
0.8406285072951739


In [32]:
# Applying cross validation by splitting the training data into training data and validation data
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, ShuffleSplit

scores = cross_val_score(model_logistic_regression, X, y)
print("Score for training data with default CV: ", scores)
print("Mean Score: ", np.mean(scores), "\n")

scores = cross_val_score(model_logistic_regression, X, y, cv=3)
print("Score for training data with CV=3: ", scores)
print("Mean Score: ", np.mean(scores), "\n")

scores = cross_val_score(model_logistic_regression, val_X, val_y, cv=3)
print("Score for validation data with CV=3: ", scores)
print("Mean Score: ", np.mean(scores))

Score for training data with default CV:  [0.872      0.864      0.8        0.83064516 0.84677419]
Mean Score:  0.8426838709677419 

Score for training data with CV=3:  [0.86057692 0.82692308 0.83091787]
Mean Score:  0.8394726247987118 

Score for validation data with CV=3:  [0.74444444 0.7752809  0.74157303]
Mean Score:  0.753766125676238


In [33]:
cv = StratifiedKFold(n_splits=4)
scores = cross_val_score(model_logistic_regression, val_X, val_y, cv=cv)
print("Score for validation data with StartifiedKFold, CV=5: ", scores)
print("Mean Score: ", np.mean(scores))

Score for validation data with StartifiedKFold, CV=5:  [0.7761194  0.7761194  0.76119403 0.71641791]
Mean Score:  0.7574626865671642


### GridSearch

In [34]:
from sklearn.model_selection import GridSearchCV
# param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

model_logistic_regression_grid = GridSearchCV(LogisticRegression(max_iter=200), param_grid=param_grid, cv=cv, verbose=3)
model_logistic_regression_grid.fit(X, y)

results_grid = model_logistic_regression_grid.cv_results_

print ("Parameters: ", model_logistic_regression_grid.get_params)

print("\nGridSearchCV best score - Logistic Regression: ", model_logistic_regression_grid.best_score_)
print("\nGridSearchCV best params - Logistic Regression: ", model_logistic_regression_grid.best_params_)
print("\nGridSearchCV best estimator - Logistic Regression: ", model_logistic_regression_grid.best_estimator_)

print ("\nGridSearchCV Score for validation data - Logistic Regression: ", model_logistic_regression_grid.score(val_X, val_y))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] C=0.001 .........................................................
[CV] ............................. C=0.001, score=0.718, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ............................. C=0.001, score=0.712, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ............................. C=0.001, score=0.699, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ............................. C=0.001, score=0.742, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] .............................. C=0.01, score=0.846, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] .............................. C=0.01, score=0.840, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.865, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.814, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.840, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.832, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.859, total=   0.1s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.827, total=   0.1s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.840, total=   0.1s
[CV] 

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.1s finished


In [35]:
results_grid

{'mean_fit_time': array([0.01749367, 0.01774353, 0.04297793, 0.06271452, 0.14492041]),
 'std_fit_time': array([0.0041497 , 0.00147642, 0.00463353, 0.00395804, 0.03478024]),
 'mean_score_time': array([0.00125062, 0.00074977, 0.00074917, 0.00124776, 0.00124878]),
 'std_score_time': array([0.00043519, 0.00043288, 0.00043253, 0.0008278 , 0.00043295]),
 'param_C': masked_array(data=[0.001, 0.01, 0.1, 1, 10],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.001}, {'C': 0.01}, {'C': 0.1}, {'C': 1}, {'C': 10}],
 'split0_test_score': array([0.71794872, 0.84615385, 0.86538462, 0.85897436, 0.86538462]),
 'split1_test_score': array([0.71153846, 0.83974359, 0.81410256, 0.82692308, 0.82051282]),
 'split2_test_score': array([0.69871795, 0.85897436, 0.83974359, 0.83974359, 0.83333333]),
 'split3_test_score': array([0.74193548, 0.81935484, 0.83225806, 0.83225806, 0.83225806]),
 'mean_test_score': array([0.71753515, 0.8410566

In [36]:
print("Model Parameters:")
print(model_logistic_regression_grid.get_params(), "\n")

predictions_grid = model_logistic_regression_grid.predict(val_X)
predict_proba_grid = model_logistic_regression_grid.predict_proba(val_X)

print ("Count for validation data - actual: ", np.bincount(val_y))
print ("Count for validation data - prediction-grid: ", np.bincount(predictions_grid), "\n")

Model Parameters:
{'cv': StratifiedKFold(n_splits=4, random_state=None, shuffle=False), 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 200, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), 'iid': 'deprecated', 'n_jobs': None, 'param_grid': {'C': [0.001, 0.01, 0.1, 1, 10]}, 'pre_dispatch': '2*n_jobs'

In [37]:
# Score
print ("Score for training data - Logistic Regression: ", model_logistic_regression_grid.score(X, y))
score_grid_val_dataset = model_logistic_regression_grid.score(val_X, val_y)
print ("Score for validation data - Logistic Regression: ", score_grid_val_dataset, "\n")

print("Predictions grid: ", predictions_grid[0:6], "\n")
print("Prediction Probabilities grid:\n", predict_proba_grid[0:6])

Score for training data - Logistic Regression:  0.9935794542536116
Score for validation data - Logistic Regression:  0.835820895522388 

Predictions grid:  [0 1 0 0 0 1] 

Prediction Probabilities grid:
 [[0.80778833 0.19221167]
 [0.19279573 0.80720427]
 [0.81923486 0.18076514]
 [0.87460092 0.12539908]
 [0.90283192 0.09716808]
 [0.4675212  0.5324788 ]]


### GridSearch with Scoring

In [38]:
scoring = {'Accuracy': 'accuracy', 'F1 Score': 'f1'}
model_logistic_regression_grid_with_scoring = GridSearchCV(LogisticRegression(max_iter=200), param_grid=param_grid, cv=cv, verbose=3, scoring = scoring, refit=False)
model_logistic_regression_grid_with_scoring.fit(X, y)
results_grid1 = model_logistic_regression_grid_with_scoring.cv_results_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] C=0.001 .........................................................
[CV] .......... C=0.001, Accuracy=0.718, F1 Score=0.476, total=   0.0s
[CV] C=0.001 .........................................................
[CV] .......... C=0.001, Accuracy=0.712, F1 Score=0.400, total=   0.0s
[CV] C=0.001 .........................................................
[CV] .......... C=0.001, Accuracy=0.699, F1 Score=0.373, total=   0.0s
[CV] C=0.001 .........................................................
[CV] .......... C=0.001, Accuracy=0.742, F1 Score=0.500, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ........... C=0.01, Accuracy=0.846, F1 Score=0.782, total=   0.0s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ........... C=0.01, Accuracy=0.840, F1 Score=0.766, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ........... C=0.01, Accuracy=0.859, F1 Score=0.800, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ........... C=0.01, Accuracy=0.819, F1 Score=0.750, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............ C=0.1, Accuracy=0.865, F1 Score=0.826, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............ C=0.1, Accuracy=0.814, F1 Score=0.729, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............ C=0.1, Accuracy=0.840, F1 Score=0.790, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............ C=0.1, Accuracy=0.832, F1 Score=0.783, total=   0.0s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.3s finished


In [39]:
# sorted(sklearn.metrics.SCORERS.keys())
results_grid_with_scoring = model_logistic_regression_grid_with_scoring.cv_results_

In [40]:
results_grid_with_scoring

{'mean_fit_time': array([0.0267365 , 0.02698654, 0.03848445, 0.069969  , 0.15291595]),
 'std_fit_time': array([0.00311364, 0.00254804, 0.00086759, 0.0211019 , 0.03262181]),
 'mean_score_time': array([0.00274664, 0.00374728, 0.002244  , 0.00274181, 0.00249678]),
 'std_score_time': array([0.00082786, 0.00148003, 0.00043627, 0.0008349 , 0.00086675]),
 'param_C': masked_array(data=[0.001, 0.01, 0.1, 1, 10],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.001}, {'C': 0.01}, {'C': 0.1}, {'C': 1}, {'C': 10}],
 'split0_test_Accuracy': array([0.71794872, 0.84615385, 0.86538462, 0.85897436, 0.86538462]),
 'split1_test_Accuracy': array([0.71153846, 0.83974359, 0.81410256, 0.82692308, 0.82051282]),
 'split2_test_Accuracy': array([0.69871795, 0.85897436, 0.83974359, 0.83974359, 0.83333333]),
 'split3_test_Accuracy': array([0.74193548, 0.81935484, 0.83225806, 0.83225806, 0.83225806]),
 'mean_test_Accuracy': array([0.7175