# Titanic Example
In this example, we will build a **logistic regression model** on the **Titanic** data. The goal is to predict whether a passenger survived or not.

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned Titanic table, then separate the label from the predictors.
df = pd.read_csv("titanic_cleaned.csv")

y = df['survived']
X = df.drop(columns='survived')

# Preview the first rows of both pieces, divided by a 20-dash rule.
print(X.head(), "-"*20, y.head(), sep="\n")

   pclass     sex   age     fare embarked
0       3    male  22.0   7.2500        S
1       1  female  38.0  71.2833        C
2       3  female  26.0   7.9250        S
3       1  female  35.0  53.1000        S
4       3    male  35.0   8.0500        S
--------------------
0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64


## 0. Data Split

In [2]:
from sklearn.model_selection import train_test_split

# Hold out a fixed share of the rows for testing; the split is seeded via random_state.
TEST_PROPORTION = 0.2
SEED = 87

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_PROPORTION,
    random_state=SEED,
)

# Report the shape of every piece of the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(712, 5) (179, 5) (712,) (179,)


## 1. Preprocessing

#### 1.1 Missing value imputation

In [3]:
# Number of missing entries in each training column.
X_train.isna().sum()

pclass        0
sex           0
age         143
fare          0
embarked      1
dtype: int64

- In our training set, only the `age` and `embarked` columns contain missing values.
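- Is it okay to construct imputation transformers for these two variables only? No: the test set (or any future data) may contain missing values in other columns, so below we fit imputers on every column.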

In [4]:
from sklearn.impute import SimpleImputer

# Fill gaps in the numerical columns with the column mean
# (other strategies: most_frequent, median, ...).
num_cols = ['age', 'fare']
num_imputer = SimpleImputer(strategy='mean')

# The imputer is fitted on the training rows only and then reused for the test rows.
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

In [5]:
# Fill gaps in the categorical columns with each column's most frequent value,
# learned from the training rows only and reused for the test rows.
cat_cols = ['pclass', 'sex', 'embarked']
cat_imputer = SimpleImputer(strategy='most_frequent')

X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

#### 1.2 Scaling (Numerical variable)

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns; the scaler is fitted on the training rows only.
scaler = StandardScaler()
scaler.fit(X_train[['age', 'fare']])

X_train_num_scaled = scaler.transform(X_train[['age', 'fare']])
X_test_num_scaled = scaler.transform(X_test[['age', 'fare']])

#### 1.3 Dimensionality Reduction (Numerical variable)
Since we only have two numerical variables in the dataset, we do not reduce the dimensionality.

#### 1.4 Encoding (Categorical variable)

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns into a dense array. The encoder is
# fitted on the training rows only; handle_unknown='ignore' keeps transform
# from raising on categories that were not present during fitting.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[['pclass', 'sex', 'embarked']])

X_train_cat_encoded = encoder.transform(X_train[['pclass', 'sex', 'embarked']])
X_test_cat_encoded = encoder.transform(X_test[['pclass', 'sex', 'embarked']])

#### 1.5 Save

In [8]:
# Column labels: the two numerical names followed by the encoder's one-hot names.
colnames = np.concatenate([['age', 'fare'], encoder.get_feature_names_out(['pclass', 'sex', 'embarked'])])

# Put the scaled numerical block and the encoded categorical block side by side,
# wrapped in DataFrames that keep the row index of the corresponding split.
X_train_processed = pd.DataFrame(
    np.hstack([X_train_num_scaled, X_train_cat_encoded]),
    columns=colnames,
    index=X_train.index,
)
X_test_processed = pd.DataFrame(
    np.hstack([X_test_num_scaled, X_test_cat_encoded]),
    columns=colnames,
    index=X_test.index,
)

In [9]:
# Optional (disabled): uncomment to write the processed features and the
# targets of both splits to CSV files under ../data/processed/LogisticReg/.
# X_train_processed.to_csv("../data/processed/LogisticReg/X_train.csv")
# X_test_processed.to_csv("../data/processed/LogisticReg/X_test.csv")
# y_train.to_csv("../data/processed/LogisticReg/y_train.csv")
# y_test.to_csv("../data/processed/LogisticReg/y_test.csv")

In [10]:
# Optional (disabled): uncomment to pickle the fitted imputers, scaler and
# encoder together in a single dict.
# import pickle

# preprocessing_bundle = {
#     'num_imputer': num_imputer,
#     'cat_imputer': cat_imputer,
#     'scaler': scaler,
#     'encoder': encoder
# }

# # Save
# with open("../models/LogisticReg/preprocessing.pkl", "wb") as f:
#     pickle.dump(preprocessing_bundle, f)

#### More?
The following can be considered:
- You might choose to group passengers by `age` into categories such as children, adults, and the elderly.
- A histogram of the `fare` variable may reveal strong skewness, suggesting that a log transformation could be beneficial.
- and more...

## 2. Fitting and Prediction

In [11]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with no penalty term, fitted on the processed training data.
clf = LogisticRegression(max_iter=1000, penalty=None).fit(X_train_processed, y_train)

# Predicted labels for both splits.
y_train_pred = clf.predict(X_train_processed)
y_test_pred = clf.predict(X_test_processed)

In [12]:
# True labels of the held-out test rows.
y_test

656    0
851    0
663    0
718    0
15     1
      ..
762    1
361    0
726    1
159    0
4      0
Name: survived, Length: 179, dtype: int64

In [13]:
# Labels predicted by `clf` for the test rows.
y_test_pred

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0])

## 3. Model Evaluation

In [14]:
from sklearn.metrics import accuracy_score

# Accuracy of the baseline predictions on each split.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")

Training Accuracy: 0.805
Test Accuracy: 0.799


## 4. Model Selection
In `LogisticRegression`, we have the following hyperparameters:
- `C`     : Inverse of regularization strength
- `solver`: Algorithm to use in the optimization problem
- ...

We will select the best combination from the specified parameter grid.

In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: every (C, solver) pair below is scored by
# 5-fold cross-validated accuracy on the training data.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['lbfgs', 'liblinear'],
)

model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)

# Kept as the last expression so the fitted search object is displayed.
grid_search.fit(X_train_processed, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.01, 0.1, ...], 'solver': ['lbfgs', 'liblinear']}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [16]:
# Best estimator found by the grid search.
best_model = grid_search.best_estimator_

y_train_pred_best = best_model.predict(X_train_processed)
y_test_pred_best = best_model.predict(X_test_processed)

# Final evaluation: accuracy of the selected model on both splits.
train_acc = accuracy_score(y_train, y_train_pred_best)
test_acc = accuracy_score(y_test, y_test_pred_best)

print(f"Training Accuracy: {train_acc:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")

Training Accuracy: 0.798
Test Accuracy: 0.804


## 5. Save

In [17]:
# Optional (disabled): uncomment to pickle the best estimator from the grid search.
# import pickle

# with open('../models/LogisticReg/model.pkl', 'wb') as f:
#     pickle.dump(grid_search.best_estimator_, f)

## 6. Load

In [18]:
# Optional (disabled): uncomment to load a previously pickled model into `model`.
# NOTE: only unpickle files you trust; loading a pickle can execute arbitrary code.
# import pickle

# with open('../models/LogisticReg/model.pkl', 'rb') as f:
#     model = pickle.load(f)