In [1]:
import pandas as pd
import numpy as np

data  = pd.read_csv("./titanic_data/train.csv")

In [2]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
X = data.drop(['Survived', 'Cabin', 'Ticket'], axis=1)
y = data["Survived"].astype(np.int8)

In [4]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [6]:
# taking only the first letter of the names
X["Name"] = X["Name"].apply(lambda x: x.strip()[0])

# filling missing ages with the mean age
X["Age"] = X["Age"].fillna(X["Age"].mean())

# filling missing emarbarked with the most common value
X["Embarked"] = X["Embarked"].fillna(X["Embarked"].value_counts().index[0])

In [7]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,B,male,22.0,1,0,7.25,S
1,2,1,C,female,38.0,1,0,71.2833,C
2,3,3,H,female,26.0,0,0,7.925,S
3,4,1,F,female,35.0,1,0,53.1,S
4,5,3,A,male,35.0,0,0,8.05,S


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [9]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ["PassengerId", "Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_features = ["Name", "Sex", "Embarked"]

# transformer to scale the numeric data
numeric_transformer = StandardScaler()

# transformer to encode the categorical data
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)

# combine the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

#### Trying out some models with default parameters

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier

classifiers = [LogisticRegression(), AdaBoostClassifier(), GradientBoostingClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), BaggingClassifier()]
votting_classifier = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()), 
        ('rf', RandomForestClassifier()), 
        ('et', ExtraTreesClassifier()),
        ('gb', GradientBoostingClassifier()),
        ('ada', AdaBoostClassifier()),
    ], 
    voting='hard'
)

classifiers.append(votting_classifier)

for classifier in classifiers:
    # create the pipeline
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("scaler", StandardScaler()),
            ("classifier", classifier),
        ]
    )

    pipe.fit(X_train, y_train)
    print(f"{classifier.__class__.__name__} score: {pipe.score(X_test, y_test)}")

LogisticRegression score: 0.8100558659217877
AdaBoostClassifier score: 0.7988826815642458
GradientBoostingClassifier score: 0.8100558659217877
RandomForestClassifier score: 0.8268156424581006
ExtraTreesClassifier score: 0.8100558659217877
BaggingClassifier score: 0.8379888268156425
VotingClassifier score: 0.7877094972067039


In [20]:
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", RandomForestClassifier()),
    ]
)

model.fit(X_train, y_train)

In [21]:
print("Model score on training data: ", model.score(X_train, y_train))
print("Model score on test data: ", model.score(X_test, y_test))

Model score on training data:  1.0
Model score on test data:  0.8156424581005587


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__n_estimators": [10, 50, 100, 200, 500],
}

grid_search = GridSearchCV(
    model, param_grid, cv=5, n_jobs=-1, verbose=1, return_train_score=True
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [24]:
print("Grid search score on training data: ", grid_search.score(X_train, y_train))
print("Grid search score on test data: ", grid_search.score(X_test, y_test))

Grid search score on training data:  1.0
Grid search score on test data:  0.8268156424581006


In [25]:
final_model = grid_search.best_estimator_

In [26]:
# retraining the best model on the whole training data
final_model.fit(X, y)

In [27]:
# saving the best model to the models directory
import pickle
import os

os.makedirs("./models", exist_ok=True)

with open("./models/model.pkl", "wb") as f:
    pickle.dump(final_model, f)