# Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Read data
data = pd.read_json('titanic_json.json')
# check data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Check shape
data.shape

(891, 12)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


# Preprocessing

Kolom yang akan menjadi prediktor adalah Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, dan Embarked.

Kolom respon adalah Survived.

Kolom dengan data non-angka akan diubah menjadi angka dengan teknik categorical label encoding.

Sebagai contoh, Sex:male diberi label 1, Sex:female diberi label 2.

Missing value pada kolom kategorik (Pclass, Sex, SibSp, Parch, Cabin, dan Embarked) akan diisi dengan nilai modus.

Missing value pada kolom numerik (Fare, Age) akan diisi dengan median.

In [5]:
# Define numerical columns
numerical_cols = [cname for cname in data.columns if data[cname].dtype in ['int64', 'float64']]
# drop passenger id and survived
numerical_cols.remove('PassengerId')
numerical_cols.remove('Survived')
# check numerical columns
numerical_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [6]:
# Define categorical columns
categorical_cols = [cname for cname in data.columns if data[cname].dtype == 'object']
# drop name and ticket
categorical_cols.remove('Name')
categorical_cols.remove('Ticket')
categorical_cols

['Sex', 'Cabin', 'Embarked']

In [7]:
# Select numerical columns
X_num = data[numerical_cols]

# Select categorical columns
X_cat = data[categorical_cols]

# Select target
y = data['Survived']

### Ubah kolom non-angka ke dalam angka dengan label encoding

Kolom yg ingin kita ubah adalah Sex, Cabin, dan Embarked.

In [8]:
# Label encoding
X_cat_encoded = X_cat.apply(LabelEncoder().fit_transform)
X_cat_encoded

Unnamed: 0,Sex,Cabin,Embarked
0,1,147,2
1,0,81,0
2,0,147,2
3,0,55,2
4,1,147,2
...,...,...,...
886,1,147,2
887,0,30,2
888,0,147,2
889,1,60,0


In [9]:
# Join categorical and numerical features
X = pd.concat([X_num, X_cat_encoded], axis=1)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Cabin,Embarked
0,3,22.0,1,0,7.25,1,147,2
1,1,38.0,1,0,71.2833,0,81,0
2,3,26.0,0,0,7.925,0,147,2
3,1,35.0,1,0,53.1,0,55,2
4,3,35.0,0,0,8.05,1,147,2


### Observasi ada tidaknya missing value

In [10]:
# Check missing values
X.isnull().sum()

Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
Sex           0
Cabin         0
Embarked      0
dtype: int64

### Isi missing value kolom angka (Age) dengan median

In [11]:
# Impute missing values age with median
X['Age'].fillna(X['Age'].median(), inplace=True)
# Check missing values
X.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Cabin       0
Embarked    0
dtype: int64

# Train Test Split

Setelah data 'bersih', saatnya kita bikin model sesuai kreativitas masing2

In [12]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Evaluation Function

In [13]:
def evaluate_classification(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(classification_report(y_true, y_pred))
    return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1}

# Modelling

In [14]:
# Passive Aggressive Classifier
from sklearn.linear_model import PassiveAggressiveClassifier
# NuSVC
from sklearn.svm import NuSVC
# LogisticRegression
from sklearn.linear_model import LogisticRegression
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier
# HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

## Using Test Train Split

### PassiveAggressiveClassifier

In [26]:
# PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier()
pac.fit(X_train, y_train)
y_pred = pac.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.70      0.89      0.79       114
           1       0.65      0.34      0.44        65

    accuracy                           0.69       179
   macro avg       0.68      0.62      0.62       179
weighted avg       0.68      0.69      0.66       179



{'Accuracy': 0.6927374301675978,
 'Precision': 0.6470588235294118,
 'Recall': 0.3384615384615385,
 'F1-score': 0.4444444444444445}

### NuSVC

In [25]:
# NuSVC
nusvc = NuSVC()
nusvc.fit(X_train, y_train)
y_pred = nusvc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       114
           1       0.71      0.78      0.74        65

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.79       179
weighted avg       0.81      0.80      0.81       179



{'Accuracy': 0.8044692737430168,
 'Precision': 0.7083333333333334,
 'Recall': 0.7846153846153846,
 'F1-score': 0.7445255474452555}

### LogisticRegression

In [24]:
# LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       114
           1       0.74      0.74      0.74        65

    accuracy                           0.81       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179



{'Accuracy': 0.8100558659217877,
 'Precision': 0.7384615384615385,
 'Recall': 0.7384615384615385,
 'F1-score': 0.7384615384615385}

### GradientBoostingClassifier

In [23]:
# GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       114
           1       0.81      0.85      0.83        65

    accuracy                           0.87       179
   macro avg       0.86      0.87      0.86       179
weighted avg       0.87      0.87      0.87       179



{'Accuracy': 0.8715083798882681,
 'Precision': 0.8088235294117647,
 'Recall': 0.8461538461538461,
 'F1-score': 0.8270676691729324}

### BaggingClassifier

In [22]:
# BaggingClassifier
bc = BaggingClassifier()
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       114
           1       0.83      0.82      0.82        65

    accuracy                           0.87       179
   macro avg       0.86      0.86      0.86       179
weighted avg       0.87      0.87      0.87       179



{'Accuracy': 0.8715083798882681,
 'Precision': 0.828125,
 'Recall': 0.8153846153846154,
 'F1-score': 0.8217054263565892}

### ExtraTreesClassifier

In [21]:
# ExtraTreesClassifier
et = ExtraTreesClassifier()
et.fit(X_train, y_train)
y_pred = et.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       114
           1       0.70      0.75      0.73        65

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.79      0.79       179



{'Accuracy': 0.7932960893854749,
 'Precision': 0.7,
 'Recall': 0.7538461538461538,
 'F1-score': 0.725925925925926}

### HistGradientBoostingClassifier

In [27]:
# HistGradientBoostingClassifier
hgbc = HistGradientBoostingClassifier()
hgbc.fit(X_train, y_train)
y_pred = hgbc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       114
           1       0.87      0.83      0.85        65

    accuracy                           0.89       179
   macro avg       0.89      0.88      0.88       179
weighted avg       0.89      0.89      0.89       179



{'Accuracy': 0.8938547486033519,
 'Precision': 0.8709677419354839,
 'Recall': 0.8307692307692308,
 'F1-score': 0.8503937007874016}

### Automate Model Training

In [30]:
# Create function to automate model training
def train_model(X_train, y_train, X_test, y_test):
    # Create list of models
    list_of_models = [PassiveAggressiveClassifier(), NuSVC(), LogisticRegression(), GradientBoostingClassifier(), BaggingClassifier(), ExtraTreesClassifier(), HistGradientBoostingClassifier()]

    # Create empty list to store accuracy scores, f1 scores, precision scores, recall scores
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []

    # Loop through list of models
    for model in list_of_models:
        # Fit model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_test)
        # Append accuracy scores
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        # Append f1 scores
        f1_scores.append(f1_score(y_test, y_pred))
        # Append precision scores
        precision_scores.append(precision_score(y_test, y_pred))
        # Append recall scores
        recall_scores.append(recall_score(y_test, y_pred))

    # Create dictionary to store scores
    val = {
        "Model" : ["PassiveAggressiveClassifier", "NuSVC", "LogisticRegression", "GradientBoostingClassifier", "BaggingClassifier", "ExtraTreesClassifier","HistGradientBoostingClassifier"],
        "Accuracy" : accuracy_scores,
        "F1" : f1_scores,
        "Precision" : precision_scores,
        "Recall" : recall_scores
    }

    # Return dataframe
    return pd.DataFrame(val)

In [31]:
# Call function
Result = train_model(X_train, y_train, X_test, y_test)
Result

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,PassiveAggressiveClassifier,0.363128,0.525,0.36,0.969231
1,NuSVC,0.804469,0.744526,0.708333,0.784615
2,LogisticRegression,0.810056,0.738462,0.738462,0.738462
3,GradientBoostingClassifier,0.871508,0.827068,0.808824,0.846154
4,BaggingClassifier,0.821229,0.753846,0.753846,0.753846
5,ExtraTreesClassifier,0.798883,0.731343,0.710145,0.753846
6,HistGradientBoostingClassifier,0.893855,0.850394,0.870968,0.830769


In [32]:
# Sort values by accuracy
Result.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
6,HistGradientBoostingClassifier,0.893855,0.850394,0.870968,0.830769
3,GradientBoostingClassifier,0.871508,0.827068,0.808824,0.846154
4,BaggingClassifier,0.821229,0.753846,0.753846,0.753846
2,LogisticRegression,0.810056,0.738462,0.738462,0.738462
1,NuSVC,0.804469,0.744526,0.708333,0.784615
5,ExtraTreesClassifier,0.798883,0.731343,0.710145,0.753846
0,PassiveAggressiveClassifier,0.363128,0.525,0.36,0.969231


## Using Cross Validation

In [33]:
# Create function to automate model training
def train_model_cv(X, y):
    # Create list of models
    list_of_models = [PassiveAggressiveClassifier(), NuSVC(), LogisticRegression(), GradientBoostingClassifier(), BaggingClassifier(), ExtraTreesClassifier(), HistGradientBoostingClassifier()]
    # Create empty list to store accuracy scores, f1 scores, precision scores, recall scores
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []
    # Define number of folds
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    # Loop through list of models
    for model in list_of_models:
         # Append accuracy scores
        accuracy_scores.append(cross_val_score(model, X, y, cv=kf, scoring='accuracy').mean())
        # Append f1 scores
        f1_scores.append(cross_val_score(model, X, y, cv=kf, scoring='f1').mean())
        # Append precision scores
        precision_scores.append(cross_val_score(model, X, y, cv=kf, scoring='precision').mean())
        # Append recall scores
        recall_scores.append(cross_val_score(model, X, y, cv=kf, scoring='recall').mean())

    # Create dictionary to store scores

    val = {
        "Model" : ["PassiveAggressiveClassifier", "NuSVC", "LogisticRegression", "GradientBoostingClassifier", "BaggingClassifier", "ExtraTreesClassifier","HistGradientBoostingClassifier"],
        "Accuracy" : accuracy_scores,
        "F1" : f1_scores,
        "Precision" : precision_scores,
        "Recall" : recall_scores
    }
    # Return dataframe
    return pd.DataFrame(val)

In [34]:
# Call function
Result = train_model_cv(X, y)
Result

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,PassiveAggressiveClassifier,0.556481,0.365282,0.575121,0.134714
1,NuSVC,0.794621,0.731012,0.728042,0.73845
2,LogisticRegression,0.793472,0.720523,0.743627,0.702115
3,GradientBoostingClassifier,0.824863,0.75588,0.80644,0.716023
4,BaggingClassifier,0.810282,0.734089,0.802487,0.68868
5,ExtraTreesClassifier,0.790114,0.726027,0.7366,0.703379
6,HistGradientBoostingClassifier,0.836074,0.775688,0.813013,0.743311


In [35]:
# Sort values by accuracy
Result.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
6,HistGradientBoostingClassifier,0.836074,0.775688,0.813013,0.743311
3,GradientBoostingClassifier,0.824863,0.75588,0.80644,0.716023
4,BaggingClassifier,0.810282,0.734089,0.802487,0.68868
1,NuSVC,0.794621,0.731012,0.728042,0.73845
2,LogisticRegression,0.793472,0.720523,0.743627,0.702115
5,ExtraTreesClassifier,0.790114,0.726027,0.7366,0.703379
0,PassiveAggressiveClassifier,0.556481,0.365282,0.575121,0.134714
