# Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Read data
data = pd.read_json('titanic_json.json')
# check data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Check shape
data.shape

(891, 12)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


# Preprocessing

Kolom yang akan menjadi prediktor adalah Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, dan Embarked.

Kolom respon adalah Survived.

Kolom dengan data non-angka akan diubah menjadi angka dengan teknik categorical label encoding.

Sebagai contoh, Sex:male diberi label 1, Sex:female diberi label 2.

Missing value pada kolom kategorik (Pclass, Sex, SibSp, Parch, Cabin, dan Embarked) akan diisi dengan nilai modus.

Missing value pada kolom numerik (Fare, Age) akan diisi dengan median.

In [5]:
# Define numerical columns
numerical_cols = [cname for cname in data.columns if data[cname].dtype in ['int64', 'float64']]
# drop passenger id and survived
numerical_cols.remove('PassengerId')
numerical_cols.remove('Survived')
# check numerical columns
numerical_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [6]:
# Define categorical columns
categorical_cols = [cname for cname in data.columns if data[cname].dtype == 'object']
# drop name and ticket
categorical_cols.remove('Name')
categorical_cols.remove('Ticket')
categorical_cols

['Sex', 'Cabin', 'Embarked']

In [7]:
# Select numerical columns
X_num = data[numerical_cols]

# Select categorical columns
X_cat = data[categorical_cols]

# Select target
y = data['Survived']

### Ubah kolom non-angka ke dalam angka dengan label encoding

Kolom yg ingin kita ubah adalah Sex, Cabin, dan Embarked.

In [8]:
# Label encoding
X_cat_encoded = X_cat.apply(LabelEncoder().fit_transform)
X_cat_encoded

Unnamed: 0,Sex,Cabin,Embarked
0,1,147,2
1,0,81,0
2,0,147,2
3,0,55,2
4,1,147,2
...,...,...,...
886,1,147,2
887,0,30,2
888,0,147,2
889,1,60,0


In [9]:
# Join categorical and numerical features
X = pd.concat([X_num, X_cat_encoded], axis=1)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Cabin,Embarked
0,3,22.0,1,0,7.25,1,147,2
1,1,38.0,1,0,71.2833,0,81,0
2,3,26.0,0,0,7.925,0,147,2
3,1,35.0,1,0,53.1,0,55,2
4,3,35.0,0,0,8.05,1,147,2


### Observasi ada tidaknya missing value

In [10]:
# Check missing values
X.isnull().sum()

Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
Sex           0
Cabin         0
Embarked      0
dtype: int64

### Isi missing value kolom angka (Age) dengan median

In [11]:
# Impute missing values age with median
X['Age'].fillna(X['Age'].median(), inplace=True)
# Check missing values
X.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Cabin       0
Embarked    0
dtype: int64

# Train Test Split

Setelah data 'bersih', saatnya kita bikin model sesuai kreativitas masing2

In [12]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Evaluation Function

In [13]:
def evaluate_classification(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(classification_report(y_true, y_pred))
    return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1}

# Modelling

In [14]:
# NearestCentroid
from sklearn.neighbors import NearestCentroid
# RidgeClassifier
from sklearn.linear_model import RidgeClassifier
# LinearSVC
from sklearn.svm import LinearSVC
# LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# SGDClassifier
from sklearn.linear_model import SGDClassifier
# BernoulliNB
from sklearn.naive_bayes import BernoulliNB

## Using Test Train Split

### NearestCentroid

In [15]:
# NearestCentroid
nc = NearestCentroid()
nc.fit(X_train, y_train)
y_pred = nc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       114
           1       0.67      0.51      0.58        65

    accuracy                           0.73       179
   macro avg       0.71      0.68      0.69       179
weighted avg       0.72      0.73      0.72       179



{'Accuracy': 0.7318435754189944,
 'Precision': 0.673469387755102,
 'Recall': 0.5076923076923077,
 'F1-score': 0.5789473684210525}

### RidgeClassifier

In [16]:
# RidgeClassifier
rc = RidgeClassifier()
rc.fit(X_train, y_train)
y_pred = rc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       114
           1       0.75      0.72      0.73        65

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179



{'Accuracy': 0.8100558659217877,
 'Precision': 0.746031746031746,
 'Recall': 0.7230769230769231,
 'F1-score': 0.7343749999999999}

### LinearSVC

In [17]:
# LinearSVC
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
y_pred = lsvc.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.71      0.96      0.82       114
           1       0.84      0.32      0.47        65

    accuracy                           0.73       179
   macro avg       0.78      0.64      0.64       179
weighted avg       0.76      0.73      0.69       179



{'Accuracy': 0.7318435754189944,
 'Precision': 0.84,
 'Recall': 0.3230769230769231,
 'F1-score': 0.4666666666666667}

### LinearDiscriminantAnalysis

In [18]:
# LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       114
           1       0.75      0.72      0.73        65

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179



{'Accuracy': 0.8100558659217877,
 'Precision': 0.746031746031746,
 'Recall': 0.7230769230769231,
 'F1-score': 0.7343749999999999}

### SGDClassifier

In [19]:
# SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.58      0.48      0.53       114
           1       0.30      0.38      0.34        65

    accuracy                           0.45       179
   macro avg       0.44      0.43      0.43       179
weighted avg       0.48      0.45      0.46       179



{'Accuracy': 0.44692737430167595,
 'Precision': 0.2976190476190476,
 'Recall': 0.38461538461538464,
 'F1-score': 0.33557046979865773}

### BernoulliNB

In [20]:
# BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
evaluate_classification(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       114
           1       0.72      0.71      0.71        65

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179



{'Accuracy': 0.7932960893854749,
 'Precision': 0.71875,
 'Recall': 0.7076923076923077,
 'F1-score': 0.7131782945736435}

In [21]:
# Create function to automate model training
def train_model(X_train, y_train, X_test, y_test):
    # Create list of models
    list_of_models = [NearestCentroid(), RidgeClassifier(), LinearSVC(), LinearDiscriminantAnalysis(), SGDClassifier(), BernoulliNB()]

    # Create empty list to store accuracy scores, f1 scores, precision scores, recall scores
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []

    # Loop through list of models
    for model in list_of_models:
        # Fit model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_test)
        # Append accuracy scores
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        # Append f1 scores
        f1_scores.append(f1_score(y_test, y_pred))
        # Append precision scores
        precision_scores.append(precision_score(y_test, y_pred))
        # Append recall scores
        recall_scores.append(recall_score(y_test, y_pred))

    # Create dictionary to store scores
    val = {
        "Model" : ["NearestCentroid", "RidgeClassifier", "LinearSVC", "LinearDiscriminantAnalysis", "SGDClassifier", "BernoulliNB"],
        "Accuracy" : accuracy_scores,
        "F1" : f1_scores,
        "Precision" : precision_scores,
        "Recall" : recall_scores
    }

    # Return dataframe
    return pd.DataFrame(val)

In [22]:
# Call function
Result = train_model(X_train, y_train, X_test, y_test)
Result

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,NearestCentroid,0.731844,0.578947,0.673469,0.507692
1,RidgeClassifier,0.810056,0.734375,0.746032,0.723077
2,LinearSVC,0.810056,0.701754,0.816327,0.615385
3,LinearDiscriminantAnalysis,0.810056,0.734375,0.746032,0.723077
4,SGDClassifier,0.731844,0.478261,0.814815,0.338462
5,BernoulliNB,0.793296,0.713178,0.71875,0.707692


In [23]:
# Sort values by accuracy
Result.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
1,RidgeClassifier,0.810056,0.734375,0.746032,0.723077
2,LinearSVC,0.810056,0.701754,0.816327,0.615385
3,LinearDiscriminantAnalysis,0.810056,0.734375,0.746032,0.723077
5,BernoulliNB,0.793296,0.713178,0.71875,0.707692
0,NearestCentroid,0.731844,0.578947,0.673469,0.507692
4,SGDClassifier,0.731844,0.478261,0.814815,0.338462


## Using Cross Validation

In [24]:
# Create function to automate model training
def train_model_cv(X, y):
    # Create list of models
    list_of_models = [NearestCentroid(), RidgeClassifier(), LinearSVC(), LinearDiscriminantAnalysis(), SGDClassifier(), BernoulliNB()]
    # Create empty list to store accuracy scores, f1 scores, precision scores, recall scores
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []
    # Define number of folds
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    # Loop through list of models
    for model in list_of_models:
         # Append accuracy scores
        accuracy_scores.append(cross_val_score(model, X, y, cv=kf, scoring='accuracy').mean())
        # Append f1 scores
        f1_scores.append(cross_val_score(model, X, y, cv=kf, scoring='f1').mean())
        # Append precision scores
        precision_scores.append(cross_val_score(model, X, y, cv=kf, scoring='precision').mean())
        # Append recall scores
        recall_scores.append(cross_val_score(model, X, y, cv=kf, scoring='recall').mean())

    # Create dictionary to store scores

    val = {
        "Model" : ["NearestCentroid", "RidgeClassifier", "LinearSVC", "LinearDiscriminantAnalysis", "SGDClassifier", "BernoulliNB"],
        "Accuracy" : accuracy_scores,
        "F1" : f1_scores,
        "Precision" : precision_scores,
        "Recall" : recall_scores
    }
    # Return dataframe
    return pd.DataFrame(val)

In [25]:
# Call function
Result = train_model_cv(X, y)
Result

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,NearestCentroid,0.674459,0.475435,0.618109,0.391125
1,RidgeClassifier,0.793472,0.718424,0.744113,0.696536
2,LinearSVC,0.746268,0.579775,0.640229,0.805853
3,LinearDiscriminantAnalysis,0.792348,0.717299,0.74157,0.696536
4,SGDClassifier,0.602492,0.488168,0.441183,0.395514
5,BernoulliNB,0.785626,0.70939,0.736001,0.688936


In [26]:
# Sort values by accuracy
Result.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
1,RidgeClassifier,0.793472,0.718424,0.744113,0.696536
3,LinearDiscriminantAnalysis,0.792348,0.717299,0.74157,0.696536
5,BernoulliNB,0.785626,0.70939,0.736001,0.688936
2,LinearSVC,0.746268,0.579775,0.640229,0.805853
0,NearestCentroid,0.674459,0.475435,0.618109,0.391125
4,SGDClassifier,0.602492,0.488168,0.441183,0.395514
