# Library

In [1]:
# Importing the common libraries
import pandas as pd
import numpy as np

# importing the libraries for data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

# Importing the libraries for the model
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

# Importing the libraries for the evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score

# Importing additional libraries
import os
from pathlib import Path
import shutil
import gzip
import joblib


# Dataset

In [2]:
# Download titanic dataset
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'

# Define the file paths
data_path = Path('data')
data_file = data_path / 'titanic.csv'

# Store the data in the data folder
if not os.path.exists(data_path):
    os.makedirs(data_path)

# Download the data
if not os.path.exists(data_file):
    wget.download(url, data_file.as_posix())

# Load the data
data = pd.read_csv(data_file)

# Display the first 5 rows of the data
print(data.head())

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  


In [3]:
# Check the data types of the columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [4]:
# Drop the columns that are not required
data.drop(['Name'], axis=1, inplace=True)

# Check the missing values in the data
print(data.isnull().sum())

Survived                   0
Pclass                     0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64


In [5]:
# Check survived column
print(data['Survived'].value_counts())

Survived
0    545
1    342
Name: count, dtype: int64


In [6]:
# Define the target and features
target = 'Survived'
features = data.columns.drop(target)

# Splitting Dataset

In [7]:
# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

In [8]:
# Check the shape of the train and test data
print(X_train.shape, X_test.shape)

(709, 6) (178, 6)


In [9]:
# Check the class distribution in the target
print(y_train.value_counts())

Survived
0    434
1    275
Name: count, dtype: int64


In [10]:
# Check unique values in Pclass
print(X_train['Pclass'].value_counts())

Pclass
3    394
1    170
2    145
Name: count, dtype: int64


In [11]:
# Define categorical columns
categorical_columns = ['Sex', 'Pclass']  # Include both 'Sex' and 'Pclass'

# Preprocessing Data

## Label Encoder

In [12]:
# Label encoder for the categorical columns using LabelEncoder

# Initialize the label encoder
label_encoders = {}
for cat_col in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[cat_col] = le.fit_transform(X_train[cat_col])
    X_test[cat_col] = le.transform(X_test[cat_col])
    label_encoders[cat_col] = le

In [13]:
# Check the first 5 rows of the data
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
730,2,1,23.0,0,0,13.0
390,3,1,28.0,2,0,7.925
118,3,0,2.0,4,2,31.275
440,2,0,28.0,0,0,13.0
309,1,0,18.0,2,2,262.375


In [14]:
# check label encoders
print(label_encoders)

{'Sex': LabelEncoder()}


In [15]:
# Save the label encoders using joblib
joblib.dump(label_encoders, 'Model/label_encoders.pkl')

['Model/label_encoders.pkl']

## Standar scaler

In [16]:
# Standardize the data without Pclass and Sex columns
scaler = StandardScaler()
# Select numerical columns
numerical_columns = ['Age', 'Siblings/Spouses Aboard','Parents/Children Aboard', 'Fare']
# Scale the numerical columns
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Check the first 5 rows of the data
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
730,2,1,-0.44901,-0.470388,-0.475009,-0.388597
390,3,1,-0.094153,1.275711,-0.475009,-0.484577
118,3,0,-1.939405,3.02181,1.992254,-0.042975
440,2,0,-0.094153,-0.470388,-0.475009,-0.388597
309,1,0,-0.803866,1.275711,1.992254,4.327664


In [17]:
# Save the scaler using joblib
joblib.dump(scaler, 'Model/scaler.pkl')

['Model/scaler.pkl']

# Handle Imbalanced Class

In [18]:
# Compute class weights to handle imbalanced dataset
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))


In [19]:
# Check class weights
print(class_weights)

[0.81682028 1.28909091]


In [20]:
# Check class weights dictionary
print(class_weights_dict)

{0: 0.8168202764976958, 1: 1.289090909090909}


# Modelling TabNet with Weighted Class

In [21]:
# Calculate n_steps for TabNet
n_steps = X_train.shape[0] // 2
# print n_steps
print(n_steps)

354


In [22]:
# Define Sparsemax Layer
class Sparsemax(keras.layers.Layer):
    def call(self, inputs):
        return tfa.activations.sparsemax(inputs)

In [23]:
# Define the TabNet Encoder
class TabNetFeatureTransformer(keras.layers.Layer):
    def __init__(self, hidden_dim):
        super(TabNetFeatureTransformer, self).__init__()
        self.hidden_dim = hidden_dim
        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
        self.bn1 = keras.layers.BatchNormalization()
        self.dense2 = keras.layers.Dense(hidden_dim, activation='relu')
        self.bn2 = keras.layers.BatchNormalization()

    def call(self, x):
        x = self.dense1(x)
        x = self.bn1(x)
        x = self.dense2(x)
        x = self.bn2(x)
        return x

    # Add get_config method
    def get_config(self):
        config = super().get_config()
        config.update({
            "hidden_dim": self.hidden_dim,
        })
        return config

class TabNetAttentiveTransformer(keras.layers.Layer):
    def __init__(self, hidden_dim):
        super(TabNetAttentiveTransformer, self).__init__()
        self.hidden_dim = hidden_dim
        self.dense = keras.layers.Dense(hidden_dim)
        self.sparsemax = Sparsemax()

    def call(self, x):
        x = self.dense(x)
        x = self.sparsemax(x)
        return x

    # Add get_config method
    def get_config(self):
        config = super().get_config()
        config.update({
            "hidden_dim": self.hidden_dim,
        })
        return config

class TabNetEncoder(keras.layers.Layer):
    def __init__(self, feature_dim, num_steps):
        super(TabNetEncoder, self).__init__()
        self.feature_dim = feature_dim
        self.num_steps = num_steps
        self.feature_transformer_shared = TabNetFeatureTransformer(feature_dim)
        self.attentive_transformer = TabNetAttentiveTransformer(feature_dim)
        self.masks = []  # Store masks for feature importance

    def call(self, inputs):
        outputs = []
        masked_features = inputs
        for step in range(self.num_steps):
            # Feature transformer
            transformed_features = self.feature_transformer_shared(masked_features)
            
            # Attentive transformer for feature selection
            attention_weights = self.attentive_transformer(transformed_features)
            masked_features = attention_weights * inputs
            
            self.masks.append(attention_weights)  # Save the mask
            outputs.append(transformed_features)
        
        return keras.layers.Concatenate()(outputs)

    # Add get_config method
    def get_config(self):
        config = super().get_config()
        config.update({
            "feature_dim": self.feature_dim,
            "num_steps": self.num_steps,
        })
        return config


In [24]:
# TabNet model
def build_tabnet_model(input_shape, feature_dim, num_steps):
    inputs = keras.Input(shape=input_shape)
    tabnet_encoder = TabNetEncoder(feature_dim=feature_dim, num_steps=num_steps)(inputs)
    outputs = keras.layers.Dense(1, activation="sigmoid")(tabnet_encoder)
    
    model = keras.Model(inputs, outputs)
    return model

In [25]:
# Build the TabNet model
input_shape = X_train.shape[1]
model = build_tabnet_model(input_shape=input_shape, feature_dim=6, num_steps=10)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
# Display the model summary
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 6)]               0         
                                                                 
 tab_net_encoder (TabNetEnco  (None, 60)               174       
 der)                                                            
                                                                 
 dense_3 (Dense)             (None, 1)                 61        
                                                                 
Total params: 235
Trainable params: 211
Non-trainable params: 24
_________________________________________________________________


In [27]:
# Define callbacks to monitor 'val_accuracy'
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True, verbose=1),  # Stop early if no improvement
    keras.callbacks.ModelCheckpoint('best_model_TabNet.h5', monitor='val_accuracy', save_best_only=True, verbose=1),  # Save the best model based on val_accuracy
    keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=5, min_lr=0.0001, verbose=1)  # Reduce learning rate on plateau
]

In [28]:
# Train the model with class weights, callbacks, and verbose=2 for more detailed output
history = model.fit(
    X_train, 
    y_train, 
    epochs=100, 
    batch_size=1, 
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict,  # Handle imbalanced classes
    callbacks=callbacks  # Add callbacks
)

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.38764, saving model to best_model_TabNet.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.38764 to 0.56742, saving model to best_model_TabNet.h5
Epoch 3/100
Epoch 3: val_accuracy did not improve from 0.56742
Epoch 4/100
Epoch 4: val_accuracy improved from 0.56742 to 0.57303, saving model to best_model_TabNet.h5
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.57303
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.57303
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.57303
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.57303
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.57303

Epoch 9: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.57303
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.57303
Epoch 12/100
Epoch 12: val_accuracy did not improve from 0.57303
Epoch 13/100
Epoch 13: val_accuracy d

# Feature Importane

In [29]:
# Get feature importance from the trained model
def get_tabnet_feature_importance(model, X):
    # Get the feature transformer model
    feature_transformer = model.get_layer('tab_net_encoder').feature_transformer_shared
    # Get the weights of the feature transformer
    feature_weights = feature_transformer.get_weights()[0]
    # Calculate the importance of each feature
    importance = np.abs(feature_weights).sum(axis=1)
    # Normalize the importance
    importance = importance / importance.max()
    return importance

In [30]:
# Get feature importance
feature_importance = get_tabnet_feature_importance(model, X_train)

In [31]:
# Get the feature names
feature_names = X_train.columns

In [32]:
# Create a DataFrame with feature names and importance
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
})

# Sort the values in descending order
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

# Display the feature importance
feature_importance_df

Unnamed: 0,feature,importance
4,Parents/Children Aboard,1.0
0,Pclass,0.732656
2,Age,0.704009
1,Sex,0.66867
5,Fare,0.513875
3,Siblings/Spouses Aboard,0.418226
