# Deep Learning for Business Applications course

## TOPIC 8: More Tasks for Deep Learning. TabNet for data analysis

### 1. Libraries and parameters

In [None]:
!pip install pytorch-tabnet

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, KFold

### 2. Titanic challenge

The famous [Titanic challenge](https://www.kaggle.com/competitions/titanic) will help us try out [TabNet](https://github.com/dreamquark-ai/tabnet). We will also use [this notebook](https://www.kaggle.com/code/masaonda/titanic-how-to-use-tabnet) as an example.

#### 2.1. Dataset

In [None]:
# Load the Titanic train / test tables; the first CSV row holds the column names.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        '~/__DATA/DLBA_F24/topic_08/train.csv',
        '~/__DATA/DLBA_F24/topic_08/test.csv',
    )
)

In [None]:
# data preprocessing and basic feature engineering

# Numeric gaps are imputed with statistics of the training set only, computed
# once up front so that train and test are filled with exactly the same values.
age_mean = train.Age.mean()
fare_mean = train.Fare.mean()
train.fillna({'Age': age_mean}, inplace=True)
test.fillna({'Age': age_mean, 'Fare': fare_mean}, inplace=True)

train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

train.fillna({'Embarked': 'S'}, inplace=True)
test.fillna({'Embarked': 'S'}, inplace=True)

train.replace({'Sex': {'male': 0, 'female': 1}}, inplace=True)
test.replace({'Sex': {'male': 0, 'female': 1}}, inplace=True)

# NOTE(review): no 'Title' column is created anywhere in this notebook.
# DataFrame.replace skips dict keys that are not columns, so these two calls
# only take effect if the CSV files already contain a 'Title' column.
train.replace({'Title': {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3}}, inplace=True)
test.replace({'Title': {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3}}, inplace=True)

# one-hot encoding
# Encode train and test together so both receive the same dummy columns, then
# split the result back using the actual number of training rows rather than a
# hard-coded row count.
n_train_rows = len(train)
embarked = pd.concat([train['Embarked'], test['Embarked']])
embarked_ohe = pd.get_dummies(embarked)
embarked_ohe_train = embarked_ohe.iloc[:n_train_rows]
embarked_ohe_test = embarked_ohe.iloc[n_train_rows:]
train = pd.concat([train, embarked_ohe_train], axis=1)
test = pd.concat([test, embarked_ohe_test], axis=1)

In [None]:
# drop useless columns
# 'Embarked' has already been one-hot encoded; 'Name' and 'Ticket' are not
# used as model features.
useless_columns = ['Embarked', 'Name', 'Ticket']
train.drop(columns=useless_columns, inplace=True)
test.drop(columns=useless_columns, inplace=True)

#### 2.2. Train-test split

In [None]:
# Features are every column from position 2 onwards; the label is column 1.
# NOTE(review): assumes the first two columns are the passenger id and the
# 'Survived' label (Kaggle Titanic layout) — confirm against the CSV.
X = train.iloc[:, 2:].values.astype(float)
y = train.iloc[:, 1].values.astype(float)

# Fix the random state so the hold-out split (and every metric computed on it)
# is the same on each Restart & Run All; 2024 matches the model seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=.3,
    random_state=2024
)

#### 2.3. Training TabNet

In [None]:
tabnet_params = dict(
    n_d=16,
    n_a=16,
    n_steps=5,
    gamma=1.3,
    lambda_sparse=0,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    mask_type='entmax',
    scheduler_params=dict(
        mode='min',
        patience=5,
        min_lr=1e-5,
        factor=.9
    ),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    seed=2024,
    verbose=10
)

In [None]:
# Training budget for the Titanic model.
EPOCH = 100
BATCH_SIZE = 32

# Build the classifier from the parameter dictionary and train it, monitoring
# the hold-out split passed as eval_set (patience is the early-stopping
# patience forwarded to TabNet).
model = TabNetClassifier(**tabnet_params)
model.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_valid, y_valid)],
    patience=10,
    batch_size=BATCH_SIZE,
    max_epochs=EPOCH
)

#### 2.4. Feature importances analysis

In [None]:
# Pair every model input column (same slice that produced X) with the
# importance TabNet assigned to it, then plot from most to least important.
importance = pd.DataFrame({
    'feature': train.iloc[:, 2:].columns,
    'importance': model.feature_importances_,
})
importance_sorted = importance.sort_values(by='importance', ascending=False)

sns.barplot(data=importance_sorted, x='importance', y='feature')

### 3. Covertype task

[Here](https://archive.ics.uci.edu/dataset/31/covertype) you can find information about the Covertype task. Our example is based on [this code](https://www.kaggle.com/code/mrisdal/pytorch-tabnet-example).

#### 3.1. Dataset

In [None]:
# Name of the label column.
target = 'Covertype'

# Indicator columns: Wilderness_Area1..4 followed by Soil_Type1..40.
bool_columns = (
    [f'Wilderness_Area{idx}' for idx in range(1, 5)]
    + [f'Soil_Type{idx}' for idx in range(1, 41)]
)

# Integer-valued measurement columns.
int_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# Full column order used when reading the file; note the target comes last.
feature_columns = [*int_columns, *bool_columns, target]

In [None]:
# The data file has no header row, so column names are supplied explicitly.
file_path = '~/__DATA/DLBA_F24/topic_08/covtype.data.gz'
train = pd.read_csv(file_path, names=feature_columns, header=None)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

# Split row positions rather than the data itself: first hold out 20% of all
# rows for testing, then carve the validation part out of the remainder.
n_total = train.shape[0]
train_val_indices, test_indices = train_test_split(
    range(n_total),
    random_state=2024,
    test_size=.2
)
train_indices, valid_indices = train_test_split(
    train_val_indices,
    random_state=2024,
    test_size=.2 / .6
)
display(train.head())

#### 3.2. Features

In [None]:
# Names of label-encoded columns and the number of distinct classes in each.
categorical_columns = []
categorical_dims = {}

# Label-encode every object-typed column; missing values get their own
# 'VV_likely' category before encoding.
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna('VV_likely')
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Fill gaps in each float column with that column's own mean, computed on the
# training rows only. The fill is applied per column: calling fillna on the
# whole frame would write one column's mean into the NaNs of every column.
for col in train.columns[train.dtypes == 'float64']:
    train[col] = train[col].fillna(train.loc[train_indices, col].mean())

In [None]:
# Model inputs: every column except the target. Compare with `!=`, because
# `col not in target` would be a substring test against the target *string*
# and would silently drop any column whose name is contained in it.
features = [col for col in train.columns if col != target]

# Positions (within `features`) and cardinalities of the categorical columns,
# in matching order, as expected by TabNet's cat_idxs / cat_dims arguments.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

#### 3.3. Train-test split

In [None]:
# Convert the frame to arrays once instead of once per subset, then select the
# rows of each subset by the index lists produced during the split.
feature_matrix = train[features].values
target_vector = train[target].values

X_train = feature_matrix[train_indices]
y_train = target_vector[train_indices]

X_valid = feature_matrix[valid_indices]
y_valid = target_vector[valid_indices]

X_test = feature_matrix[test_indices]
y_test = target_vector[test_indices]

# The subsets above are independent copies; release the full-size arrays.
del feature_matrix, target_vector

#### 3.4. Training TabNet

In [None]:
# Covertype classifier: only the optimizer and the description of the
# categorical columns are set explicitly.
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    cat_dims=cat_dims,
    cat_idxs=cat_idxs
)

In [None]:
MAX_EPOCHS = 5  # 100 epochs recommended

# Train while tracking metrics on both the training and the validation split;
# the names given in eval_name label the two eval sets in the history.
clf.fit(
    X_train, y_train,
    eval_name=['train', 'valid'],
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    batch_size=16384,
    patience=10,
    max_epochs=MAX_EPOCHS
)

In [None]:
# Loss history stored on the fitted classifier, drawn as a labelled figure so
# the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(clf.history['loss'])
ax.set(title='Training loss', xlabel='epoch', ylabel='loss')
plt.show()

In [None]:
# Accuracy history of the eval set named 'valid', drawn as a labelled figure
# so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(clf.history['valid_accuracy'])
ax.set(title='Validation accuracy', xlabel='epoch', ylabel='accuracy')
plt.show()

In [None]:
# Score the trained classifier on the held-out test rows.
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'final test accuracy: {test_acc}')

#### 3.5. Feature importances analysis

In [None]:
# Sanity check: number of importance values produced by the model.
clf.feature_importances_.shape[0]

In [None]:
# Sanity check: number of columns in the frame excluding the last one.
train.shape[1] - 1

In [None]:
# Label the importances with `features` — the exact column list used to build
# X_train — instead of relying on the target being the last column of `train`.
importance = pd.DataFrame({
    'feature': features,
    'importance': clf.feature_importances_,
})

plt.figure(figsize=(16, 24))
sns.barplot(
    x='importance',
    y='feature',
    data=importance.sort_values(by='importance', ascending=False)
)
plt.title('TabNet feature importances (Covertype)')
plt.show()

In [None]:
# Ask the model to explain its test-set predictions and show, side by side,
# the first 50 rows of the masks stored under keys 0, 1 and 2.
explain_matrix, masks = clf.explain(X_test)
fig, axs = plt.subplots(1, 3, figsize=(20, 20))

for i, mask_ax in enumerate(axs):
    mask_ax.imshow(masks[i][:50])
    mask_ax.set_title(f'mask {i}')