# Question A4

In this section, we will understand the utility of such a neural network in real world scenarios.

#### Please use the real record data named ‘record.wav’  as a test sample. Preprocess the data using the provided preprocessing script (data_preprocess.ipynb) and prepare the dataset.
Do a model prediction on the sample test dataset and obtain the predicted label using a threshold of 0.5. The model used is the optimized pretrained model using the selected optimal batch size and optimal number of neurons.
Find the most important features on the model prediction for the test sample using SHAP. Plot the local feature importance with a force plot and explain your observations.  (Refer to the documentation and these three useful references:
https://christophm.github.io/interpretable-ml-book/shap.html#examples-5,
https://towardsdatascience.com/deep-learning-model-interpretation-using-shap-a21786e91d16,  
https://medium.com/mlearning-ai/shap-force-plots-for-classification-d30be430e195)



1. Firstly, we import relevant libraries.

In [1]:
import tqdm
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from scipy.io import wavfile as wav

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from common_utils import set_seed

# setting seed
set_seed()

To reduce repeated code, place your
- network (MLP defined in QA1)
- torch datasets (CustomDataset defined in QA1)
- loss function (loss_fn defined in QA1)

in a separate file called common_utils.py

Import them into this file. You will not be penalised again here for any errors in QA1, as the QA1 code will not be re-marked.

The following code cell will not be marked.


In [None]:
# YOUR CODE HERE
from common_utils import MLP, CustomDataset, loss_fn, preprocess_dataset, split_dataset, EarlyStopper

optimal_batch_size = 256
optimal_neurons = 128

def train(model, X_train_scaled, y_train2, X_val_scaled, y_val2, batch_size=optimal_batch_size):
    """Fit ``model`` with Adam and binary cross-entropy, with early stopping.

    Runs for at most 100 epochs. After every epoch the summed validation loss
    is handed to an ``EarlyStopper``; training ends as soon as it signals a
    stop.

    Args:
        model: Network to optimise in place. Its output is squeezed on
            dim 1, so it is assumed to return one probability per sample
            with shape ``[batch, 1]`` -- TODO confirm against ``MLP``.
        X_train_scaled: Training features, wrapped in ``CustomDataset``.
        y_train2: Training labels (assumed to be 0/1 -- verify against caller).
        X_val_scaled: Validation features, wrapped in ``CustomDataset``.
        y_val2: Validation labels.
        batch_size: Mini-batch size used by both data loaders.

    Returns:
        The same ``model`` object. It is left in evaluation mode because
        the last thing each epoch does is the validation pass.
    """
    # YOUR CODE HERE
    # Per-epoch history. Only the model is returned, so these lists are
    # useful purely for inspection while debugging.
    train_accuracies, train_losses, test_accuracies, test_losses, times = [], [], [], [], []
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Named ``criterion`` so the ``loss_fn`` imported from common_utils is
    # not shadowed inside this function.
    criterion = nn.BCELoss()
    early_stopper = EarlyStopper()
    training_data = CustomDataset(X_train_scaled, y_train2)
    training_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    validation_data = CustomDataset(X_val_scaled, y_val2)
    # Shuffling is kept for validation so the sequence of random draws, and
    # therefore the training batch order, matches earlier runs.
    validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=True)
    no_epochs = 100
    for epoch in range(no_epochs):
        train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0

        # Training: enable training-time behaviour (e.g. dropout) explicitly,
        # because the validation pass below switches the model to eval mode.
        model.train()
        start_time = time.time()
        for x, y in training_dataloader:
            y_pred = model(x).squeeze(dim=1)  # [batch, 1] -> [batch]
            loss = criterion(y_pred.float(), y.float())
            train_loss += loss.item()
            # Threshold at 0.5 and count matches with the ground truth in a
            # single vectorised comparison instead of a Python loop.
            train_acc += ((y_pred > 0.5).to(y.dtype) == y).sum().item()
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        times.append(time.time() - start_time)
        train_losses.append(train_loss / len(training_dataloader))
        train_accuracies.append(train_acc / len(training_dataloader.dataset))

        # Validation: eval mode makes the forward pass deterministic, and
        # no_grad avoids building an autograd graph.
        model.eval()
        with torch.no_grad():
            for x, y in validation_dataloader:
                y_pred = model(x).squeeze(dim=1)  # [batch, 1] -> [batch]
                loss = criterion(y_pred.float(), y.float())
                test_loss += loss.item()
                test_acc += ((y_pred > 0.5).to(y.dtype) == y).sum().item()
        test_losses.append(test_loss / len(validation_dataloader))
        test_accuracies.append(test_acc / len(validation_dataloader.dataset))

        # The early stopper receives the summed (not averaged) validation loss.
        if early_stopper.early_stop(test_loss):
            print("Early stopping at epoch number: ", epoch+1)
            break
    return model



In [None]:
# Load the feature table; the label is the second-to-last
# underscore-separated token of each filename.
train_df = pd.read_csv('simplified.csv')
train_df['label'] = train_df['filename'].str.split('_').str.get(-2)

# 70/30 split with a fixed seed (argument meanings assumed from
# split_dataset in common_utils -- verify there).
df_train, y_train, df_val, y_val = split_dataset(train_df, ['filename'], 0.3, 42)

# Remove the target column, then scale the training and validation inputs.
X_train = df_train.drop("label", axis=1)
X_val = df_val.drop("label", axis=1)
X_train_scaled, X_val_scaled = preprocess_dataset(X_train, X_val)

# Build the network with the chosen hidden width and train it.
model = train(
    MLP(no_hidden=optimal_neurons, no_features=77, no_labels=1),
    X_train_scaled,
    y_train,
    X_val_scaled,
    y_val,
)

2. Install and import shap

In [None]:
# YOUR CODE HERE
!pip install shap

In [None]:
import shap

3. Read the csv data preprocessed from 'record.wav', using variable name 'df', and fill the size of 'df' in 'size_row' and 'size_column'.

In [None]:
# Template placeholders; overwritten below.
df = 0
size_row = 0
size_column = 0
# YOUR CODE HERE
# Read the preprocessed recording and record the table's dimensions.
df = pd.read_csv('new_record.csv')
size_row, size_column = df.shape

 4.  Preprocess to obtain the test data, save the test data as numpy array.

In [None]:

def preprocess(X_train, df):
    """Standardise the test sample with statistics fitted on ``X_train``.

    The 'filename' column is dropped from ``df`` first (as in Q1), since
    the model was trained without it. A fresh ``StandardScaler`` is fitted
    on ``X_train`` and applied to the remaining columns.

    Returns:
        The scaled test features as produced by ``StandardScaler.transform``.
    """
    # YOUR CODE HERE
    features_only = df.drop(columns=['filename'])
    # fit() returns the scaler itself, so fitting and transforming can be chained.
    fitted_scaler = preprocessing.StandardScaler().fit(X_train)
    return fitted_scaler.transform(features_only)

# Scale the recorded sample, using the training features as the scaler's reference.
X_test_scaled_eg = preprocess(X_train, df)


5. Do a model prediction on the sample test dataset and obtain the predicted label using a threshold of 0.5. The model used is the optimized pretrained model using the selected optimal batch size and optimal number of neurons. Note: Please define the variable of your final predicted label as 'pred_label'.

In [None]:
# YOUR CODE HERE
X_test_scaled_tensor = torch.tensor(X_test_scaled_eg, dtype=torch.float32)

# Inference only: eval mode disables training-time layers such as dropout so
# the prediction is deterministic, and no_grad skips autograd bookkeeping.
model.eval()
with torch.no_grad():
    pred = model(X_test_scaled_tensor)
print(pred)

threshold = 0.5
# item() requires exactly one value, i.e. a single test sample.
pred_label = 1 if pred.item() > threshold else 0
print("The prediction (pred_label) is: ", pred_label)

6. Find the most important features for the model prediction on your test sample using SHAP. Create an instance of DeepSHAP, which is called DeepExplainer, using the training dataset: https://shap-lrjball.readthedocs.io/en/latest/generated/shap.DeepExplainer.html.

Plot the local feature importance with a force plot and explain your observations.  (Refer to the documentation and these three useful references:
https://christophm.github.io/interpretable-ml-book/shap.html#examples-5,
https://towardsdatascience.com/deep-learning-model-interpretation-using-shap-a21786e91d16,  
https://medium.com/mlearning-ai/shap-force-plots-for-classification-d30be430e195)


In [None]:
# Load SHAP's JavaScript support so interactive force plots can render in the notebook.
shap.initjs()

In [None]:
'''
Fit the explainer on a subset of the data (you can try all but then gets slower)
Return approximate SHAP values for the model applied to the data given by X.
Plot the local feature importance with a force plot and explain your observations.
'''
# YOUR CODE HERE
# Background set for DeepExplainer: the features of one shuffled mini-batch
# of scaled training data (optimal_batch_size rows), capped at 1000 rows.
training_data = CustomDataset(X_train_scaled, y_train)
training_dataloader = DataLoader(
    training_data,
    batch_size=optimal_batch_size,
    shuffle=True,
)
feature_batch, _ = next(iter(training_dataloader))
background = feature_batch[:1000]

# The preprocessed recording whose prediction is being explained.
test_image = X_test_scaled_tensor

# The explainer integrates over the background samples.
e = shap.DeepExplainer(model, background)

# Approximate SHAP values for the test prediction.
shap_values = e.shap_values(test_image)


In [None]:
# Label the plot with the feature columns (everything except 'filename').
feature_names = list(df.drop(['filename'], axis=1).columns)
shap.force_plot(
    e.expected_value,
    shap_values,
    features=X_test_scaled_eg,
    feature_names=feature_names,
)


In [None]:
# Explanation
"""
From the force plot above, we see that the label the model assigns to this test sample depends largely on the features spec_bw_var, cent_var, mfcc2_var and contrast_mean. Specifically, spec_bw_var and contrast_mean are the two main features pushing the prediction towards class "1", while cent_var and mfcc2_var are the two main features that pull the prediction away from class "1", i.e. push it towards class "0".
This particular recording was ultimately classified as "1" because the features shown in red push the prediction further to the right than the opposing features pull it back.
"""