# MLP

## Setup

In [5]:
import os
import sys
import importlib

# Locations used throughout the notebook. SAVED_MODEL_FILE keeps its leading
# slash because later cells build the full path by plain string concatenation
# (SAVED_MODELS_DIR + SAVED_MODEL_FILE).
LIB_DIR = './lib'
SAVED_MODELS_DIR = '../backend/saved_models'
SAVED_MODEL_FILE = '/mlp.pkl'

# Make sure both directories exist and register each one on the import path,
# in the same order as before (library directory first).
for _import_dir in (LIB_DIR, SAVED_MODELS_DIR):
    os.makedirs(_import_dir, exist_ok=True)
    sys.path.append(_import_dir)

def install_package(package_name):
    try:
        importlib.import_module(package_name)
        print(f"{package_name} is already installed")
    except ImportError:
        print(f"Installing {package_name}")
        %pip install --quiet $package_name --progress-bar on

def download_lib(filename, url):
    """Download *url* into ``LIB_DIR/filename`` unless that file already exists.

    The payload is first written to a ``.part`` file next to the destination
    and only renamed into place once the transfer has finished. An interrupted
    download therefore never leaves a truncated file that later runs would
    mistake for a complete one.

    Args:
        filename: Name of the file to create inside ``LIB_DIR``.
        url: Location to fetch the file from.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails; the destination file is not created in that case.
    """
    lib_path = os.path.join(LIB_DIR, filename)
    if os.path.exists(lib_path):
        print(f"{filename} already downloaded")
        return

    print(f"Downloading {filename}")
    import urllib.request

    partial_path = lib_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace swaps the finished file into place in a single step.
        os.replace(partial_path, lib_path)
    finally:
        # Only true when the transfer or the rename failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {filename} to {lib_path}")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import joblib

# Fetch the course visualization helpers into LIB_DIR (skipped when the file is
# already there), then import them from that directory.
download_lib("visualization.py", "https://raw.githubusercontent.com/andreaaraldo/machine-learning-for-networks/master/course_library/visualization.py")

import visualization # type: ignore
# Reload BEFORE binding names from the module: importlib.reload() does not
# rebind objects that were previously pulled in with `from ... import ...`, so
# importing plot_conf_mat first would leave it pointing at the pre-reload
# function whenever this cell is re-run after the file has changed.
importlib.reload(visualization)
from visualization import plot_conf_mat # type: ignore

from lib.utils import generate_summary_plot, generate_bar_plot, get_shap_dict, get_metrics_dict, save_to_pickle
import shap

visualization.py already downloaded


In [6]:
# Choose where the input data lives depending on the runtime environment.
files_path = ''
running_in_colab = 'google.colab' in str(get_ipython())  # type: ignore

if not running_in_colab:
    print('Running locally')
    files_path = '../data/'
else:
    # Colab support is not wired up yet, so files_path stays empty here.
    print('TO DO: Set up Google Colab')
    # print('Running in Google Colab')
    # from google.colab import drive
    # mount_point = '/content/drive'
    # drive.mount(mount_point)
    # files_path = mount_point + '/MyDrive/Colab Notebooks/'

Running locally


In [7]:
# Load the raw dataset from the directory selected above.
dataset_csv = os.path.join(files_path, 'syntetic_sample.csv')
german_df = pd.read_csv(dataset_csv)

In [8]:
# Integer codes for the columns that are encoded by explicit lookup instead of
# one-hot encoding. Each inner dict maps a raw label to its integer code.
mappings = {
    'sex': {
        'female': 0,
        'male': 1
    },
    'present_employee_since': {
        'unemployed': 0, '<1y': 1, '1-4y': 2, '4-7y': 3, '>=7y': 4
    },
    'checking_account': {
        'no checking account': 0, '< 0 DM': 1, '0 <= ... < 200 DM': 2, '>= 200 DM': 3
    },
    'savings': {
        '0 or unk.': 0, '<100 DM': 1, '100-500 DM': 2, '500-1000 DM': 3, '>1000 DM': 4
    },
    'job': {
        'unemployed/unskilled non-resident': 0,
        'unskilled resident': 1,
        'qualified': 2,
        'highly qualified': 3
    }
}

# Work on a copy so the raw frame stays untouched.
german_preprocessed_df = german_df.copy()

for col, mapping in mappings.items():
    encoded = german_preprocessed_df[col].map(mapping)
    # Series.map turns every label missing from the dict (and every missing
    # value) into NaN, which would otherwise surface as an opaque integer-cast
    # error. Fail early with the column and the offending labels instead.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(
            german_preprocessed_df.loc[unmapped, col].astype(str).unique()
        )
        raise ValueError(
            f"Column {col!r} contains values with no integer mapping: {unknown}"
        )
    german_preprocessed_df[col] = encoded.astype(int)

In [9]:
# Dummy-encode the frame with integer indicator columns; drop_first removes
# the first level of every encoded column.
german_dummies_df = pd.get_dummies(
    german_preprocessed_df,
    drop_first=True,
    dtype=int,
)

In [10]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation with a fixed seed so the split is reproducible.
feature_frame = german_dummies_df.drop(columns=['risk'])
risk_target = german_dummies_df['risk']

x_train, x_test, y_train, y_test = train_test_split(
    feature_frame, risk_target, test_size=0.2, random_state=42
)

In [11]:
# Reuse a previously persisted pipeline when one exists; otherwise build and
# fit a new scaler + MLP pipeline on the training split.
pipeline = None
saved_model_path = SAVED_MODELS_DIR + SAVED_MODEL_FILE

if not os.path.exists(saved_model_path):
    mlp_classifier = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32),
        activation='relu',
        alpha=0.001,
        learning_rate_init=0.01,
        max_iter=200,
        early_stopping=True,
        random_state=42,
    )
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', mlp_classifier),
    ])
    pipeline.fit(x_train, y_train)
else:
    print("Loading saved model ...")
    # NOTE: joblib.load unpickles the file, so only load artifacts that this
    # project produced itself.
    pipeline = joblib.load(saved_model_path)["model"]



In [None]:
# Per-class probability estimates for the held-out split. Left as a bare
# expression so the notebook shows the resulting array as the cell output.
pipeline.predict_proba(x_test)

In [None]:
# Predicted labels for the held-out split; reused by the evaluation cells below.
y_pred = pipeline.predict(x_test)
# Bare expression so the notebook echoes the predictions as the cell output.
y_pred

In [None]:
# Human-readable report for the notebook output ...
report_text = classification_report(y_test, y_pred)
print(report_text)

# ... and the same metrics as a nested dict, persisted with the model later on.
class_report = classification_report(y_test, y_pred, output_dict=True)

In [None]:
# Raw confusion counts (rows: true labels, columns: predictions).
cm = confusion_matrix(y_test, y_pred)

# Divide each row by its total so every row holds per-true-class fractions.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype(float) / row_totals

# NOTE(review): assumes label 0 is 'good' and label 1 is 'bad' — confirm
# against the encoding of the 'risk' column.
class_names = np.array(['good', 'bad'])
plot_conf_mat(y_test, y_pred, class_names)

In [None]:
# Explainable AI
# Pull the fitted steps out of the pipeline so SHAP can explain the MLP
# directly on already-scaled inputs.
mlp_model = pipeline.named_steps['mlp']
scaler = pipeline.named_steps['scaler']

# A reproducible 10% sample of the test split, scaled like the training data.
# The same sample is used as SHAP background data and as the rows to explain.
xai_rows = x_test.sample(frac=0.1, random_state=42)
x_test_xai = scaler.transform(xai_rows)

explainer = shap.Explainer(
    mlp_model.predict_proba,
    x_test_xai,
    feature_names=x_train.columns,
)
shap_values = explainer(x_test_xai)

# Index 1 on the last axis selects the second predict_proba output column.
second_output_shap = shap_values[:, :, 1]
shap.summary_plot(second_output_shap, x_test_xai)

In [None]:
# Bar plot of the SHAP values for output index 1, showing at most 25 entries.
shap.plots.bar(shap_values[:,:,1], max_display=25)

In [None]:
# Build the summary plot for output index 1 through the project helper.
# NOTE(review): the `_b64` suffix suggests a base64-encoded image is returned —
# confirm in lib.utils.
summary_plot_b64 = generate_summary_plot(shap_values[:, :, 1], x_test_xai)

# Same SHAP slice handed to the bar-plot helper, with max_display=25.
shap_importance_b64 = generate_bar_plot(shap_values[:,:,1], max_display=25)


In [None]:
# Row of the explained sample to inspect; change it to look at another row.
sample_index = 0
# Waterfall plot of that row's SHAP values for output index 1.
shap.plots.waterfall(shap_values[sample_index,:,1])

In [None]:
# Bundle the explanation artifacts and the evaluation metrics, then persist
# them together with the pipeline at the path the loading cell checks.
saved_artifact_path = SAVED_MODELS_DIR + SAVED_MODEL_FILE

shap_data = get_shap_dict(summary_plot_b64, shap_importance_b64, x_test_xai)
metrics_data = get_metrics_dict(cm_normalized, class_report)

save_to_pickle(pipeline, metrics_data, shap_data, saved_artifact_path)