*This is just a ***lecture*** notebook - you do not have to hand this in!*

# Lecture 03 - 17.05.2022
Playing around with the parameters of a support vector classifier.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

from ipywidgets import interactive, widgets
from IPython.display import display
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings and pandas
# chained-assignment warnings that later cells might raise.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
sns.set_style('whitegrid')
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Emit inline figures as PNG images.
set_matplotlib_formats('png')

# Seed passed as `random_state` to the train/validation/test splits below.
RANDOM_STATE = 42

## Pre-Processing

In [None]:
# read data
# Loads the table from a CSV file expected in the notebook's working directory.
# NOTE(review): read_csv uses its default comma separator here; if the file is a
# semicolon-separated export, `sep=';'` is required - confirm against the
# provided file.
data = pd.read_csv('student-mat.csv')

# binarize grades
def grade_binary_transform(g):
    """Turn a numeric grade into a binary label.

    Parameters
    ----------
    g : number
        The grade to binarize.

    Returns
    -------
    int
        1 if ``g`` is strictly greater than 9, otherwise 0.
    """
    return 1 if g > 9 else 0

# Build the modelling table: add the binary target column 'grades' derived from
# the final grade G3, then remove G3 itself.
# Series.map applies the threshold to the one column that is needed instead of
# materialising a row object per record (DataFrame.apply with axis=1), and the
# non-mutating drop avoids modifying a frame in place.
data_c = data.copy()
data_c['grades'] = data_c['G3'].map(grade_binary_transform)
data_c = data_c.drop(columns='G3')

# split data
target = "grades"

# Step 1: hold out 25 % of all rows as the test set, stratified on the target.
train_val, test = train_test_split(
    data_c,
    stratify=data_c[target],
    test_size=0.25,
    random_state=RANDOM_STATE,
)

# Step 2: split the remainder again (25 % of it becomes the validation set),
# stratified on the target as well.
train, val = train_test_split(
    train_val,
    stratify=train_val[target],
    test_size=0.25,
    random_state=RANDOM_STATE,
)

# encode data
# Two-category columns are replaced by a single 0/1 column each; the
# multi-category nominal columns are dropped rather than encoded.
binary_vars = ["school", "sex", "address", "famsize", "Pstatus", "schoolsup",
               "famsup", "paid", "activities", "nursery", "higher",
               "internet", "romantic"]
nominal_vars = ["Mjob", "Fjob", "reason", "guardian"]

# The encoder keeps its default sparse output and the result is densified with
# .toarray() below. The `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed afterwards, so not passing either spelling
# keeps this cell working across versions.
class_ohe = OneHotEncoder(drop="if_binary")
# Fit on the training split only; every other split is transformed with the
# categories learned here.
class_ohe.fit(train[binary_vars])


def _encode_binary_drop_nominal(frame):
    """Return a copy of ``frame`` with binary columns encoded and nominal ones removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        A split that still contains the raw ``binary_vars`` and
        ``nominal_vars`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame without ``nominal_vars`` whose ``binary_vars`` columns hold
        the output of the fitted ``class_ohe``.
    """
    # drop() returns a new frame, so the assignment below does not write into
    # the object handed out by train_test_split.
    encoded = frame.drop(columns=nominal_vars)
    encoded[binary_vars] = class_ohe.transform(frame[binary_vars]).toarray()
    return encoded


train = _encode_binary_drop_nominal(train)
test = _encode_binary_drop_nominal(test)
val = _encode_binary_drop_nominal(val)
train_val = _encode_binary_drop_nominal(train_val)

# create X and Y
# For every split: X is the frame without the target column, Y is the target column.
target = "grades"
X_train_val, Y_train_val = train_val.drop(columns=target), train_val[target]
X_train, Y_train = train.drop(columns=target), train[target]
X_val, Y_val = val.drop(columns=target), val[target]
X_test, Y_test = test.drop(columns=target), test[target]

# scale data
# `sc` is fitted on the training split and applied to train and validation.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_val_scaled = sc.transform(X_val)

# `sc2` is fitted on train+validation and applied to that set and to the test set.
sc2 = StandardScaler().fit(X_train_val)
X_train_val_scaled = sc2.transform(X_train_val)
X_test_scaled = sc2.transform(X_test)

### Support Vector Classification Widget
Running the following cell will start an IPyWidget.

At the top you can choose the settings for a variety of parameters of the Support Vector Classifier. If you hit the `Run` button you will see the accuracies for training, validation and test. You will also get two plots that show the samples (projected onto their first two principal components) together with the decision boundary of the model. The left plot shows the training samples, the right plot shows the validation samples.

Be aware that the calculation and the rendering can take a few seconds!

In [None]:
# --- input controls ---------------------------------------------------------
# Kernel choice for the classifier.
svc_kernel_radiobutton = widgets.RadioButtons(
    options=['linear', 'rbf', 'poly', 'sigmoid'], description='SVC_kernel:', disabled=False
)

# Numeric inputs for the SVC parameters C and gamma, each limited to [min, max].
svc_c_boundedfloattext = widgets.BoundedFloatText(
    value=1, min=0.01, max=100.0, step=0.01, description='SVC_C', disabled=False
)
svc_gamma_boundedfloattext = widgets.BoundedFloatText(
    value=1, min=0.0001, max=10.0, step=0.001, description='SVC_gamma', disabled=False
)

# Switch between the raw and the standardized feature matrices.
scaling_checkbox = widgets.Checkbox(
    value=False, description='Scaling', disabled=False, indent=False
)

# Button that triggers a run, and the area that receives its output.
run_button = widgets.Button(
    description='Run', disabled=False, button_style='', icon='check'
)
out = widgets.Output(layout={'border': '1px solid black'})

# --- layout -----------------------------------------------------------------
# Kernel selector on the left, numeric inputs and checkbox stacked on the
# right; the button and the output area follow underneath.
numeric_controls = widgets.VBox(
    [svc_c_boundedfloattext, svc_gamma_boundedfloattext, scaling_checkbox]
)
controls_row = widgets.HBox([svc_kernel_radiobutton, numeric_controls])
box = widgets.VBox([controls_row, run_button, out])
display(box)

def _plot_decision_surface(ax, points, labels, XX, YY, Z, title):
    """Draw 2-D samples on top of the classifier's decision regions.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    points : numpy.ndarray
        Array with two columns holding the projected samples.
    labels : array-like
        Class label per sample, used to colour the points.
    XX, YY : numpy.ndarray
        Grid coordinates on which the decision function was evaluated.
    Z : numpy.ndarray
        Decision function values, same shape as ``XX``.
    title : str
        Title of the subplot.
    """
    # zorder keeps the samples above the shaded regions drawn afterwards.
    ax.scatter(points[:, 0], points[:, 1], c=labels, zorder=10, cmap=plt.cm.Paired,
               edgecolor='k', s=20)
    ax.grid(False)
    # Shade the plane by the sign of the decision function (predicted class).
    ax.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired, shading='auto')
    # Solid line: decision boundary (level 0); dashed lines: levels -0.5 and +0.5.
    ax.contour(XX, YY, Z, colors=['k', 'k', 'k'],
               linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
    ax.set_title(title)
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")


def on_button_clicked(b):
    """Fit SVCs with the current widget settings and report/plot the results.

    Reads kernel, C, gamma and the scaling flag from the widgets, projects the
    (optionally scaled) features onto two principal components, fits one model
    on the training split (scored on train and validation) and one on
    train+validation (scored on test), prints the accuracies and settings, and
    plots the first model's decision surface for the train and validation
    samples. Everything is written into the ``out`` widget.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button; not used.
    """
    with out:
        out.clear_output(True)

        # Read every setting once so the models and the printed summary agree.
        kernel = svc_kernel_radiobutton.value
        C = svc_c_boundedfloattext.value
        gamma = svc_gamma_boundedfloattext.value
        use_scaling = scaling_checkbox.value

        if use_scaling:
            feats_train, feats_val = X_train_scaled, X_val_scaled
            feats_train_val, feats_test = X_train_val_scaled, X_test_scaled
        else:
            feats_train, feats_val = X_train, X_val
            feats_train_val, feats_test = X_train_val, X_test

        # apply pca
        # Each PCA is fitted only on the data its model is trained on.
        pca = PCA(n_components=2)
        pca2 = PCA(n_components=2)
        pca_train = pca.fit_transform(feats_train)
        pca_val = pca.transform(feats_val)
        pca_train_val = pca2.fit_transform(feats_train_val)
        pca_test = pca2.transform(feats_test)

        # build and train model
        model_svc = SVC(kernel=kernel, C=C, gamma=gamma)
        model2_svc = SVC(kernel=kernel, C=C, gamma=gamma)
        model_svc.fit(pca_train, Y_train)
        model2_svc.fit(pca_train_val, Y_train_val)
        print(f"Train Accuracy: {model_svc.score(pca_train, Y_train) :.4f}")
        print(f"Validation Accuracy: {model_svc.score(pca_val, Y_val) :.4f}\n")
        print(f"\nFinal Test Accuracy: {model2_svc.score(pca_test, Y_test) :.4f}\n")

        print(f"Kernel: {kernel}")
        print(f"C: {C}")
        print(f"gamma: {gamma}")
        print(f"Scaling: {use_scaling}\n")

        # build grid for plotting
        # The grid spans the train AND validation projections (plus a small
        # margin) so that every plotted sample lies on the shaded surface.
        shown = np.vstack([pca_train, pca_val])
        x_min, y_min = shown.min(axis=0)
        x_max, y_max = shown.max(axis=0)
        x_pad = 0.05 * (x_max - x_min)
        y_pad = 0.05 * (y_max - y_min)
        XX, YY = np.mgrid[x_min - x_pad:x_max + x_pad:200j,
                          y_min - y_pad:y_max + y_pad:200j]
        Z = model_svc.decision_function(np.c_[XX.ravel(), YY.ravel()])
        Z = Z.reshape(XX.shape)

        # plot
        fig, ax = plt.subplots(1, 2, figsize=(20, 10))
        _plot_decision_surface(ax[0], pca_train, Y_train.values, XX, YY, Z, "Train data")
        _plot_decision_surface(ax[1], pca_val, Y_val.values, XX, YY, Z, "Validation data")

        plt.show()
        
# Register the handler on the Run button; remove=False means "add" rather than "unregister".
run_button.on_click(on_button_clicked, remove=False)
