# Objective: compare LDA and QDA (based on ex5)
How do the **linearity of the Bayesian decision boundary** and the **sample size (n)** affect their respective performances on the **training** vs **test** sets?

In [127]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from sklearn.datasets import make_blobs, make_gaussian_quantiles
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from ipywidgets import interactive
from slm import viz, model

# Compare single examples

In [57]:
def gen_nonlinear_classification(nonlinearity=1, n_sample=100, random_state=None):
    """Simulate a balanced two-class problem with a tunable quadratic boundary.

    Two standard-normal features are drawn, a score is computed from a random
    second-order polynomial in those features, and points whose score lies
    strictly above the median score are labelled 1 (all others 0).

    Parameters
    ----------
    nonlinearity : float
        Multiplier applied to the two squared terms of the score. With 0 the
        score is linear in the features; larger values give the squared terms
        more weight.
    n_sample : int
        Number of points to draw.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) draws from the global
        ``np.random`` stream exactly as before; an int seeds a private
        ``RandomState`` so the same data set can be regenerated; an existing
        ``RandomState`` is used as-is.

    Returns
    -------
    X : ndarray of shape (n_sample, 2)
        The sampled features.
    y : ndarray of shape (n_sample,)
        Integer class labels in {0, 1}.
    """
    if random_state is None:
        # The module itself exposes the global stream's normal()/uniform().
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw order (features first, then coefficients) is kept so that the
    # default path consumes the global stream exactly as it always has.
    X = rng.normal(size=(n_sample, 2))
    beta = rng.uniform(low=-1, high=1, size=5)

    quadratic_part = beta[4] * X[:, 0] ** 2 + beta[3] * X[:, 1] ** 2
    linear_part = beta[2] * X[:, 0] + beta[1] * X[:, 1] + beta[0]
    score = nonlinearity * quadratic_part + linear_part

    # Thresholding at the median keeps the two classes (near-)balanced.
    y = (score > np.median(score)).astype(int)

    return X, y

In [304]:
def plot_da(nonlinearity=0, n_sample=100):
    """Fit LDA and QDA on one simulated data set and draw their decision regions.

    A fresh data set is drawn with ``gen_nonlinear_classification`` and split
    75/25 into training and test parts. Each classifier is fitted on the
    training part and gets its own panel showing:

    * the class it predicts over a mesh built from the sampled points,
    * every sampled point (train and test) coloured by its true label,
    * a text label with ``1 - error`` on the training and test parts.

    Parameters
    ----------
    nonlinearity : float
        Forwarded to ``gen_nonlinear_classification``.
    n_sample : int
        Forwarded to ``gen_nonlinear_classification``.
    """
    X, y = gen_nonlinear_classification(nonlinearity, n_sample)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # NOTE(review): assumes viz.make_mesh returns an array whose last axis
    # holds the two coordinates of each mesh node -- confirm in slm.viz.
    x_mesh = viz.make_mesh(X, step_size=0.1)
    X0 = x_mesh[0, :, 0]
    X1 = x_mesh[:, 1, 1]

    fig, ax = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9; the second argument resamples the
    # colormap to two entries, one per class.
    cmap = plt.get_cmap('RdBu', 2)

    classifiers = [LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()]
    for i, clf in enumerate(classifiers):

        clf.fit(X_train, y_train)
        # presumably model.error_rate is the misclassification fraction -- verify
        train_error = model.error_rate(y_train, clf.predict(X_train))
        test_error = model.error_rate(y_test, clf.predict(X_test))

        # Predict at every mesh node, then fold back to the mesh's 2-D layout.
        decision_boundary = clf.predict(x_mesh.reshape(-1, 2)).reshape(x_mesh.shape[:2])

        ax[i].pcolor(X0, X1, decision_boundary, cmap=cmap, alpha=0.2)
        ax[i].scatter(X[:, 0], X[:, 1], c=y, s=10, cmap=cmap)
        ax[i].text(X0[0] + .2, X1[-1] - .1,
                   'train: {:.3}; test: {:.3}'.format(1 - train_error, 1 - test_error),
                   verticalalignment='top', fontsize=15)

    # Axes are shared, so limiting the first panel limits both.
    ax[0].set_xlim(X0[0], X0[-1])
    ax[0].set_ylim(X1[0], X1[-1])

    ax[0].set_title('LDA')
    ax[1].set_title('QDA')

    plt.show()
    
# Interactive view of plot_da: a float slider for nonlinearity on [0, 1] and an
# integer slider for n_sample running from 20 to 1000 in steps of 10.
w = interactive(
    plot_da,
    nonlinearity=(0., 1.),
    n_sample=(20, 1000, 10),
)
display(w)

interactive(children=(FloatSlider(value=0.0, description='nonlinearity', max=1.0), IntSlider(value=100, descri…

# Multiple runs to see dependence

## Preprocess random data

In [172]:
%%time

NONLIN_RANGE = np.linspace(0, 1, 10)
N_RANGE = np.logspace(4, 8, 10, base=2, dtype=int)
TEST_SIZE  = 100

nonlin_n_mat = np.zeros((4, TEST_SIZE, len(NONLIN_RANGE), len(N_RANGE)))

for n, n_sample in enumerate(N_RANGE):

    for m, nonlinearity in enumerate(NONLIN_RANGE):

        for j in range(TEST_SIZE):

            X, y = gen_nonlinear_classification(nonlinearity, n_sample)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

            for i, clf in enumerate([LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()]):

                clf.fit(X_train, y_train)
                train_error = model.error_rate(y_train, clf.predict(X_train))
                test_error = model.error_rate(y_test, clf.predict(X_test))

                nonlin_n_mat[i*2,j,m,n] = train_error
                nonlin_n_mat[i*2+1,j,m,n] = test_error

CPU times: user 2min 14s, sys: 2.78 s, total: 2min 17s
Wall time: 23.3 s


In [305]:
def plot_nonlin_errors(slice=6):
    """Plot accuracy against nonlinearity for one simulated sample size.

    Reads the module-level ``nonlin_n_mat`` at index ``slice`` of its last
    axis and draws, for each of its four series (LDA train, LDA test,
    QDA train, QDA test), the mean accuracy (``1 - mean error``) over the
    repetitions with a +/- one standard deviation band. The train series
    additionally get an outlined band.

    Parameters
    ----------
    slice : int
        Index into ``N_RANGE`` / the last axis of ``nonlin_n_mat``.
    """
    # One (colour, line style) pair per series, in the series order above.
    series_styles = [
        ('Magenta', '-'),
        ('Magenta', '--'),
        ('LimeGreen', '-'),
        ('LimeGreen', '--'),
    ]

    legend_elements = [
        Patch(facecolor='LimeGreen', edgecolor='LimeGreen', label='QDA'),
        Patch(facecolor='Magenta', edgecolor='Magenta', label='LDA'),
        Line2D([0], [0], color='k', lw=1.5, ls='-', label='train'),
        Line2D([0], [0], color='k', lw=1.5, ls='--', label='test'),
    ]

    fig, ax = plt.subplots()

    errors_at_n = nonlin_n_mat[:, :, :, slice]
    for row, (errors, (colour, line_style)) in enumerate(zip(errors_at_n, series_styles)):

        # Averaging over axis 0 collapses the repetitions.
        accuracy = 1 - errors.mean(axis=0)
        spread = errors.std(axis=0)
        lower = accuracy - spread
        upper = accuracy + spread

        ax.plot(NONLIN_RANGE, accuracy, c=colour, linestyle=line_style)
        ax.fill_between(NONLIN_RANGE, lower, upper, alpha=0.1, color=colour)
        # Even rows are the train series: give their band a visible outline.
        if row % 2 == 0:
            ax.fill_between(NONLIN_RANGE, lower, upper, alpha=0.5,
                            facecolor='none', edgecolor=colour, lw=1)

    ax.legend(handles=legend_elements, loc='lower left', ncol=2)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Nonlinearity')
    ax.set_title('Sample size (n): {}'.format(N_RANGE[slice]))
    ax.set_ylim(0.35, 1.05)
    # Reference line at perfect accuracy.
    ax.axhline(1, c='k', lw=1)
    plt.show()

def plot_n_errors(slice=5):
    """Plot the QDA-minus-LDA accuracy gap against sample size.

    Reads the module-level ``nonlin_n_mat`` at index ``slice`` of its
    nonlinearity axis and subtracts the QDA error rates from the LDA error
    rates, separately for the train and the test series. A positive value
    therefore means QDA was the more accurate classifier. The mean gap over
    the repetitions is drawn with a +/- one standard deviation band.

    Parameters
    ----------
    slice : int
        Index into ``NONLIN_RANGE`` / the third axis of ``nonlin_n_mat``.
    """
    # Black for the train gap, red for the test gap (matches the legend).
    clrs = ['k', 'r']

    legend_elements = [Line2D([0], [0], color='k', lw=1.5, ls='-', label='train'),
                       Line2D([0], [0], color='r', lw=1.5, ls='-', label='test')]

    fig, ax = plt.subplots(1)

    # Rows 0-1 are LDA (train, test) and rows 2-3 are QDA (train, test), so
    # this is LDA error minus QDA error for train and for test.
    error_gap = nonlin_n_mat[:2, :, slice, :] - nonlin_n_mat[2:, :, slice, :]

    for mat, clr in zip(error_gap, clrs):

        # Averaging over axis 0 collapses the repetitions.
        mean = np.mean(mat, axis=0)
        std = np.std(mat, axis=0)
        ax.plot(N_RANGE, mean, color=clr)
        ax.fill_between(N_RANGE, mean-std, mean+std, alpha=0.2, color=clr)

    ax.legend(handles=legend_elements, loc='upper right')
    # The mathtext half is a raw string so that \longleftarrow and
    # \longrightarrow are not parsed as (invalid) Python escape sequences;
    # the resulting label text is unchanged.
    ax.set_ylabel('Difference in accuracy\n'
                  r'($\longleftarrow$ + LDA | QDA + $\longrightarrow$)')
    ax.set_xlabel('Sample size (n)\n')
    ax.set_title('Nonlinearity: {:.2}'.format(NONLIN_RANGE[slice]))
    ax.set_ylim(-0.4, 0.4)
    # Zero line: above it QDA is ahead, below it LDA is ahead.
    ax.axhline(0, c='k', lw=1, ls='--')
    plt.show()


# Slider bounds follow the simulated grid instead of a hard-coded 9, so they
# stay valid if N_RANGE or NONLIN_RANGE change length.

# plot_nonlin_errors indexes the sample-size axis.
w = interactive(plot_nonlin_errors, slice=(0, len(N_RANGE) - 1))
display(w)

# plot_n_errors indexes the nonlinearity axis.
w = interactive(plot_n_errors, slice=(0, len(NONLIN_RANGE) - 1))
display(w)

interactive(children=(IntSlider(value=6, description='slice', max=9), Output()), _dom_classes=('widget-interac…

interactive(children=(IntSlider(value=5, description='slice', max=9), Output()), _dom_classes=('widget-interac…

### If the Bayesian decision boundary is linear, LDA and QDA perform similarly.
### If the Bayesian decision boundary becomes more nonlinear, the error rate of LDA increases, while QDA maintains steady performance.
### If the Bayesian decision boundary is very linear, an increase in sample size improves the test error of both LDA and QDA.