In [1]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default styling, then select its 'talk' plotting context
# for every figure drawn later in the notebook.
sns.set()
sns.set_context('talk')
# Keep printed arrays short: summarize arrays with more than 20 elements,
# show 2 decimals and suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Keep displayed DataFrames compact.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name: the bare 'precision' pattern matches
# more than one option on current pandas and raises an OptionError.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    df: DataFrame to page through.
    nrows, ncols: size of the window shown at once; also the slider step.

    A slider is only created for an axis that has more entries than fit in
    one window; otherwise that offset is fixed at 0. Prints the total shape
    of df after setting up the widget.
    '''
    def peek(row=0, col=0):
        # Window of at most nrows x ncols starting at the slider offsets.
        return df.iloc[row:row + nrows, col:col + ncols]

    # The slider maximum is the last valid position (length - 1), not the
    # length itself: an offset equal to the length would select an empty
    # frame whenever the length is an exact multiple of the step.
    row_arg = (0, len(df) - 1, nrows) if len(df) > nrows else fixed(0)
    col_arg = ((0, len(df.columns) - 1, ncols)
               if len(df.columns) > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary limits on the number of rows and columns shown.

    The limits apply only inside this call. The default values are read
    from the pandas options once, at the time this function is defined.
    '''
    limits = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*limits):
        display(df)

In [3]:
def points_for_boundary(X, clf, n=100):
    '''
    Predicts clf on an n x n grid spanning the first two columns of X.

    X: DataFrame; only its first two columns are used.
    clf: object with a predict method that accepts a two-column DataFrame.
    n: number of grid values along each axis (n * n points in total).

    Returns a DataFrame with columns 'xs', 'ys' and 'pred', one row per
    grid point.
    '''
    x_min, x_max = X.iloc[:, 0].agg(['min', 'max'])
    y_min, y_max = X.iloc[:, 1].agg(['min', 'max'])
    xs = np.linspace(x_min, x_max, n)
    ys = np.linspace(y_min, y_max, n)
    # Every (x, y) combination: xs cycles fastest, ys changes once per row
    # of the grid.
    points = pd.DataFrame({
        'xs': np.tile(xs, len(ys)),
        'ys': np.repeat(ys, len(xs)),
    })
    # Predict on a copy labelled with X's own column names, so estimators
    # that validate feature names against the training data accept the grid.
    features = pd.DataFrame(points.values, columns=X.columns[:2])
    return points.assign(pred=clf.predict(features))

In [4]:
def decision_boundary(X, y, clf):
    '''
    Plots clf's predictions over the region spanned by the first two
    columns of X, as two side-by-side panels.

    Left panel: the faint prediction grid with the points of X drawn on
    top, coloured by y. Right panel: the prediction grid alone.
    '''
    pred = points_for_boundary(X, clf)
    plt.figure(figsize=(12, 6))

    plt.subplot(121)
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them as positional arguments.
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, alpha=0.2, s=40,
                    legend=False)
    sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y,
                    s=60, legend=False)

    plt.subplot(122)
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, s=40, legend=False)

## Bagging

In [None]:
from sklearn.datasets import make_moons, make_circles

# 10,000 noisy points from make_circles; the fixed random_state keeps the
# generated data the same on every run.
X, y = make_circles(n_samples=10000, noise=0.1, random_state=42)
# Both feature columns plus the label in a single frame.
data = pd.DataFrame({'x1': X[:, 0], 'x2': X[:, 1], 'y': y})
data

In [None]:
plt.figure(figsize=(5, 5))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.scatterplot(x='x1', y='x2', hue='y', data=data)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['x1', 'x2']],
    data.loc[:, 'y'],
    test_size=0.2,
    random_state=42,
)
X_train

In [None]:
from sklearn.tree import DecisionTreeClassifier

...

In [None]:
from sklearn.ensemble import BaggingClassifier

...

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

...

Why doesn't the RF do better than a simple bagging classifier in this case?

### Image Classification

In [None]:
def flatten(arr):
    '''Returns one flat list holding the items of every row of arr, in order.'''
    flat = []
    for row in arr:
        flat.extend(row)
    return flat

def display_digit(digit, ax=None):
    '''
    Draws digit as a 28x28 heatmap using the 'Greys' palette, with no
    colorbar and no tick labels.

    digit: 784 values, reshaped to (28, 28).
    ax: axes handed to sns.heatmap.
    '''
    pixels = np.array(digit).reshape((28, 28))
    grey_palette = sns.color_palette('Greys')
    ax = sns.heatmap(
        pixels,
        ax=ax,
        cmap=grey_palette,
        cbar=None,
        xticklabels=[],
        yticklabels=[],
    )
    # Square cells so the digit is not stretched.
    ax.set_aspect('equal')
    
def display_digits(digits):
    '''
    Draws the rows of digits on a 2x5 grid of axes, one digit per axis.

    digits: DataFrame with one digit per row. Pairing stops at the shorter
    of the rows and the 10 axes, so at most 10 digits are drawn.
    '''
    fig, axes = plt.subplots(2, 5, squeeze=False, figsize=(6, 3))
    axes_in_order = flatten(axes)
    for ax, digit in zip(axes_in_order, digits.values):
        display_digit(digit, ax)

In [None]:
from scipy.io import loadmat

# Load the MATLAB file (path is relative to the working directory).
# NOTE: this rebinds `data`, which held the circles DataFrame earlier in
# the notebook.
data = loadmat('mnist_data.mat')
# Features: the 'training_data' entry wrapped in a DataFrame.
X = pd.DataFrame(data['training_data'])
# Labels: the first column of 'training_labels' as a 1-D Series.
# NOTE(review): presumably the labels are stored as an (n, 1) array — confirm.
y = pd.Series(data['training_labels'][:, 0])

In [None]:
# Seed NumPy's global RNG before splitting.
# NOTE(review): no random_state is passed below, so the shuffle presumably
# draws from this global seed — confirm.
np.random.seed(42)
# Train on 10% of the rows. This overwrites the X_train/X_test/y_train/y_test
# produced by the earlier circles split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, shuffle=True)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import confusion_matrix

...

In [None]:
%%time

...

## Runtime Considerations

In [None]:
%%timeit

# Pure-Python baseline: square every integer in range(n) with a list
# comprehension.
n = 1_000_000

[i ** 2 for i in range(n)]

In [None]:
%%timeit

# Same pure-Python list comprehension with n doubled, to see how the time
# changes with input size.
n = 2_000_000

[i ** 2 for i in range(n)]

NumPy makes each run much faster, but it doesn't change how the runtime grows: doubling `n` still roughly doubles the time!

In [None]:
%%timeit

# NumPy version: build the integers 0..n-1 as an array and square it
# element-wise.
n = 1_000_000

np.arange(n) ** 2

In [None]:
%%timeit

# Same NumPy computation with n doubled, to see how the time changes with
# input size.
n = 2_000_000

np.arange(n) ** 2