In [None]:
# %pylab pulls the numpy and matplotlib.pylab namespaces into the notebook's
# globals and turns on inline figure rendering. The bare imshow / colorbar /
# cm / scatter names used by the plotting cells further down come from here.
%pylab inline

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, f1_score
import seaborn
# Use seaborn's plain 'white' axes style for the figures in this notebook.
seaborn.set_style('white')

# Seed NumPy's global random number generator so the randomly generated data
# is the same every time the notebook is run from top to bottom.
np.random.seed(12345)

In [None]:
# The 'talk' context scales fonts and line widths up for presentation-sized figures.
seaborn.set_context('talk')

# Random Forest

## Generate Data

In [None]:
# Synthetic binary classification problem: 100 samples with 3 Gaussian features.
x = np.random.randn(100, 3)
# The label is decided by the sign of the first feature alone (before noise).
y = np.greater(x[:, 0], 0).astype(int)
# Perturb the informative feature *after* labelling, so samples near zero may
# end up on the "wrong" side and the problem is not perfectly separable.
noise = 0.2 * np.random.randn(100)
x[:, 0] = x[:, 0] + noise
# First 75 rows are used for training, the remaining 25 are held out.
train_x, test_x = x[:75], x[75:]
train_y, test_y = y[:75], y[75:]

## Fit Random Forest classifier

In [None]:
# Train a random forest with default hyper-parameters (fit() returns the
# fitted estimator itself), then score it on the held-out rows.
model = RandomForestClassifier().fit(train_x, train_y)
pred = model.predict(test_x)
rf_f1 = f1_score(test_y, pred)
print("Random Forest F1 Score", rf_f1)

## Plot confusion matrix

In [None]:
# Confusion matrix of the held-out predictions: rows are the true classes,
# columns the predicted classes. Each cell is annotated with its count.
rf_confusion = pd.DataFrame(confusion_matrix(test_y, pred))
_ = seaborn.heatmap(rf_confusion, annot=True)

# SVM

## Generate nonlinear data

In [None]:
# Regular grid over [-10, 10] x [-10, 10]; np.linspace defaults to 50 points
# per axis, giving 50 * 50 = 2500 samples.
axis_vals = np.linspace(-10, 10)
grid_x, grid_y = np.meshgrid(axis_vals, axis_vals)

# Surface whose sign defines the two classes.
z = grid_x ** 3 + 30 * (grid_x * grid_y)

# np.meshgrid stores y = -10 in row 0, so the image has to be drawn with
# origin='lower'. With matplotlib's default origin='upper' row 0 would be
# placed at the top, i.e. vertically flipped relative to the y-axis that
# `extent` labels.
imshow(z, cmap=cm.RdBu, extent=[-10, 10, -10, 10], origin='lower')
colorbar()

# One row per grid point: the (x, y) coordinates are the features and the
# label is 1 where the surface is negative, 0 otherwise.
df = pd.DataFrame(dict(x=grid_x.flatten(), y=grid_y.flatten(), z=(z.flatten() < 0).astype(int)))
x = df[['x', 'y']].values
y = df['z'].values

# Interleaved split: even-indexed grid points train, odd-indexed points test.
train_x = x[::2, :]
train_y = y[::2]
test_x = x[1::2, :]
test_y = y[1::2]

## Fit a Linear SVC

In [None]:
# Linear support vector classifier with default settings; fit() returns the
# fitted estimator, so construction and training can be chained.
model = LinearSVC().fit(train_x, train_y)
pred = model.predict(test_x)

In [None]:
# Confusion matrix of the held-out predictions (rows: true class, columns:
# predicted class). The colour scale is pinned to [0, number of test samples].
# fmt='d' prints the integer counts exactly; seaborn's default '.2g' would
# round counts of three or more digits to two significant figures
# (e.g. 623 -> 6.2e+02).
_ = seaborn.heatmap(pd.DataFrame(confusion_matrix(test_y, pred)), annot=True, fmt='d', vmin=0, vmax=test_x.shape[0])

#### It doesn't perform very well at all

In [None]:
# Background: the surface z. Row 0 of z holds y = -10 (it was built with
# np.meshgrid), so origin='lower' is required for the image to agree with
# `extent`; with the default origin='upper' the image would be vertically
# mirrored relative to the scatter points drawn on top of it.
imshow(z, extent=[-10, 10, -10, 10], cmap=cm.RdBu, origin='lower')
# Foreground: test points at their true (x, y) position, coloured by the
# predicted class. Assigning to `_` suppresses the returned artist's repr.
_ = scatter(test_x[:, 0], test_x[:, 1], c=pred, s=5)

#### It's getting confused by the two patches of high values

## Fit SVM with a different (nonlinear) kernel

In [None]:
# Support vector classifier with a radial-basis-function kernel; fit()
# returns the fitted estimator, so construction and training can be chained.
model = SVC(kernel='rbf').fit(train_x, train_y)
pred = model.predict(test_x)

In [None]:
# Confusion matrix of the held-out predictions (rows: true class, columns:
# predicted class). The colour scale is pinned to [0, number of test samples].
# fmt='d' prints the integer counts exactly; seaborn's default '.2g' would
# round counts of three or more digits to two significant figures
# (e.g. 623 -> 6.2e+02).
_ = seaborn.heatmap(pd.DataFrame(confusion_matrix(test_y, pred)), annot=True, fmt='d', vmin=0, vmax=test_x.shape[0])

#### It performs much better, and can correctly classify both regions

In [None]:
# Background: the surface z. Row 0 of z holds y = -10 (it was built with
# np.meshgrid), so origin='lower' is required for the image to agree with
# `extent`; with the default origin='upper' the image would be vertically
# mirrored relative to the scatter points drawn on top of it.
imshow(z, extent=[-10, 10, -10, 10], cmap=cm.RdBu, origin='lower')
# Foreground: test points at their true (x, y) position, coloured by the
# predicted class. Assigning to `_` suppresses the returned artist's repr.
_ = scatter(test_x[:, 0], test_x[:, 1], c=pred, s=5)