In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Model validation

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read the arrays inside a context manager so the .npz archive's file handle
# is closed as soon as loading finishes; `data` stays usable as a plain
# name -> array mapping afterwards.
with np.load('../data/spectral_lines.npz') as archive:
    data = {name: archive[name] for name in archive.files}
X = data['spec']
y = data['target']

In [None]:
# Display the shapes of the feature array X and the label array y.
X.shape, y.shape

In [None]:
# Print how many samples carry each target label.
# scipy.stats.itemfreq was removed in SciPy 1.3; np.unique with
# return_counts=True yields the same (value, count) pairs.
for i, c in zip(*np.unique(y, return_counts=True)):
    print("Type %i\t%i"%(i,c))

In [None]:
# 25 random spectra from the dataset (5x5 grid), colored by target label
def get_color(x):
    """Return the matplotlib color code used to draw a spectrum of label ``x``.

    Labels 1-4 map to magenta, green, blue and red respectively. Any other
    label falls back to black ('k') so the result is always a valid
    matplotlib color (the integer 0 is not one and makes ``plt.plot`` raise).
    """
    return {
        1: 'm',
        2: 'g',
        3: 'b',
        4: 'r',
    }.get(x, 'k')

# Draw a 5x5 grid of randomly chosen spectra, each colored via get_color(label).
plt.figure(figsize=(8,8))
# Take the first 25 entries of a random permutation: still a uniform random
# sample, and it does not come up short when the dataset has fewer than 275 rows.
sample_idx = np.random.permutation(X.shape[0])[:25]
for i, j in enumerate(sample_idx):
    plt.subplot(5, 5, i + 1)
    plt.plot(X[j], c=get_color(y[j]))
    plt.xticks([0, 200, 400, 600, 777])
    plt.yticks(())
# Lay the grid out once, after all panels exist, instead of on every iteration.
plt.tight_layout()

In [None]:
# Keep every third sample. Slice the arrays held in `data` rather than X / y
# themselves so that re-running this cell does not thin the dataset again.
X = data['spec'][::3]
y = data['target'][::3]

In [None]:
# List the distinct values present in y.
np.unique(y)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Re-code the labels as consecutive integers starting at 0; the fitted encoder
# `le` retains the mapping back to the original label values.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
np.unique(y)

In [None]:
# Hold out a quarter of the samples for testing; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# RBF-kernel support-vector classifier with class weights balanced against
# class frequency.
clf = SVC(
    class_weight='balanced',
    kernel='rbf',
)

In [None]:
# Train the classifier on the training split.
clf.fit(X_train, y_train)

In [None]:
# Score the trained classifier on the held-out test split.
clf.score(X_test, y_test)

# Cross-validation

In [None]:
# Score clf with 5-fold cross-validation over X, y; one score per fold.
cv = cross_val_score(estimator=clf, X=X, y=y, cv=5)
cv

In [None]:
# Average of the per-fold cross-validation scores.
cv.mean()

In [None]:
from sklearn.model_selection import learning_curve

def rms_error(model, X, y):
    """Root-mean-square difference between ``model.predict(X)`` and ``y``.

    Takes ``(estimator, X, y)`` so it can be handed to scikit-learn as a
    scoring callable.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : inputs forwarded unchanged to ``model.predict``.
    y : array-like of numeric reference values, same length as the predictions.

    Returns
    -------
    float
        ``sqrt(mean((y - y_pred) ** 2))``.
    """
    # Work in floating point: with narrow or unsigned integer labels the
    # subtraction and the squaring would otherwise wrap around silently.
    y_true = np.asarray(y, dtype=float)
    y_pred = np.asarray(model.predict(X), dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def plot_with_err(x, data, **kwargs):
    """Plot the row-wise mean of ``data`` against ``x`` with a +/- 1 std band.

    ``data`` is reduced along axis 1, so each row holds the repeated
    measurements for one ``x`` value. Extra keyword arguments are passed on to
    ``plt.plot``; the shaded band reuses the color of the plotted line.
    """
    center = data.mean(axis=1)
    spread = data.std(axis=1)
    band_color = plt.plot(x, center, '-', **kwargs)[0].get_color()
    plt.fill_between(
        x,
        center - spread,
        center + spread,
        facecolor=band_color,
        edgecolor='none',
        alpha=0.2,
    )
    
# Fractions of the available training data at which the curve is evaluated.
train_sizes = np.linspace(0.05, 1., 10)
# train_sizes has to be passed explicitly, otherwise learning_curve falls back
# to its own default grid. random_state pins the shuffle so that the curve is
# reproducible from run to run.
N_train, val_train, val_test = learning_curve(
    clf, X, y,
    train_sizes=train_sizes,
    cv=7,
    scoring=rms_error,
    shuffle=True,
    random_state=0,
)
plot_with_err(N_train, val_train, label='training scores')
plot_with_err(N_train, val_test, label='validation scores')
plt.xlabel('Training Set Size'); plt.ylabel('rms error')
plt.legend()

# Find a Bigger Hammer = Grid Search

In [None]:
# Search grid for the SVC: five log-spaced values each for C (1e-2 .. 1e2)
# and gamma (1e-4 .. 1e0).
svc_params = dict(
    C=np.logspace(start=-2, stop=2, num=5),
    gamma=np.logspace(start=-4, stop=0, num=5),
)

In [None]:
# Tune C and gamma with 3-fold cross-validation on the training split only,
# so that the rows in X_test play no part in choosing the hyper-parameters.
grid = GridSearchCV(clf, svc_params, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)

In [None]:
# Report accuracy and per-class metrics on the held-out split rather than on
# the full dataset, whose rows overlap with the ones the model was fitted on.
y_pred = grid.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Score the grid search's selected model on the held-out test split.
grid.score(X_test, y_test)

In [None]:
# Rebuild the classifier with the C and gamma chosen by the grid search,
# train it on the training split and score it on the test split.
best_params = grid.best_params_
clf = SVC(
    C=best_params['C'],
    gamma=best_params['gamma'],
    kernel='rbf',
    class_weight='balanced',
).fit(X_train, y_train)
clf.score(X_test, y_test)