In [1]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm, metrics
from tqdm import tqdm_notebook as tqdm
import random
from sklearn.dummy import DummyClassifier
from scipy import stats
from google.colab import drive
# Mount Google Drive at /content/gdrive. `drive` comes from google.colab,
# so this line only works inside a Colab runtime.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Locations of the three dataset files (training images, training labels,
# test images). They share one directory, written once here; the paths are
# relative to the notebook's working directory.
path_to_data, path_to_labels, path_to_test = (
    "./gdrive/My Drive/Mcgill/U4/Fall 2018/COMP 551/kaggle/data/" + file_name
    for file_name in ("train_images.npy", "train_labels.csv", "test_images.npy")
)

In [0]:
# Load the training array. `encoding='bytes'` only affects pickled payloads,
# so the file presumably stores a pickled object array (TODO confirm).
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# from a source you trust.
data = np.load(path_to_data, encoding = 'bytes', allow_pickle = True)

In [0]:
# Read the label table, turn its Category column into a pandas categorical,
# and keep the per-row integer category codes as the target vector `y`.
labels_df = pd.read_csv(path_to_labels)
labels_df = labels_df.assign(Category=pd.Categorical(labels_df.Category))
y = labels_df.Category.cat.codes.values

In [0]:
def plot_images(list_of_images, max_col = 4):
    """Show one image on its own, or several images in a titled grid.

    Parameters
    ----------
    list_of_images : sequence
        * Empty: nothing is drawn.
        * Exactly one element: that element is handed directly to
          ``plt.imshow`` (it is NOT unpacked as a pair) and shown at once.
        * Two or more elements: each element is unpacked as an
          ``(image, name)`` pair; ``str(name)`` becomes the subplot title.
    max_col : int, optional
        Maximum number of subplots per row. Defaults to 4.

    Returns
    -------
    None
    """
    n = len(list_of_images)
    if n == 0:
        # Nothing to plot; do not leave an empty figure behind.
        return

    if n == 1:
        plt.imshow(list_of_images[0], cmap = 'gray_r')
        plt.axis('off')
        plt.show()
        return

    # Grid layout: a single row until the images overflow `max_col` columns.
    # Rows are computed with integer ceiling division, so no `math` import is
    # needed (the previous `math.ceil` call failed because `math` was never
    # imported in this notebook).
    n_cols = min(n, max_col)
    n_rows = (n + max_col - 1) // max_col

    fig = plt.figure(figsize=(17, max_col * n_rows))
    for position, (img, name) in enumerate(list_of_images, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        ax.set_title(str(name))
        ax.axis('off')
        ax.imshow(img, cmap = 'gray_r')

In [0]:
# Feature list: one entry per row of `data`, taken from its second column.
X = [sample for sample in data[:, 1]]

In [0]:
# Hold out 33% of the samples as a validation set; the fixed random_state
# makes the split repeatable across runs.
x_tr, x_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [0]:
def performance_random_clf(x_tr, y_tr, x_test, y_test, r = 1234):
    """Baseline: classification report for a uniformly random classifier.

    A ``DummyClassifier`` with the ``'uniform'`` strategy is fitted on
    ``(x_tr, y_tr)`` and asked to predict ``x_test``; the predictions are
    scored against ``y_test``.

    Parameters
    ----------
    x_tr, y_tr : training features and labels passed to ``fit``.
    x_test, y_test : evaluation features and their true labels.
    r : int, optional
        Seed forwarded as the classifier's ``random_state`` (default 1234).

    Returns
    -------
    Whatever ``metrics.classification_report`` returns for the predictions.
    """
    random_baseline = DummyClassifier(strategy = 'uniform', random_state = r)
    random_baseline.fit(x_tr, y_tr)
    random_guesses = random_baseline.predict(x_test)
    return metrics.classification_report(y_test, random_guesses)
    
def performance_majority_classifier(x_tr, y_tr, x_test, y_test, r = 1234):
    """Baseline: classification report for always predicting the majority class.

    The most frequent label in ``y_tr`` is predicted for every entry of
    ``y_test``. When several labels tie for most frequent, the smallest one
    is used.

    Parameters
    ----------
    x_tr, x_test : unused; accepted so the signature matches
        ``performance_random_clf``.
    y_tr : array-like of training labels (assumed 1-D -- TODO confirm).
    y_test : array-like of true labels to score against.
    r : int, optional
        Unused; kept for signature compatibility.

    Returns
    -------
    Whatever ``metrics.classification_report`` returns for the constant
    predictions.
    """
    # Count labels with NumPy rather than `stats.mode(...).mode[0]`: newer
    # SciPy returns a scalar mode for 1-D input, so indexing it with [0]
    # raises. `np.unique` sorts the labels, and `argmax` picks the first
    # maximum, so ties resolve to the smallest label.
    labels, counts = np.unique(y_tr, return_counts=True)
    most_common_val = labels[np.argmax(counts)]

    # np.shape also accepts plain Python lists, unlike the `.shape` attribute.
    preds = np.full(np.shape(y_test), most_common_val)
    metric = metrics.classification_report(y_test, preds)
    return metric

In [0]:
def fine_tune_svm(x_tr, y_tr, x_test, y_test):
    """Grid-search SVC hyper-parameters with 5-fold cross-validation.

    Two grids are searched: an RBF kernel over ``gamma`` and ``C``, and a
    linear kernel over ``C``. Candidates are ranked by accuracy and the
    search uses all available cores (``n_jobs=-1``). The best parameter set
    is printed.

    Parameters
    ----------
    x_tr, y_tr : training features and labels passed to the search's ``fit``.
    x_test, y_test : unused by this function.

    Returns
    -------
    list
        A one-element list containing the fitted ``GridSearchCV`` object.
    """
    rbf_grid = {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    }
    linear_grid = {
        'kernel': ['linear'],
        'C': [1, 10, 100],
    }

    search = GridSearchCV(
        svm.SVC(),
        [rbf_grid, linear_grid],
        cv=5,
        scoring = 'accuracy',
        n_jobs = -1,
    )
    search.fit(x_tr, y_tr)
    print(search.best_params_)

    return [search]

In [0]:
# Run the SVM grid search on the training split. fine_tune_svm returns a
# list, so `best_svm_model` is a one-element list holding the fitted search.
best_svm_model = fine_tune_svm(
    x_tr=x_tr,
    y_tr=y_tr,
    x_test=x_val,
    y_test=y_val,
)