# Baseline Model (K-Nearest Neighbors)

## Imports

Import libraries

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import accuracy_score

Import data

In [2]:
# Load the pickled training array and its labels from the derived-data folder.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open trusted files.
with open('data/derived/data_train.pickle', 'rb') as data_file:
    data_train = pickle.load(data_file)

with open('data/derived/labels_train.pickle', 'rb') as labels_file:
    labels_train = pickle.load(labels_file)

# Report both sizes so a mismatch between samples and labels is visible immediately.
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))

Shape of data_train: (25361, 128, 128, 3)
Length of labels_train: 25361


Flatten each image into a single feature vector and scale the pixel values by 1/255, so the data can be used to train the KNN model

In [3]:
# Flatten every sample from (height, width, channels) into one row of features,
# then divide by 255 to rescale the pixel values.
# NOTE(review): the division yields a new floating-point array of the full
# (n_images, height * width * channels) size — watch memory on the complete training set.
n_images, height, width, channels = data_train.shape
flat_shape = (n_images, height * width * channels)
data_train_reshaped = data_train.reshape(flat_shape) / 255
print('Shape of data_train_reshaped:', data_train_reshaped.shape)

Shape of data_train_reshaped: (25361, 49152)


Convert labels to integers for model training

In [4]:
# Learn the string-label -> integer mapping, then apply it to the training labels.
# The fitted encoder is kept so integer predictions can be mapped back to the original labels.
label_encoder = LabelEncoder().fit(labels_train)
labels_train_encoded = label_encoder.transform(labels_train)

# Preview the first few raw labels, their integer codes, and the first few known classes.
preview = slice(None, 5)
print('Original labels: {}'.format(labels_train[preview]))
print('Encoded labels: {}'.format(labels_train_encoded[preview]))
print('Label encoder classes: {}'.format(label_encoder.classes_[preview]))

Original labels: ['w_025911c', 'new_whale', 'new_whale', 'new_whale', 'new_whale']
Encoded labels: [44  0  0  0  0]
Label encoder classes: ['new_whale' 'w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1']


Randomly subsample 1,000 training images (without replacement) to reduce the computational load

In [5]:
# Draw a fixed-size random subsample (without replacement) to keep the baseline cheap to compute.
SAMPLE_SIZE = 1000  # number of images kept for the baseline
SAMPLE_SEED = 42    # fixed seed so Restart & Run All reproduces the same subsample

# Cap at the dataset size: choice(..., replace=False) raises if asked for more items than exist.
n_sample = min(SAMPLE_SIZE, len(labels_train_encoded))

# Use a local generator so the result does not depend on (or disturb) NumPy's global RNG state.
sample_rng = np.random.RandomState(SAMPLE_SEED)
sample_index = sample_rng.choice(len(labels_train_encoded), n_sample, replace=False)

# Index features and labels with the same indices so rows stay aligned.
data_sample = data_train_reshaped[sample_index]
labels_sample = labels_train_encoded[sample_index]

Build pipeline to perform PCA and KNN, then compute CV score for KNN

PCA is applied before KNN to further reduce the computational load of training the model. Setting `n_components=0.8` keeps just enough principal components to reach a cumulative explained variance of ~80%.

In [6]:
# Baseline model: PCA keeping enough components for ~80% explained variance, followed by
# a 5-nearest-neighbours classifier using Euclidean distance.
CV_SEED = 42  # fixed seed so the fold assignment, and hence the reported scores, is reproducible
pipeline_knn = make_pipeline(PCA(n_components=0.8), KNeighborsClassifier(n_neighbors=5, metric='euclidean'))

# 5 folds x 2 repeats = 10 accuracy scores. Because PCA is part of the pipeline, it is
# refit on each training split, so no information from the held-out fold leaks in.
repeat_kfold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=CV_SEED)
cv_scores = cross_val_score(pipeline_knn, data_sample, labels_sample, scoring='accuracy', cv=repeat_kfold)

# Summarise the scores across all folds and repeats.
print('Mean of CV scores:', np.mean(cv_scores))
print('SD of CV scores:', np.std(cv_scores))

Mean of CV scores: 0.354
SD of CV scores: 0.03726929030716843


Baseline CV accuracy is about 35% (mean over 5-fold cross-validation repeated twice, measured on the 1,000-image subsample)