The goal of this notebook is to choose the number of PCA components (eigenfaces) used in the feature-extraction step. We want the smallest number of components that still gives the best possible accuracy, so that the model is not more complicated than it needs to be.

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
from sklearn.metrics import accuracy_score

In [2]:
from src.models.knn_model import create_knn_model
from src.features.build_features import create_pca_model

In [3]:
# All raw CSV files live in the same directory, relative to this notebook.
raw_data_dir = os.path.join('..', 'data', 'raw')

# Training split: face data and the matching labels.
train_data_filepath = os.path.join(raw_data_dir, 'face_data_train.csv')
train_labels_filepath = os.path.join(raw_data_dir, 'labels_train.csv')

# Test split: face data and the matching labels.
test_data_filepath = os.path.join(raw_data_dir, 'face_data_test.csv')
test_labels_filepath = os.path.join(raw_data_dir, 'labels_test.csv')

In [4]:
# Read the four CSV files with pandas' default settings, in the order
# train data, train labels, test data, test labels.
train_data, train_labels, test_data, test_labels = map(
    pd.read_csv,
    (
        train_data_filepath,
        train_labels_filepath,
        test_data_filepath,
        test_labels_filepath,
    ),
)

In [17]:
# Sweep the number of PCA components (eigenfaces) from 1 up to the number of
# columns in train_data. For each count, fit the PCA model on the training
# data, project both splits, fit a KNN model (first argument 1) and record its
# accuracy on the test split. scores[i] is the accuracy for i + 1 components.
#
# NOTE(review): the component count is selected using the test split, so the
# best score below is an optimistic estimate — consider a validation split or
# cross-validation on the training data instead.
# NOTE(review): if create_pca_model wraps scikit-learn's PCA, component counts
# above min(train_data.shape) would be rejected — confirm the upper bound.

# The flattened training labels do not change between iterations; build once.
train_targets = train_labels.values.ravel()

scores = list()
for eigenface_count in range(1, train_data.shape[1] + 1):
    pca = create_pca_model(eigenface_count, train_data)
    current_train_data = pca.transform(train_data)
    current_test_data = pca.transform(test_data)
    model = create_knn_model(1, current_train_data, train_targets)
    # accuracy_score is documented as accuracy_score(y_true, y_pred):
    # ground-truth labels first, predictions second.
    current_score = accuracy_score(test_labels, model.predict(current_test_data))
    scores.append(current_score)

In [18]:
# Highest test-set accuracy reached by any of the evaluated component counts.
max(scores) # best accuracy observed over the sweep

0.9375

In [20]:
# scores[i] holds the accuracy for i + 1 components, and list.index returns the
# first match, so this is the smallest component count that reaches the maximum.
scores.index(max(scores)) + 1 # minimal number of components required to achieve the best accuracy

40