# K Neighbors

## Load Data

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
from sklearn.utils import shuffle

base_directory = Path(r'/media/thor/PortableSSD/mydata/emojihero/dataset/dataset-face-expression-csv')
train_df = pd.read_csv(base_directory / 'training_set.csv')
val_df = pd.read_csv(base_directory / 'validation_set.csv')

SEED = 42
train_df = shuffle(train_df, random_state=SEED)
val_df = shuffle(val_df, random_state=SEED)
train_df.head()

Unnamed: 0,timestamp,participant_id,FAU_0,FAU_1,FAU_2,FAU_3,FAU_4,FAU_5,FAU_6,FAU_7,...,FAU_54,FAU_55,FAU_56,FAU_57,FAU_58,FAU_59,FAU_60,FAU_61,FAU_62,Label
760,1700488114760,3,0.107896,0.194889,1.401298e-45,1.32022e-22,0.024912,0.024321,1.401298e-45,1.401298e-45,...,9.209801e-21,2.610711e-37,1.000182e-13,5.09926e-05,0.01002689,4.672041e-07,2.050273e-07,0.05688704,0.08172417,4
884,1700748819058,16,0.834857,0.764817,1.0264619999999999e-20,2.168397e-11,0.416924,0.377339,0.002632983,1.401298e-45,...,0.002479763,0.21947,0.2563659,1.88864e-06,0.005865657,8.533958e-10,0.0002958799,0.01071407,0.0271013,5
938,1701357565849,34,0.131417,0.130338,0.0001680341,0.0001439196,0.252091,0.225658,0.0,0.0,...,1.401298e-45,0.01083556,0.01089628,2.802597e-45,1.9801029999999998e-34,1.401298e-45,1.401298e-45,0.2718902,0.2924575,5
901,1700827807807,21,0.615232,0.514535,0.009174733,0.009058302,0.389754,0.374438,0.0,0.0,...,0.02141047,0.9831126,0.9945938,0.008538091,0.04727362,9.986752e-05,4.469715e-06,0.3522862,0.4055933,5
622,1700748690839,16,1e-06,2e-05,0.01059425,0.01085821,0.010866,2e-06,1.401298e-45,0.0,...,7.473406e-12,5.651674e-10,0.01086265,0.01238884,0.00962015,0.1538864,0.1280742,1.139681e-12,6.957371e-13,3


In [3]:
import numpy as np
from sklearn.model_selection import PredefinedSplit

X_train = train_df.drop(['Label', 'timestamp', 'participant_id'], axis=1)
y_train = train_df['Label']
X_val = val_df.drop(['Label', 'timestamp', 'participant_id'], axis=1)
y_val = val_df['Label']
X_combined = pd.concat([X_train, X_val])
y_combined = pd.concat([y_train, y_val])

train_indices = -1 * np.ones(len(train_df), dtype=int)
val_indices = np.zeros(len(val_df), dtype=int)
split_index = np.concatenate((train_indices, val_indices))
ps = PredefinedSplit(split_index)

## Find optimal Hyperparameters via GridSearch

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    'knn__n_neighbors': [1, 3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'knn__metric': ['minkowski', 'euclidean', 'l1', 'l2', 'manhattan']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=ps, refit=False, scoring='accuracy', n_jobs=-1, verbose=2)

In [5]:
import time

start_time = time.time()
grid_search.fit(X_combined, y_combined)
end_time = time.time()

print(f'GridSearch took: {end_time - start_time:.2f} seconds.')

Fitting 1 folds for each of 180 candidates, totalling 180 fits
GridSearch took: 1.33 seconds.


In [7]:
best_parameters = grid_search.best_params_
print('Best Parameters:', best_parameters)
print('Best Cross-Validation-Accuracy:', grid_search.best_score_)

Best Parameters: {'knn__algorithm': 'ball_tree', 'knn__metric': 'minkowski', 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
Best Cross-Validation-Accuracy: 0.7324675324675325


## Evaluation

In [12]:
best_model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(
        algorithm=best_parameters['knn__algorithm'],
        metric=best_parameters['knn__metric'],
        n_neighbors=best_parameters['knn__n_neighbors'],
        weights=best_parameters['knn__weights']
    ))
])

best_model.fit(X_train, y_train)
print('Trained model on training set with optimal hyperparameters.')

Trained model on training set with optimal hyperparameters.


In [14]:
from sklearn.metrics import accuracy_score

y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy with optimized Hyperparameters: {val_accuracy:.4f}")

Validation Accuracy with optimized Hyperparameters: 0.7325


In [15]:
test_df = pd.read_csv(base_directory / 'test_set.csv')
X_test, y_test = test_df.drop(['Label', 'timestamp', 'participant_id'], axis=1), test_df['Label']

In [17]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with optimized Hyperparameters: {test_accuracy:.4f}")

Test Accuracy with optimized Hyperparameters: 0.6243


In [18]:
from sklearn.metrics import classification_report

emotion_to_id = {'Neutral': 0, 'Happiness': 1, 'Sadness': 2, 'Surprise': 3, 'Fear': 4, 'Disgust': 5, 'Anger': 6}

print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=emotion_to_id.keys(), digits=4))

Classification Report (Test Set):
              precision    recall  f1-score   support

     Neutral     0.5949    0.8704    0.7068        54
   Happiness     0.7027    0.9630    0.8125        54
     Sadness     0.8333    0.7407    0.7843        54
    Surprise     0.6143    0.7963    0.6935        54
        Fear     0.4884    0.3889    0.4330        54
     Disgust     0.4400    0.4074    0.4231        54
       Anger     0.7857    0.2037    0.3235        54

    accuracy                         0.6243       378
   macro avg     0.6370    0.6243    0.5967       378
weighted avg     0.6370    0.6243    0.5967       378



In [19]:
from sklearn.utils import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [20]:
def weighted_predict(model, X, class_weight_dict):
    pred_proba = model.predict_proba(X)
    
    weighted_pred_proba = np.zeros_like(pred_proba)
    for cls, weight in class_weight_dict.items():
        weighted_pred_proba[:, cls] = pred_proba[:, cls] * weight
    
    weighted_pred = np.argmax(weighted_pred_proba, axis=1)
    return weighted_pred

In [21]:
weighted_predictions = weighted_predict(best_model, X_test, class_weight_dict)
test_accuracy2 = accuracy_score(y_test, weighted_predictions)
print(f"Test Accuracy with optimized Hyperparameters and weighted predictions: {test_accuracy2:.2f}")

Test Accuracy with optimized Hyperparameters and weighted predictions: 0.63


In [22]:
print("Classification Report (Test Set):")
print(classification_report(y_test, weighted_predictions, target_names=emotion_to_id.keys(), digits=4))

Classification Report (Test Set):
              precision    recall  f1-score   support

     Neutral     0.6111    0.8148    0.6984        54
   Happiness     0.7761    0.9630    0.8595        54
     Sadness     0.8182    0.6667    0.7347        54
    Surprise     0.7292    0.6481    0.6863        54
        Fear     0.4507    0.5926    0.5120        54
     Disgust     0.4694    0.4259    0.4466        54
       Anger     0.5556    0.2778    0.3704        54

    accuracy                         0.6270       378
   macro avg     0.6300    0.6270    0.6154       378
weighted avg     0.6300    0.6270    0.6154       378

