# Model Training with k-Nearest Neighbors

## Importing Processed Dataset

In [15]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import sys
import os
sys.path.append(os.path.abspath('../..'))
from utils import array_utils

# Relative path to the processed dataset CSV.
url = '../../datasets/processed/processed.csv'

# low_memory=False disables chunked parsing so column dtypes are inferred from the whole file.
processed_df = pd.read_csv(filepath_or_buffer=url, low_memory=False)

## Data Scaling

In [16]:
# Target column to predict.
label = processed_df['Good Player']
# Feature columns: presumably every column except 'Good Player' — this relies on
# the project helper array_utils.subtract; verify its semantics and column order.
feature = processed_df[array_utils.subtract(processed_df.columns, ['Good Player'])]

# Standardize the features with scikit-learn's StandardScaler.
# NOTE(review): this fit sees every row, including rows that are later held out
# for testing; for an unbiased evaluation the scaler statistics should come from
# the training rows only.
scaler = StandardScaler()
feature_scaled = scaler.fit_transform(feature)

## Data Splitting

In [17]:
# Preview the first two rows of the processed dataset before splitting.
processed_df.head(n=2)

Unnamed: 0,Good Player,Value (€),Potential Value,Wage (€),Best Overall Rating,Release Clause (€),Reactions,International Reputation,Potential,Potential Normalized
0,1,4.410448,4.672397,4.30677,2.845952,5.822134,3.009316,3.777075,2.530017,2.530017
1,1,4.410448,4.839821,4.30677,3.004658,15.195951,3.009316,3.777075,3.01494,3.01494


In [18]:
# Split the raw (unscaled) features first so the held-out rows stay unseen
# during preprocessing. 80% of the rows go to training, 20% to testing;
# random_state fixes the shuffle so the split is reproducible.
feature_train_raw, feature_test_raw, label_train, label_test = train_test_split(
    feature, label, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on the full dataset would leak the test set's mean and
# variance into training and make the evaluation optimistic.
scaler = StandardScaler()
feature_train = scaler.fit_transform(feature_train_raw)
feature_test = scaler.transform(feature_test_raw)

## Initial Model Training

### Training Model

In [19]:
# Baseline classifier: each prediction is a vote among the 5 nearest training points.
k_nearest_neighbors_model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training split; left as the last expression so the estimator is displayed.
k_nearest_neighbors_model.fit(X=feature_train, y=label_train)

### Hyperparameter Tuning

In [6]:
# TODO: Tune hyperparameters

### Testing Model

In [20]:
# Predict labels for the held-out test features with the fitted model.
label_predicted = k_nearest_neighbors_model.predict(X=feature_test)

### Interpreting Model

In [8]:
# TODO: Interpret the model

## Model Evaluation

In [21]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=label_test, y_pred=label_predicted)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9810052600818235


In [24]:
# F1 score of the test predictions.
# The result is stored under a name that differs from the imported `f1_score`
# function: rebinding `f1_score` to a float would make this cell, and any later
# call to f1_score, fail with "object is not callable" when re-executed.
f1 = f1_score(label_test, label_predicted)
print(f'F1 score: {f1}')

F1 score: 0.9292709466811752


In [22]:
# Confusion matrix of the test predictions: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=label_test, y_pred=label_predicted)
print(f'Confusion Matrix:\n{cm}')

Confusion Matrix:
[[2930   36]
 [  29  427]]


In [23]:
# Per-class precision, recall, F1 and support for the test predictions, as formatted text.
report = classification_report(y_true=label_test, y_pred=label_predicted)
print(f'Classification Report:\n{report}')

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2966
           1       0.92      0.94      0.93       456

    accuracy                           0.98      3422
   macro avg       0.96      0.96      0.96      3422
weighted avg       0.98      0.98      0.98      3422



### Learning Curve Analysis

In [13]:
# TODO: Plot model performance

## Model Iteration

In [14]:
# TODO: Refine the model based on evaluation results and repeat the training if necessary