# Model Training with Logistic Regression

## Importing Processed Dataset

In [100]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

import matplotlib.pyplot as plt

import sys
import os
sys.path.append(os.path.abspath('../..'))
from utils import array_utils

# Location of the preprocessed dataset, relative to this notebook's directory.
url = '../../datasets/processed/processed.csv'

# low_memory=False disables pandas' chunked parsing, which avoids
# mixed-type inference within a column.
processed_df = pd.read_csv(
    filepath_or_buffer=url,
    low_memory=False,
)

## Data Splitting

In [101]:
# Preview the first two rows to confirm which columns were loaded.
processed_df.head(n=2)

Unnamed: 0,Good Player,Value (€),Potential Value,Wage (€),Best Overall Rating,Release Clause (€),Reactions,International Reputation,Potential,Potential Normalized
0,1,4.410448,4.672397,4.30677,2.845952,5.822134,3.009316,3.777075,2.530017,2.530017
1,1,4.410448,4.839821,4.30677,3.004658,15.195951,3.009316,3.777075,3.01494,3.01494


In [102]:
# Target column: the 'Good Player' flag.
label = processed_df['Good Player']

# Every remaining column is used as a model input.
# presumably array_utils.subtract returns the column names with 'Good Player'
# removed — verify against utils.array_utils.
# NOTE(review): the preview output lists an 'Unnamed: 0' column; if that is a
# saved CSV index rather than the frame's own index, it is being fed to the
# model as a feature — confirm.
feature_columns = array_utils.subtract(processed_df.columns, ['Good Player'])
feature = processed_df[feature_columns]

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# NOTE(review): the split is not stratified although the classes look imbalanced
# (456 positives out of 3422 test rows in the report below) — consider
# passing stratify=label.
feature_train, feature_test, label_train, label_test = train_test_split(
    feature,
    label,
    test_size=0.2,
    random_state=42,
)

## Initial Model Training

### Training Model

In [103]:
# Logistic regression with default hyperparameters and a fixed seed.
logisticRegressionModel = LogisticRegression(random_state=42)

# Fit on the training split only; the call stays last so the notebook displays the fitted estimator.
logisticRegressionModel.fit(X=feature_train, y=label_train)

### Testing Model

In [104]:
# Predict class labels for the held-out test features.
label_predicted = logisticRegressionModel.predict(X=feature_test)

## Model Evaluation

In [105]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=label_test, y_pred=label_predicted)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9774985388661601


In [106]:
# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=label_test, y_pred=label_predicted)
print(f'Confusion Matrix:\n{cm}')

Confusion Matrix:
[[2933   33]
 [  44  412]]


In [107]:
# Per-class precision, recall, F1 and support as a text table.
report = classification_report(y_true=label_test, y_pred=label_predicted)
print(f'Classification Report:\n{report}')

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2966
           1       0.93      0.90      0.91       456

    accuracy                           0.98      3422
   macro avg       0.96      0.95      0.95      3422
weighted avg       0.98      0.98      0.98      3422



In [108]:
# Bind the result to `f1`, not `f1_score`: reusing the imported function's name
# would replace it with a float, so re-running this cell (or any later call to
# f1_score) would raise "TypeError: ... object is not callable".
f1 = f1_score(label_test, label_predicted)
print(f'F1 score: {f1}')

F1 score: 0.9145394006659268


### Learning Curve Analysis

In [109]:
# TODO: Plot the learning curve (training vs. cross-validation score against training-set size)

## Model Iteration

In [110]:
# TODO: Refine the model based on evaluation results and repeat the training if necessary