# Model evaluation 
Part 1


In [23]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from itertools import product

In [2]:
file_path = 'data/dataset.csv'
df = pd.read_csv(file_path, sep=';', header=0)

df.head()

Unnamed: 0,ShotType,Competition,PlayerType,Transition,TwoLegged,Movement,Angle,Distance
0,above head,U14,F,1,1,no,73.83,0.73
1,layup,U14,F,0,1,no,28.13,1.02
2,above head,U14,F,0,1,no,51.88,7.22
3,above head,U14,F,1,1,no,80.84,3.64
4,above head,U14,F,0,1,no,30.89,7.2


In [3]:
null_counts = df.isnull().sum()
print(null_counts)

ShotType       0
Competition    0
PlayerType     0
Transition     0
TwoLegged      0
Movement       0
Angle          0
Distance       0
dtype: int64


In [4]:
categorical_columns = df.select_dtypes(include=['object']).columns
unique_classes = df[categorical_columns].nunique()
print("Number of unique classes in each categorical column:")
print(unique_classes)

Number of unique classes in each categorical column:
ShotType       6
Competition    5
PlayerType     3
Movement       3
dtype: int64


In [5]:
categorical_columns = ["Competition", "PlayerType", "Movement"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [6]:
y = df_encoded['ShotType']
X = df_encoded.drop(columns=['ShotType'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Helper functions

In [7]:
def log_loss(y_true, y_pred):
    """
    Compute the log loss (logarithmic loss) for multiclass classification.

    Parameters:
    - y_true: True labels (integers representing classes).
    - y_pred: Predicted probabilities for each class (2D array).

    Returns:
    - log_loss: The computed log loss.
    """
    # Clip predicted probabilities to avoid log(0) or log(1)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    # Convert y_true to one-hot encoding
    y_true_one_hot = np.eye(y_pred.shape[1])[y_true]
    
    loss = -np.mean(np.sum(y_true_one_hot * np.log(y_pred), axis=1))
    
    return loss

In [8]:
def accuracy_score(y_true, y_pred):
    """
    Compute the accuracy score for classification.

    Parameters:
    - y_true: True labels.
    - y_pred: Predicted labels.

    Returns:
    - accuracy: The computed accuracy score.
    """
    correct = np.sum(y_true == y_pred)
    
    accuracy = correct / len(y_true)
    
    return accuracy

## Comparison of Three Machine Learning Models

In [9]:
# Converting the output string labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

### 1. Baseline model
Learns and Predicts the Relative Frequencies of Classes

In [102]:
baseline_model = DummyClassifier(strategy="stratified")
baseline_model.fit(X_train, y_train_encoded)

y_pred = baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred)
log_score = log_loss(y_test_encoded, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Log Loss: {log_score}")

Accuracy: 0.4336870026525199
Log Loss: 20.086543036032275


### 2. Logistic Regression

In [10]:
log_reg = LogisticRegression(max_iter=1500)  
log_reg.fit(X_train, y_train_encoded)

y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred)
log_score = log_loss(y_test_encoded, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Log Loss: {log_score}")

Accuracy: 0.7427055702917772
Log Loss: 0.6514255329754493


### 3. Random Forest

In [24]:
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# to generate all combinations of hyperparameters
keys, values = zip(*param_grid.items())
param_combinations = [dict(zip(keys, v)) for v in product(*values)]


#### Approach 1: Optimizing training fold performance

In [25]:
def flat_cv(X, y, param_grid, n_splits=5):
    cv = KFold(n_splits=n_splits)
    acc_max = 0.0
    log_loss_min = float('inf')
    best_params = None
    
    for params in param_grid:
        acc = 0.0
        log_loss_total = 0.0
        
        for train_idx, test_idx in cv.split(X):
            X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
            y_train_cv, y_test_cv = y[train_idx], y[test_idx]
            
            model = RandomForestClassifier(**params)
            model.fit(X_train_cv, y_train_cv)
            
            y_pred_proba = model.predict_proba(X_test_cv)
            y_pred = model.predict(X_test_cv)
            
            acc += accuracy_score(y_test_cv, y_pred)
            log_loss_total += log_loss(y_test_cv, y_pred_proba)
        
        acc_avg = acc / n_splits
        log_loss_avg = log_loss_total / n_splits
        
        # Check if this is the best model based on log loss
        if log_loss_avg < log_loss_min:
            log_loss_min = log_loss_avg
            acc_max = acc_avg
            best_params = params
    
    return acc_max, log_loss_min, best_params

In [26]:
best_accuracy, best_log_loss, best_hyperparams = flat_cv(X_train, y_train_encoded, param_combinations)
print(f"Best Accuracy: {best_accuracy}")
print(f"Best Log Loss: {best_log_loss}")
print(f"Best Hyperparameters: {best_hyperparams}")

# Train the final model with the best hyperparameters on the entire training set
final_model = RandomForestClassifier(**best_hyperparams)
final_model.fit(X_train, y_train_encoded)

# Evaluate on the test set
y_test_pred = final_model.predict(X_test)
y_test_pred_proba = final_model.predict_proba(X_test)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
test_log_loss = log_loss(y_test_encoded, y_test_pred_proba)

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Log Loss: {test_log_loss}")

Best Accuracy: 0.7596748512866933
Best Log Loss: 0.6355365048802055
Best Hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}
Test Accuracy: 0.7718832891246684
Test Log Loss: 0.6016953389650355


#### Approach 2: Nested cross-validation

In [None]:
for train_idx, test_idx in outer_cv.split(X_train):
    X_train_cv, X_test_cv = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_cv, y_test_cv = y_train_encoded[train_idx], y_train_encoded[test_idx]
    
    # Inner cross-validation for parameter tuning
    grid_search = GridSearchCV(model, param_grid, cv=inner_cv, scoring='neg_log_loss')
    grid_search.fit(X_train_cv, y_train_cv)
    
    # Best model from inner CV
    best_model = grid_search.best_estimator_
    
    # Evaluate on the test fold
    y_pred_prob = best_model.predict_proba(X_test_cv)
    y_pred = best_model.predict(X_test_cv)
    
    # Store metrics
    results['nested_cv']['log_loss'].append(log_loss(y_test_cv, y_pred_prob))
    results['nested_cv']['accuracy'].append(accuracy_score(y_test_cv, y_pred))

## Analysis of results

In [None]:
# Compute mean and standard deviation of metrics
for approach in results:
    for metric in results[approach]:
        mean_score = np.mean(results[approach][metric])
        std_score = np.std(results[approach][metric])
        print(f"{approach} - {metric}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")