# Model evaluation 
Part 1


In [67]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [68]:
file_path = 'data/dataset.csv'
df = pd.read_csv(file_path, sep=';', header=0)

df.head()

Unnamed: 0,ShotType,Competition,PlayerType,Transition,TwoLegged,Movement,Angle,Distance
0,above head,U14,F,1,1,no,73.83,0.73
1,layup,U14,F,0,1,no,28.13,1.02
2,above head,U14,F,0,1,no,51.88,7.22
3,above head,U14,F,1,1,no,80.84,3.64
4,above head,U14,F,0,1,no,30.89,7.2


In [69]:
null_counts = df.isnull().sum()
print(null_counts)

ShotType       0
Competition    0
PlayerType     0
Transition     0
TwoLegged      0
Movement       0
Angle          0
Distance       0
dtype: int64


In [70]:
categorical_columns = df.select_dtypes(include=['object']).columns
unique_classes = df[categorical_columns].nunique()
print("Number of unique classes in each categorical column:")
print(unique_classes)

Number of unique classes in each categorical column:
ShotType       6
Competition    5
PlayerType     3
Movement       3
dtype: int64


In [71]:
categorical_columns = ["Competition", "PlayerType", "Movement"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [74]:
y = df_encoded['ShotType']
X = df_encoded.drop(columns=['ShotType'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Helper functions

In [75]:
def log_loss(y_true, y_pred):
    """
    Compute the log loss (logarithmic loss) for multiclass classification.

    Parameters:
    - y_true: True labels (integers representing classes).
    - y_pred: Predicted probabilities for each class (2D array).

    Returns:
    - log_loss: The computed log loss.
    """
    # Clip predicted probabilities to avoid log(0) or log(1)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    # Convert y_true to one-hot encoding
    y_true_one_hot = np.eye(y_pred.shape[1])[y_true]
    
    loss = -np.mean(np.sum(y_true_one_hot * np.log(y_pred), axis=1))
    
    return loss

In [76]:
def accuracy_score(y_true, y_pred):
    """
    Compute the accuracy score for classification.

    Parameters:
    - y_true: True labels.
    - y_pred: Predicted labels.

    Returns:
    - accuracy: The computed accuracy score.
    """
    correct = np.sum(y_true == y_pred)
    
    accuracy = correct / len(y_true)
    
    return accuracy

### Compare 3 models

#### 1. Baseline - learns and predicts the relative frequencies of classes

In [77]:
# Convert string labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [78]:
baseline_model = DummyClassifier(strategy="stratified")
baseline_model.fit(X_train, y_train_encoded)

y_pred = baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred)
log_score = log_loss(y_test_encoded, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Log Loss: {log_score}")

Accuracy: 0.42639257294429705
Log Loss: 19.811698661536962


#### 2. Logistic Regression

In [80]:
log_reg = LogisticRegression(max_iter=1500)  
log_reg.fit(X_train, y_train_encoded)

y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred)
log_score = log_loss(y_test_encoded, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Log Loss: {log_score}")

Accuracy: 0.7427055702917772
Log Loss: 0.6514255329754493
