In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Read the raw match-results CSV into a pandas DataFrame
# The file name points at the 2024-2025 season; the path is relative, so it
# resolves against the directory the notebook kernel is running in.
raw_csv_path = '../data/raw/E0_2024-2025.csv'
df = pd.read_csv(raw_csv_path)

# 2. Encode categorical variables (teams) as numerical codes
# Machine learning models require numeric inputs, so each team name is mapped
# to an integer code.
#
# Both columns are encoded against ONE shared, alphabetically sorted list of
# team names (the union of home and away teams). Encoding each column on its
# own only yields matching codes when both columns contain exactly the same
# set of teams; with a partial file (e.g. early in a season) the same club
# could otherwise receive a different code as home side than as away side.
# Missing team names are excluded from the category list and are encoded
# as -1, which is what `.cat.codes` emits for values outside the categories.
# assumes team names are all strings (sortable) - TODO confirm for other files
all_team_names = pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique()
team_dtype = pd.CategoricalDtype(categories=sorted(all_team_names))

df['HomeTeam_encoded'] = df['HomeTeam'].astype(team_dtype).cat.codes
df['AwayTeam_encoded'] = df['AwayTeam'].astype(team_dtype).cat.codes

In [None]:
# 3. Pick the model inputs and the label to predict
# Inputs (X): the two integer-encoded team columns.
# Label (y): the 'FTR' column, commented in this notebook as the match
# result (H/D/A).
features = ['HomeTeam_encoded', 'AwayTeam_encoded']
target_column = 'FTR'

X = df.loc[:, features]
y = df[target_column]

In [None]:
# 4. Hold out part of the data for evaluation
# test_size=0.2 keeps 20% of the rows for testing and trains on the other 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# 5. Create the Random Forest classifier
# A random forest combines the votes of many decision trees.
# Only the seed is set here; every other hyperparameter keeps its default.
forest_options = {'random_state': 42}
model = RandomForestClassifier(**forest_options)

In [None]:
# 6. Fit the classifier on the training split
# Left as the cell's trailing expression so the notebook still shows the
# fitted estimator as the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# 7. Predict a result label for every held-out row
predictions = model.predict(X=X_test)

In [None]:
# 8. Score the predictions against the held-out labels
# Accuracy is the share of test rows predicted correctly.
# classification_report adds a per-class breakdown of the metrics.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.3f}")

print("\nClassification Report:")
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)