**Random Forest:**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load the dataset.
# NOTE(review): absolute path (presumably a Colab session directory) -- change
# DATA_PATH when running the notebook anywhere else.
DATA_PATH = '/content/Premier_League_Date_Combined_Modified_2.0 - Training Set.xlsx'
training_data = pd.read_excel(DATA_PATH)

# Define the feature columns
features = [
    'LSPR',    # Last Season Possession Ratio
    'LSGFD',   # Last Season Goals For Difference
    'LSGAD',   # Last Season Goals Against Difference
    'LSYCD',   # Last Season Yellow Cards Difference
    'LSPD',    # Last Season Penalty Difference
    'LSSPR',   # Last Season Save Percentage Ratio
    'LSCSPR',  # Last Season Clean Sheet Percentage Ratio
    'R5PD',    # Recent 5 Games Points Difference
    'R5GFD',   # Recent 5 Games Goals For Difference
    'R5GAD',   # Recent 5 Games Goals Against Difference
    'TSSD',    # This Season Squad Difference
    'TSAD',    # This Season Age (Average) Difference
    'TSFD',    # This Season Foreigners Difference
    'TSTMR',   # This Season Total Market Ratio
    'R3VATGD', # Recent 3 Versus Away Team Goals Difference
    'R3VATP'   # Recent 3 Versus Away Team Points
]

# Fail fast, with a readable message, if the sheet lacks any column that the
# code below selects by name: the features themselves, 'Season' (used to split
# the folds) and 'Outcome_Label' (the prediction target).
required_columns = features + ['Season', 'Outcome_Label']
missing_columns = [col for col in required_columns if col not in training_data.columns]
if missing_columns:
    raise KeyError(f"Training set is missing expected columns: {missing_columns}")

# Define the seasons for sliding window cross-validation (chronological order
# matters: folds are built by position in this list).
seasons = ['2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Candidate numbers of trees to compare; one summary dict per candidate is
# appended to `results` by the tuning loop.
n_estimators_values = [25, 50, 75, 100, 125, 150, 175, 200]
results = []

# Fixed seed so the comparison between n_estimators values is repeatable.
# Without it every run draws different bootstrap samples / feature subsets and
# the "best" value can change from one execution to the next.
RANDOM_STATE = 42

# Fold layout: train on TRAIN_WINDOW consecutive seasons, skip GAP_SEASONS
# season(s), validate on the season after the gap.
# NOTE(review): GAP_SEASONS = 1 reproduces the original indexing
# (train = seasons[i-2:i], validate = seasons[i+1]), which leaves season i
# unused in every fold. If the validation season was meant to follow the
# training window directly, set GAP_SEASONS = 0 (or widen TRAIN_WINDOW to 3).
TRAIN_WINDOW = 2
GAP_SEASONS = 1

# The folds do not depend on n_estimators, so slice the DataFrame once up front
# instead of re-filtering it for every candidate value.
season_column = training_data['Season']
folds = []
for start in range(len(seasons) - TRAIN_WINDOW - GAP_SEASONS):
    train_seasons = seasons[start:start + TRAIN_WINDOW]
    test_season = seasons[start + TRAIN_WINDOW + GAP_SEASONS]

    train_mask = season_column.isin(train_seasons)
    test_mask = season_column == test_season

    folds.append((
        training_data.loc[train_mask, features],
        training_data.loc[train_mask, 'Outcome_Label'],
        training_data.loc[test_mask, features],
        training_data.loc[test_mask, 'Outcome_Label'],
    ))

for n_estimators in n_estimators_values:
    print(f"Testing with n_estimators = {n_estimators}")
    # warm_start is left at its default, so each fit() call below retrains the
    # forest from scratch and one estimator object can be reused across folds.
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=7, random_state=RANDOM_STATE)

    fold_results = []

    for X_train, y_train, X_test, y_test in folds:
        # Train on the window, then record plain accuracy on the held-out season
        # (score() on a classifier is the mean accuracy of predict(X_test)).
        model.fit(X_train, y_train)
        fold_results.append(model.score(X_test, y_test))

    # Average results for this n_estimators value
    mean_accuracy = np.mean(fold_results)
    results.append({'n_estimators': n_estimators, 'Mean Accuracy': mean_accuracy})
    print(f"Mean Accuracy for n_estimators={n_estimators}: {mean_accuracy}")

# Find the best n_estimators value (idxmax returns the first row on ties, i.e.
# the smallest candidate among equally accurate ones).
results_df = pd.DataFrame(results)
best_result = results_df.loc[results_df['Mean Accuracy'].idxmax()]
# Selecting a single row yields one Series for both columns, so the integer
# n_estimators is upcast to float next to the float accuracy; cast it back so
# the message reads "50" rather than "50.0".
best_n_estimators = int(best_result['n_estimators'])
print(f"\nBest n_estimators Value: {best_n_estimators} with Mean Accuracy: {best_result['Mean Accuracy']}")


Testing with n_estimators = 25
Mean Accuracy for n_estimators=25: 0.513157894736842
Testing with n_estimators = 50
Mean Accuracy for n_estimators=50: 0.5315789473684212
Testing with n_estimators = 75
Mean Accuracy for n_estimators=75: 0.5217105263157895
Testing with n_estimators = 100
Mean Accuracy for n_estimators=100: 0.5256578947368421
Testing with n_estimators = 125
Mean Accuracy for n_estimators=125: 0.5236842105263158
Testing with n_estimators = 150
Mean Accuracy for n_estimators=150: 0.5269736842105264
Testing with n_estimators = 175
Mean Accuracy for n_estimators=175: 0.5210526315789474
Testing with n_estimators = 200
Mean Accuracy for n_estimators=200: 0.5197368421052632

Best n_estimators Value: 50.0 with Mean Accuracy: 0.5315789473684212


In [None]:
# Train the final model on the entire training set using the n_estimators value
# chosen by the tuning cell (`best_result`), rather than a hand-copied number
# that can drift out of sync with the tuning output.
X_train = training_data[features]
y_train = training_data['Outcome_Label']
# best_result is a float-typed row, so cast: the estimator needs an integer
# number of trees.
best_n_estimators = int(best_result['n_estimators'])
# Seeded so the fitted forest (and the matrix below) is reproducible.
model = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=7, random_state=42)
model.fit(X_train, y_train)

# Predict on the training data
y_pred = model.predict(X_train)

# Evaluate the model on the training data.
# This is an in-sample check only: the model has already seen these rows, so
# the matrix is optimistic and says nothing about generalisation.
# Rows are true labels, columns are predicted labels.
training_confusion_matrix = confusion_matrix(y_train, y_pred)

print("Training Confusion Matrix:")
print(training_confusion_matrix)

Training Confusion Matrix:
[[ 585    0  274]
 [ 173   83  368]
 [ 130    0 1047]]
