In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [None]:
# Read the head-to-head match history CSV into a DataFrame.
# The path is relative to the notebook's working directory; change it if the file lives elsewhere.
df = pd.read_csv(filepath_or_buffer="English Premier League_head_to_head.csv")

In [None]:
def _fill_with_group_median(ranks):
    """Replace missing values in one team's ranking series with that series' median."""
    return ranks.fillna(ranks.median())


# Impute missing rankings per team: home rankings are grouped by home team,
# away rankings by away team.
# NOTE(review): later cells select 'homeClassement' / 'awayClassement' (extra "e");
# confirm which spelling the CSV really uses.
for team_col, rank_col in (
    ('strHomeTeam', 'homeClassment'),
    ('strAwayTeam', 'awayClassment'),
):
    df[rank_col] = df.groupby(team_col)[rank_col].transform(_fill_with_group_median)

# Row count before and after removing rows that still contain any NaN.
print(len(df))
df = df.dropna()
print(len(df))

# Per-column NaN counts; every entry should be 0 after dropna().
print("Check missing values:")
print(df.isnull().sum())

1579
1516
Check missing values:
strLeague        0
strSeason        0
strEvent         0
strHomeTeam      0
intHomeScore     0
strAwayTeam      0
intAwayScore     0
homeClassment    0
awayClassment    0
dtype: int64


In [None]:
# Drop match-metadata columns that are not among the model's feature or target columns.
# errors='ignore' skips any listed column that is absent, so the cell neither
# raises KeyError on a CSV export lacking some of them nor fails when re-run.
df = df.drop(
    columns=['strSeason', 'strCountry', 'strVenue', 'intRound', 'strTime', 'dateEvent'],
    errors='ignore',
)

In [None]:
# Drop the remaining identifier columns that are not used for modelling.
# 'strLeague' is deliberately kept: later cells collapse it into top-league/"Other"
# and feed it to the model as a categorical feature.
# errors='ignore' is needed because 'strSeason' has already been removed by the
# previous drop cell; without it this call raises KeyError.
df = df.drop(columns=['strSeason', 'strEvent'], errors='ignore')

In [None]:
# Sanity check: number of rows, then the NaN count of every column.
n_rows = df.shape[0]
nan_counts = df.isna().sum()
print(n_rows)
print("Check missing values:")
print(nan_counts)

5076
Check missing values:
strLeague          0
strHomeTeam        0
strAwayTeam        0
intHomeScore      68
intAwayScore      68
homeClassement     0
awayClassement     0
dtype: int64


In [None]:
# Discard every row that has at least one missing value
# (the check above reports NaNs only in the two score columns).
df = df.dropna(axis='index', how='any')

In [None]:
# Re-check after dropna(): row count and per-column NaN totals (all expected to be 0).
print(df.shape[0])
print("Check missing values:")
print(df.isna().sum())

5008
Check missing values:
strLeague         0
strHomeTeam       0
strAwayTeam       0
intHomeScore      0
intAwayScore      0
homeClassement    0
awayClassement    0
dtype: int64


In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,strLeague,strHomeTeam,strAwayTeam,intHomeScore,intAwayScore,homeClassement,awayClassement
0,English Premier League,Chelsea,Arsenal,2.0,0.0,2,4
1,English Premier League,Chelsea,Tottenham,2.0,1.0,2,5
2,English Premier League,Chelsea,Liverpool,0.0,1.0,2,6
3,English Premier League,Chelsea,Everton,1.0,1.0,2,7
4,English Premier League,Chelsea,Fulham,1.0,0.0,2,8


In [None]:
# Identify the league that contributes the most rows.
league_counts = df['strLeague'].value_counts()
top_league = league_counts.idxmax()

# Keep the dominant league's label and collapse every other league into "Other".
is_top_league = df['strLeague'] == top_league
df['strLeague'] = df['strLeague'].where(is_top_league, 'Other')

In [None]:
# Show how many rows belong to the top league versus the merged "Other" bucket.
df['strLeague'].value_counts()

Unnamed: 0_level_0,count
strLeague,Unnamed: 1_level_1
English Premier League,4828
Other,180


In [None]:
# Feature and target column selections.
feature_columns = ['strLeague', 'strHomeTeam', 'strAwayTeam', 'homeClassement', 'awayClassement']
target_columns = ['intHomeScore', 'intAwayScore']
X = df[feature_columns]
y = df[target_columns]

# ------------------------------------------------
# Step 1: Carve out three disjoint subsets: training, hold-out validation ("CV"), and test.
# Stage one holds back 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# Stage two takes 25% of the remaining 80% as the validation set,
# i.e. roughly 60% train / 20% validation / 20% test overall.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=0.25
)

In [None]:
# ------------------------------------------------
# Step 2: Build the feature preprocessor.
# Text columns are one-hot encoded; ranking columns are standardised.
categorical_features = ['strLeague', 'strHomeTeam', 'strAwayTeam']
numerical_features = ['homeClassement', 'awayClassement']

# handle_unknown='ignore': a category not seen during fit does not raise at predict time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numerical_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_features),
        ('num', numerical_scaler, numerical_features),
    ]
)

In [None]:
# ------------------------------------------------
# Step 3: Chain preprocessing and the regressor into a single estimator.
# XGBRegressor is wrapped in MultiOutputRegressor so the two score columns
# (home and away) are predicted from one pipeline.
base_regressor = XGBRegressor(objective='reg:squarederror', random_state=42)
multi_target_regressor = MultiOutputRegressor(base_regressor)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', multi_target_regressor),
])

In [None]:
# ------------------------------------------------
# Step 4: Hyperparameter grid intended for an exhaustive grid search.
# Keys carry the "model__estimator__" prefix so they reach the XGBRegressor
# wrapped inside the pipeline's 'model' step.
# NOTE(review): no visible cell passes param_grid to a search; the randomized
# search uses param_dist instead — confirm whether this grid is still needed.
param_grid = {}
param_grid['model__estimator__n_estimators'] = [50, 100, 128]
param_grid['model__estimator__max_depth'] = [3, 5, 6]
param_grid['model__estimator__learning_rate'] = [0.01, 0.05, 0.08]

In [None]:
# NOTE(review): notebook convention is to keep imports in the first cell.
from sklearn.model_selection import RandomizedSearchCV

# ------------------------------------------
# Step 5: Candidate values for the randomized hyperparameter search.
# The "model__estimator__" prefix routes each entry to the XGBRegressor
# wrapped inside the pipeline's 'model' step.
param_dist = {
    'model__estimator__n_estimators': [100, 200, 300, 400],
    'model__estimator__max_depth': [3, 5, 7, 10],
    'model__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__estimator__subsample': [0.6, 0.8, 1.0],
    'model__estimator__colsample_bytree': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=50,        # number of sampled parameter combinations
    cv=5,             # 5-fold cross-validation within the training split
    scoring='r2',
    n_jobs=-1,        # run fits in parallel on all available cores
    random_state=42,  # makes the sampled combinations repeatable
    verbose=1,
)

# Run the search on the training split only; the validation and test splits stay untouched.
random_search.fit(X_train, y_train)

# Report the winning combination.
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'model__estimator__subsample': 0.8, 'model__estimator__n_estimators': 400, 'model__estimator__max_depth': 3, 'model__estimator__learning_rate': 0.01, 'model__estimator__colsample_bytree': 0.6}


In [None]:
# ------------------------------------------------
# Step 6: Evaluate the tuned model on the hold-out validation ("CV") split.
# random_search.predict delegates to best_estimator_, which the search refit
# on X_train only, so X_cv is unseen data at this point.
y_cv_pred = random_search.predict(X_cv)
cv_mse = mean_squared_error(y_cv, y_cv_pred)
cv_r2 = r2_score(y_cv, y_cv_pred)
print("CV Set - MSE:", cv_mse)
print("CV Set - R2 Score:", cv_r2)

# ------------------------------------------------
# Step 7: (Optional) Retrain the best configuration on training+CV data,
# then evaluate on the test set.
# clone() returns an unfitted copy carrying the same hyperparameters. Fitting
# random_search.best_estimator_ directly would overwrite the model stored
# inside random_search, so re-running this cell would score X_cv with a model
# that had already been trained on X_cv.
best_model = clone(random_search.best_estimator_)
best_model.fit(X_train_val, y_train_val)

# X_test was held out of both the search and the refit above.
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("Test Set - MSE:", test_mse)
print("Test Set - R2 Score:", test_r2)

Best Parameters from Grid Search: {'model__estimator__learning_rate': 0.01, 'model__estimator__max_depth': 3, 'model__estimator__n_estimators': 128}
CV Set - MSE: 1.5026934146881104
CV Set - R2 Score: 0.07520127296447754
Test Set - MSE: 1.2831408977508545
Test Set - R2 Score: 0.08491304516792297


In [None]:
import joblib

# Persist the refitted pipeline (preprocessing + model) to disk for later reuse.
joblib.dump(value=best_model, filename="best_xgboost_model.pkl")

['best_xgboost_model.pkl']

In [None]:
import pandas as pd
import joblib  # provides load() for the persisted model

# Restore the pipeline that was saved to "best_xgboost_model.pkl".
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
best_model = joblib.load(filename="best_xgboost_model.pkl")

In [None]:
# Describe the fixture to predict. Edit these four values for another match;
# the model input and the printed message are both built from them, so the
# message can no longer name different teams than the ones actually predicted.
home_team = 'Arsenal'
away_team = 'Chelsea'
home_rank = 4
away_rank = 5

# Single-row frame with the same feature columns the pipeline was trained on.
match_data = pd.DataFrame({
    'strLeague': ['English Premier League'],
    'strHomeTeam': [home_team],
    'strAwayTeam': [away_team],
    'homeClassement': [home_rank],
    'awayClassement': [away_rank],
})

# Make prediction: one row per match, columns ordered like the training
# targets (home score first, away score second).
predicted_score = best_model.predict(match_data)
predicted_home_score, predicted_away_score = predicted_score[0]

# Print result
print(f"Predicted Score: {home_team} {predicted_home_score:.1f} - {predicted_away_score:.1f} {away_team}")

Predicted Score: Arsenal 2.0 - 1.2 Chelsea
