# XGBoost Training
This notebook trains an XGBoost regression model to predict the target variables in the soccer dataset. It covers data preparation, model training, and evaluation, followed by feature-importance and loss-curve plots.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style: apply seaborn's 'whitegrid' theme once, up front,
# so the matplotlib figures drawn in later cells share the same look.
sns.set(style='whitegrid')

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Project-local preprocessing helper (lives in src/data/preprocessing.py).
# Kept at the top of the cell so the dependency is visible before any work runs.
# NOTE(review): this import presumably needs the project root on sys.path — confirm
# how the notebook is launched.
from src.data.preprocessing import preprocess_data

# Load the raw dataset. The path is relative to the notebook's working directory.
df = pd.read_csv('../data/final/final.csv')

# Clean the raw frame; `df` stays bound to the untouched raw data.
df_clean = preprocess_data(df)

# Columns the model is trained to predict.
TARGETS = [
    'Per_90_Minutes_npxG', 'Per_90_Minutes_xG', 'Standard_Sh/90'
]

# Columns considered as model inputs.
CANDIDATE_FEATURES = [
    'Playing_Time_Min', 'Playing_Time_90s', 'Starts_Starts',
    'Per_90_Minutes_Gls', 'Per_90_Minutes_Ast', 'Per_90_Minutes_xG',
    'Standard_SoT%', 'KP', 'Ast', 'Tkl+Int', 'Blocks_Blocks'
]

# 'Per_90_Minutes_xG' appears in both lists. Feeding a target to the model as an
# input is target leakage: the model can simply copy the column, which inflates
# the reported scores. Any column that is a target is therefore excluded here.
FEATURES = [col for col in CANDIDATE_FEATURES if col not in TARGETS]

# Fail early, with a readable message, if preprocessing did not produce a
# column that the later cells index by name.
missing_columns = [col for col in FEATURES + TARGETS if col not in df_clean.columns]
if missing_columns:
    raise KeyError(f'Columns missing from preprocessed data: {missing_columns}')

In [None]:
# Separate the predictor columns from the columns to be predicted.
X, y = df_clean[FEATURES], df_clean[TARGETS]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up on each side of the split.
print(f'Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}')

In [None]:
# Hyperparameters for the XGBoost regressor, gathered in one place so they are
# easy to read and tweak.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
)

# Build the (still unfitted) model from those settings.
xgb_model = XGBRegressor(**xgb_params)

# Train on the training split. The call is left as the cell's last expression,
# so the notebook displays whatever fit() returns.
xgb_model.fit(X_train, y_train)

In [None]:
# Predict on both splits so train and test performance can be compared.
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

# Mean absolute error on each split.
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# Coefficient of determination on each split.
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# Collect the scores; the bare name on the last line makes the dict the cell output.
results = {
    'Train MAE': train_mae,
    'Test MAE': test_mae,
    'Train R²': train_r2,
    'Test R²': test_r2,
}
results

In [None]:
# Horizontal bar chart of the fitted model's importance score for each input column.
feature_names = X.columns
importance_scores = xgb_model.feature_importances_

plt.figure(figsize=(10, 6))
plt.barh(feature_names, importance_scores, color='skyblue')
plt.title('XGBoost Feature Importance')
plt.xlabel('Feature Importance')
plt.show()

In [None]:
# Loss curves: train/test MSE as a function of the number of boosting rounds.
#
# Changes from the earlier version of this cell:
#   * The curve model gets its own name (`curve_model`). Rebinding `xgb_model`
#     here replaced the tuned, fitted model from the training cell, so re-running
#     the evaluation or feature-importance cells afterwards used a different model.
#   * One model is trained for N_ROUNDS rounds and then evaluated on its first m
#     rounds via `iteration_range`, instead of training N_ROUNDS separate models.
#     Boosting adds trees sequentially, and these settings use no row/column
#     subsampling, so the first m rounds are expected to match a model trained
#     with n_estimators=m.
# NOTE(review): `iteration_range` needs a reasonably recent xgboost (>= 1.4) — confirm.
# NOTE(review): as before, this curve uses XGBoost's default hyperparameters, not
# the tuned ones from the training cell — confirm that is intended.
N_ROUNDS = 100
rounds = range(1, N_ROUNDS + 1)

curve_model = XGBRegressor(n_estimators=N_ROUNDS, random_state=42)
curve_model.fit(X_train, y_train)

train_errors, test_errors = [], []
for m in rounds:
    # Predict using only the first m boosting rounds of the fitted model.
    y_train_predict = curve_model.predict(X_train, iteration_range=(0, m))
    y_test_predict = curve_model.predict(X_test, iteration_range=(0, m))
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    test_errors.append(mean_squared_error(y_test, y_test_predict))

plt.figure(figsize=(12, 6))
plt.plot(rounds, train_errors, label='Train MSE', color='blue')
plt.plot(rounds, test_errors, label='Test MSE', color='red')
plt.title('Loss Curves for XGBoost')
plt.xlabel('Number of Trees')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.show()