In [22]:
import pandas as pd
import numpy as np
import unicodedata
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# --- Your existing functions ---
def get_csv_data():
    df = pd.read_csv('df_2013_2024_camp_brasileiro.csv')
    return df

def define_target_variable(df):
    df['result'] = df.apply(
        lambda row: (
            'home winner' if row['home_team_goal_count'] > row['away_team_goal_count']
            else 'away winner' if row['home_team_goal_count'] < row['away_team_goal_count']
            else 'draw'
        ),
        axis=1
    )
    mapping = {'home winner': 0, 'away winner': 1, 'draw': 2}
    df['result_code'] = df['result'].map(mapping)

    df = df.drop(columns=['result','home_team_goal_count','away_team_goal_count'], axis=1) 

    return df, mapping

def dummie_categorical_values_fixed(df):
    dummies_variables = pd.get_dummies(df, columns=['home_team_name', 'away_team_name'], drop_first=True, dtype=int) # Added dtype=int
    return dummies_variables

def clean_column_names(df):
    def clean_name(name):
        name = str(name).lower() # Ensure name is string
        name = name.replace(' ', '_')
        name = ''.join(
            (c for c in unicodedata.normalize('NFD', name) 
            if unicodedata.category(c) != 'Mn')
        )
        name = re.sub(r'\W+', '_', name)
        return name
    df.columns = [clean_name(col) for col in df.columns]
    return df

def get_relevant_features(df):
    df_subset = df[['home_team_name', 'away_team_name','home_team_goal_count', 'away_team_goal_count', 'away_team_red_cards', 'home_team_shots', 'home_team_shots_on_target', 'away_team_shots_on_target', 'away_team_fouls']].copy()
    return df_subset

def get_train_test_data():
    df_raw = get_csv_data()
    df_features = get_relevant_features(df_raw)
    df_features, actual_mapping = define_target_variable(df_features)
    df_features = dummie_categorical_values_fixed(df_features)
    df_features = clean_column_names(df_features)
    return df_features, actual_mapping

# --- New XGBoost function ---
def train_and_tune_xgboost(df_processed):
    """
    Trains an XGBoost classifier, finds the best hyperparameters using GridSearchCV,
    and prints the results.

    Args:
        df_processed (pd.DataFrame): The preprocessed DataFrame with features and target.

    Returns:
        xgboost.XGBClassifier: The best trained XGBoost model.
    """
    # 1. Separate features (X) and target (y)
    # Ensure 'result_code' is not in X and is the target
    if 'result_code' not in df_processed.columns:
        raise ValueError("Target column 'result_code' not found in DataFrame.")
    
    if 'result' in df_processed.columns:
        X = df_processed.drop(['result_code', 'result'], axis=1)
    else:
        X = df_processed.drop('result_code', axis=1)
        
    y = df_processed['result_code']
    X.columns = [str(col) for col in X.columns]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
    print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")
    print(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")
    print(f"Test target distribution:\n{y_test.value_counts(normalize=True)}")


    # Expanded parameter grid [1]
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],         # Smaller learning rates often require more n_estimators
        'max_depth': [3, 5, 7, 9],                  # Maximum depth of a tree
        'n_estimators': [100, 200, 300, 400, 500],  # Number of boosting rounds/trees
        'subsample': [0.6, 0.7, 0.8, 0.9],           # Fraction of samples used for fitting the individual base learners
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9],    # Fraction of features used for fitting the individual base learners
        'gamma': [0, 0.1, 0.2, 0.3],                 # Minimum loss reduction required to make a further partition
        'min_child_weight': [1, 3, 5],               # Minimum sum of instance weight (hessian) needed in a child
        'reg_alpha': [0, 0.01, 0.1, 0.5, 1],         # L1 regularization term on weights
        'reg_lambda': [0.1, 0.5, 1, 1.5, 2]          # L2 regularization term on weights
        # 'scale_pos_weight': [1, 2, 3] # See note below regarding this parameter for multi-class
    }
    # Note on scale_pos_weight for multi-class:
    # The 'scale_pos_weight' parameter is primarily designed for binary classification or when one class
    # is disproportionately important. For multi-class problems like yours (3 classes), its behavior
    # in XGBoost's scikit-learn wrapper can be complex, potentially boosting only one class relative
    # to others. Given your class imbalance (home winner: 0.57, away winner: 0.44, draw: 0.36 from
    # previous precision), other methods like computing class weights for the 'sample_weight'
    # parameter in fit(), or using SMOTE, might be more targeted for multi-class imbalance.
    # If you want to experiment, you can add it, but carefully observe its impact on all classes.
    # Example values from your dataset distribution:
    # total_samples = len(y_train)
    # count_home_winner = (y_train == 0).sum()
    # count_away_winner = (y_train == 1).sum()
    # count_draw = (y_train == 2).sum()
    # A simple scale_pos_weight for 'away winner' (class 1) vs others might be:
    # (total_samples - count_away_winner) / count_away_winner if you were treating it as binary.
    # For now, it's commented out to simplify, but you can add it to the grid if you wish to explore.

    num_class = len(y.unique())
    xgb_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=num_class,
        use_label_encoder=False,  # Suppress warning for newer XGBoost versions
        eval_metric='mlogloss',   # Evaluation metric for multi-class classification
        random_state=42
    )

    # Adjust cv and n_jobs based on your computational power
    # cv=5 is a common choice for more robust cross-validation.
    # n_jobs=-1 will use all available CPU cores.
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='accuracy', # You could also try 'f1_macro' or 'f1_weighted' given class imbalance
        cv=3,  # Increase to 5 if you have time/power for more robust evaluation
        n_jobs=-1, # Use all available cores
        verbose=2  # Set to 1 for less output, 2 for more detailed updates
    )

    print("\nStarting GridSearchCV with expanded hyperparameter grid...")
    grid_search.fit(X_train, y_train)

    print("\nGridSearchCV finished.")
    print(f"Best cross-validation score (accuracy): {grid_search.best_score_:.4f}")
    print(f"Best Hyperparameters: {grid_search.best_params_}")

    best_xgb_model = grid_search.best_estimator_
    y_pred_test = best_xgb_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    print(f"\nAccuracy of the best model on the test set: {test_accuracy:.4f}")

    print("\nClassification Report on Test Set:")
    # Ensure your target_names match the mapping: {'home winner': 0, 'away winner': 1, 'draw': 2}
    target_names = sorted(mapping, key=mapping.get) # Dynamically get target names in order of codes
    print(classification_report(y_test, y_pred_test, target_names=target_names))

    return best_xgb_model

# --- How to use it ---
if __name__ == '__main__':
    # First, get your preprocessed data
    df_processed_data, class_mapping_used = get_train_test_data()
    
    # Clean column names again after dummification as new columns are added
    df_processed_data = clean_column_names(df_processed_data) 

    print("\nPreprocessed DataFrame head:")
    print(df_processed_data.head())
    print(f"\nShape of preprocessed DataFrame: {df_processed_data.shape}")
    print(f"\nColumns in preprocessed DataFrame: {df_processed_data.columns.tolist()}")
    
    if 'result_code' in df_processed_data.columns:
        print(f"\nValue counts for target 'result_code':\n{df_processed_data['result_code'].value_counts()}")
        
        # Now, run the XGBoost training and tuning function
        best_model = train_and_tune_xgboost(df_processed_data,class_mapping_used)
        
        # The 'best_model' can now be used for further predictions or saved
        # For example, to save the model:
        # best_model.save_model("best_xgboost_model.json") # [3]
        print("\nBest XGBoost model training and tuning complete.")
    else:
        print("\nError: 'result_code' column not found in the preprocessed data. Cannot proceed with XGBoost training.")




Preprocessed DataFrame head:
   away_team_red_cards  home_team_shots  home_team_shots_on_target  \
0                    1                9                          2   
1                    0               11                          2   
2                    0               10                          6   
3                    0               15                          7   
4                    1               15                          5   

   away_team_shots_on_target  away_team_fouls  result_code  \
0                          2               20            0   
1                          7               10            2   
2                          5               26            2   
3                          4               15            0   
4                          5               25            1   

   home_team_name_atletico_go  home_team_name_atletico_mineiro  \
0                           0                                0   
1                           0               

TypeError: train_and_tune_xgboost() takes 1 positional argument but 2 were given