In [40]:
import pandas as pd
import os
import unicodedata
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

In [41]:
def union_csv_files(folder_path, csv_files):
    df_list = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        print(f"Reading {file_path}")
        try:
            df = pd.read_csv(file_path, delimiter=',', on_bad_lines='skip')
            df_list.append(df)
        except pd.errors.ParserError as e:
            print(f"Error reading {file_path}: {e}")
    
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

In [42]:
def read_all_csvs_in_folder(folder_path):
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]
    df_list = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        print(f"Reading {file_path}")
        try:
            df = pd.read_csv(file_path, delimiter=',', on_bad_lines='skip')
            df_list.append(df)
        except pd.errors.ParserError as e:
            print(f"Error reading {file_path}: {e}")
    
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

In [43]:
def random_forest_model(features, labels):
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
    
    # Initialize and train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = rf_model.predict(X_test)
    
    # Generate and print the classification report
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)
    
    # Print the accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')
    
    return rf_model, report, accuracy

In [44]:
folder_path = 'D:/tcc_predictve_models'
combined_df = read_all_csvs_in_folder(folder_path)

Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2013.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2014.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2015.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2016.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2017.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2018.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2019.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2020.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2021.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2022.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2023.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2024.csv


In [45]:
combined_df.head()

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name
0,1369517400,May 25 2013 - 9:30pm,complete,11099.0,Vasco da Gama,Portuguesa,,1,0.0,0.0,...,1.7,3.8,5.6,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Club de Regatas Vasco da Gama
1,1369517400,May 25 2013 - 9:30pm,complete,8955.0,Vitória,Internacional,,1,0.0,0.0,...,2.85,3.34,2.63,0.0,0.0,0.0,0.0,0.0,0.0,Arena Fonte Nova
2,1369526400,May 26 2013 - 12:00am,complete,29295.0,Corinthians,Botafogo,,1,0.0,0.0,...,1.99,3.5,4.12,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Municipal Paulo Machado de Carvalho (S...
3,1369594800,May 26 2013 - 7:00pm,complete,9560.0,Grêmio,Náutico,,1,0.0,0.0,...,1.5,4.38,7.38,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Alfredo Jaconi
4,1369594800,May 26 2013 - 7:00pm,complete,6267.0,Ponte Preta,São Paulo,,1,0.0,0.0,...,3.18,3.33,2.41,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Moisés Lucarelli


In [46]:
features_df = combined_df[['home_team_name', 'away_team_name', 'home_team_goal_count', 'away_team_goal_count', 'home_team_shots', 'home_team_shots_on_target', 'away_team_shots_on_target', 'away_team_fouls']]

features_df['result'] = features_df.apply(
    lambda row: (
        'home winner' if row['home_team_goal_count'] > row['away_team_goal_count'] else 
        ('away winner' if row['home_team_goal_count'] < row['away_team_goal_count'] else 'draw')
    ) if pd.notnull(row['home_team_goal_count']) and pd.notnull(row['away_team_goal_count']) else np.nan,
    axis=1
)

mapping = {'home winner': 0, 'away winner': 1, 'draw': 2}

# Apply the mapping to the column
features_df['result_code'] = features_df['result'].map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['result'] = features_df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['result_code'] = features_df['result'].map(mapping)


In [47]:
# Dummy encode the 'home_team_code' and 'away_team_code' columns
dummies = pd.get_dummies(features_df[['home_team_name', 'away_team_name']], prefix=['home_team', 'away_team'], drop_first=True)

# Convert dummy variables to 0 and 1
dummies = dummies.astype(int)

In [48]:
# Drop the original columns and concatenate the dummy variables
features_df = pd.concat([features_df.drop(['home_team_name', 'away_team_name'], axis=1), dummies], axis=1)

In [49]:
def clean_column_names(df):
    def clean_name(name):
        # Convert to lowercase
        name = name.lower()
        # Replace spaces with underscores
        name = name.replace(' ', '_')
        # Remove accents
        name = ''.join(
            (c for c in unicodedata.normalize('NFD', name) 
            if unicodedata.category(c) != 'Mn')
        )
        # Replace any remaining non-alphanumeric characters with underscores
        name = re.sub(r'\W+', '_', name)
        return name

    # Apply the cleaning function to all column names
    df.columns = [clean_name(col) for col in df.columns]
    return df

In [50]:
# Apply the function to the DataFrame
features_df = clean_column_names(features_df)

In [51]:
labels = features_df['result_code']
features_df = features_df.drop(['result', 'result_code'], axis=1)

In [52]:
# Assuming 'features' and 'labels' are already defined
X_train, X_test, y_train, y_test = train_test_split(features_df, labels, test_size=0.3, random_state=42)

param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

xgboost = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
grid_search = GridSearchCV(estimator=xgboost, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")

best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.7}
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       636
           1       1.00      1.00      1.00       336
           2       1.00      1.00      1.00       374

    accuracy                           1.00      1346
   macro avg       1.00      1.00      1.00      1346
weighted avg       1.00      1.00      1.00      1346

Accuracy: 1.00
