In [1]:
import pandas as pd
import os


In [2]:
def union_csv_files(folder_path, csv_files):
    """Read the named CSV files from a folder and stack them into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory that contains the files.
    csv_files : iterable of str
        File names (relative to ``folder_path``) to read. Rows appear in the
        result in the order the files are listed.

    Returns
    -------
    pandas.DataFrame
        Rows of every successfully read file, concatenated with a fresh
        0..n-1 index. An empty DataFrame is returned when nothing could be
        loaded (including when ``csv_files`` is empty).

    Notes
    -----
    Malformed lines inside a file are dropped by pandas
    (``on_bad_lines='skip'``). A file that cannot be parsed at all, or that is
    completely empty, is reported on stdout and skipped. Any other error
    (for example a missing file) propagates to the caller.
    """
    frames = []
    for name in csv_files:
        file_path = os.path.join(folder_path, name)
        print(f"Reading {file_path}")
        try:
            frames.append(pd.read_csv(file_path, delimiter=',', on_bad_lines='skip'))
        except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
            # EmptyDataError (zero-byte file) is not a ParserError, so it has
            # to be listed explicitly to get the same skip-and-report handling.
            print(f"Error reading {file_path}: {e}")

    # pd.concat raises ValueError ("No objects to concatenate") on an empty
    # list, so return an empty frame explicitly when nothing was loaded.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [3]:
def read_all_csvs_in_folder(folder_path):
    """Load every ``.csv`` file found directly in ``folder_path`` into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends with ``.csv``
        (case-sensitive) are read; sub-folders are not searched.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, concatenated in file-name order with a fresh
        0..n-1 index. An empty DataFrame is returned when the folder contains
        no ``.csv`` files.

    Notes
    -----
    Reading and concatenation are delegated to ``union_csv_files`` so the two
    entry points cannot drift apart.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the combined frame is the same on every run and platform.
    csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

    # Nothing to read: pd.concat cannot combine an empty list, so stop here.
    if not csv_files:
        return pd.DataFrame()

    return union_csv_files(folder_path, csv_files)

In [9]:
# Folder that holds the CSV files to combine. It can be overridden through the
# TCC_DATA_DIR environment variable so the notebook also runs on machines where
# the data lives elsewhere; the original location remains the default.
folder_path = os.environ.get('TCC_DATA_DIR', 'D:/tcc_predictve_models')
combined_df = read_all_csvs_in_folder(folder_path)

Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2013.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2014.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2015.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2016.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2017.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2018.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2019.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2020.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2021.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2022.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2023.csv
Reading D:/tcc_predictve_models\campeonato_brasileiro_serie_a_2024.csv


In [10]:
# Preview the first five rows of the combined frame.
combined_df.head(n=5)

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name
0,1369517400,May 25 2013 - 9:30pm,complete,11099.0,Vasco da Gama,Portuguesa,,1,0.0,0.0,...,1.7,3.8,5.6,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Club de Regatas Vasco da Gama
1,1369517400,May 25 2013 - 9:30pm,complete,8955.0,Vitória,Internacional,,1,0.0,0.0,...,2.85,3.34,2.63,0.0,0.0,0.0,0.0,0.0,0.0,Arena Fonte Nova
2,1369526400,May 26 2013 - 12:00am,complete,29295.0,Corinthians,Botafogo,,1,0.0,0.0,...,1.99,3.5,4.12,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Municipal Paulo Machado de Carvalho (S...
3,1369594800,May 26 2013 - 7:00pm,complete,9560.0,Grêmio,Náutico,,1,0.0,0.0,...,1.5,4.38,7.38,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Alfredo Jaconi
4,1369594800,May 26 2013 - 7:00pm,complete,6267.0,Ponte Preta,São Paulo,,1,0.0,0.0,...,3.18,3.33,2.41,0.0,0.0,0.0,0.0,0.0,0.0,Estádio Moisés Lucarelli


In [11]:
# Columns to remove from the combined frame (order kept as originally written).
# NOTE(review): 'away_team_red_cards' is not listed although
# 'home_team_red_cards' and both yellow-card columns are — confirm that keeping
# it is intentional.
columns_to_drop = [
    'timestamp', 'status', 'date_GMT', 'attendance', 'referee', 'Game Week',
    'Pre-Match PPG (Home)', 'Pre-Match PPG (Away)', 'home_ppg', 'away_ppg',
    'total_goal_count', 'total_goals_at_half_time',
    'home_team_goal_count_half_time', 'away_team_goal_count_half_time',
    'home_team_corner_count', 'away_team_corner_count',
    'home_team_yellow_cards', 'home_team_red_cards', 'away_team_yellow_cards',
    'stadium_name',
    'btts_percentage_pre_match',
    'over_15_percentage_pre_match', 'over_25_percentage_pre_match',
    'over_35_percentage_pre_match', 'over_45_percentage_pre_match',
    'over_15_HT_FHG_percentage_pre_match', 'over_05_HT_FHG_percentage_pre_match',
    'over_15_2HG_percentage_pre_match', 'over_05_2HG_percentage_pre_match',
    'odds_ft_home_team_win', 'odds_ft_draw', 'odds_ft_away_team_win',
    'odds_ft_over15', 'odds_ft_over25', 'odds_ft_over35', 'odds_ft_over45',
    'odds_btts_yes', 'odds_btts_no',
    'home_team_goal_timings', 'away_team_goal_timings',
]

# Reassign instead of dropping in place, and ignore names that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
combined_df = combined_df.drop(columns=columns_to_drop, errors='ignore')

In [12]:
# Show the column labels currently present in combined_df.
combined_df.columns

Index(['home_team_name', 'away_team_name', 'home_team_goal_count',
       'away_team_goal_count', 'away_team_red_cards',
       'home_team_first_half_cards', 'home_team_second_half_cards',
       'away_team_first_half_cards', 'away_team_second_half_cards',
       'home_team_shots', 'away_team_shots', 'home_team_shots_on_target',
       'away_team_shots_on_target', 'home_team_shots_off_target',
       'away_team_shots_off_target', 'home_team_fouls', 'away_team_fouls',
       'home_team_possession', 'away_team_possession',
       'Home Team Pre-Match xG', 'Away Team Pre-Match xG', 'team_a_xg',
       'team_b_xg', 'average_goals_per_match_pre_match',
       'average_corners_per_match_pre_match',
       'average_cards_per_match_pre_match'],
      dtype='object')