In [25]:
import pandas as pd
import json
import os

In [26]:
def process_odds_data(json_data):
    """
    Flatten match dicts into a wide table with one row per match.

    Every match contributes its "id" (stored as "match_id"), "commence_time",
    "home_team" and "away_team". For each entry under the match's
    "bookmakers", three columns are added, named "<title>_home_odds",
    "<title>_away_odds" and "<title>_draw_odds", filled from the outcomes of
    that bookmaker's "h2h" market(s).

    Args:
        json_data: Iterable of match dicts. Each must provide the keys "id",
            "commence_time", "home_team", "away_team" and "bookmakers"; each
            bookmaker must provide "title" and "markets"; each market must
            provide "key" (and "outcomes" when the key is "h2h").

    Returns:
        pd.DataFrame: One row per match. A bookmaker's price is None when it
        lists no h2h outcome for that side.
    """

    def h2h_prices(bookie, home, away):
        """Return the (home, away, draw) h2h prices quoted by one bookmaker."""
        home_price = away_price = draw_price = None
        # Markets whose key is not "h2h" are skipped entirely.
        for h2h in (m for m in bookie["markets"] if m["key"] == "h2h"):
            for quote in h2h["outcomes"]:
                label = quote["name"]
                # An outcome is matched by team name first, then by the
                # literal "Draw" label; a later match overwrites an earlier one.
                if label == home:
                    home_price = quote["price"]
                elif label == away:
                    away_price = quote["price"]
                elif label == "Draw":
                    draw_price = quote["price"]
        return home_price, away_price, draw_price

    rows = []
    for event in json_data:
        row = {
            "match_id": event["id"],
            "commence_time": event["commence_time"],
            "home_team": event["home_team"],
            "away_team": event["away_team"],
        }
        home, away = row["home_team"], row["away_team"]

        for bookie in event["bookmakers"]:
            title = bookie["title"]
            home_price, away_price, draw_price = h2h_prices(bookie, home, away)

            # Widen the row with this bookmaker's three price columns.
            row.update({
                f"{title}_home_odds": home_price,
                f"{title}_away_odds": away_price,
                f"{title}_draw_odds": draw_price,
            })

        rows.append(row)

    return pd.DataFrame(rows)



In [27]:

def process_all_files(directory_path, function):
    """
    Run `function` on every 'events_*.json' file in a directory and stack the results.

    Files are processed in sorted filename order. Each file is parsed as JSON,
    passed to `function`, and the DataFrame it returns gets a 'source_file'
    column holding the originating filename before being concatenated.

    Args:
        directory_path: Directory to scan for files whose names start with
            'events_' and end with '.json'.
        function: Callable that takes the parsed JSON of one file and returns
            a pandas DataFrame.

    Returns:
        pd.DataFrame: All per-file DataFrames concatenated with a fresh
        0..n-1 index.

    Raises:
        ValueError: If the directory contains no matching file.
    """
    # Get a sorted list of JSON files in the directory
    json_files = sorted(
        name for name in os.listdir(directory_path)
        if name.startswith('events_') and name.endswith('.json')
    )

    # Fail early with a message that names the directory, instead of letting
    # pd.concat raise its generic "No objects to concatenate" error.
    if not json_files:
        raise ValueError(
            f"No 'events_*.json' files found in {directory_path!r}"
        )

    all_dataframes = []
    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Apply the function to process the JSON data
        df = function(json_data)

        # Add a column to identify the source file
        df['source_file'] = json_file

        all_dataframes.append(df)

    # Combine all DataFrames into one
    return pd.concat(all_dataframes, ignore_index=True)

# Folder that is scanned for 'events_*.json' files
directory_path = "../data/CSV's/raw/grouped_events/"

# Parse each events file with process_odds_data and stack the per-file tables
combined_dataframe = process_all_files(
    directory_path=directory_path,
    function=process_odds_data,
)

# Display the combined table as the cell output
combined_dataframe

Unnamed: 0,match_id,commence_time,home_team,away_team,Unibet_home_odds,Unibet_away_odds,Unibet_draw_odds,Sky Bet_home_odds,Sky Bet_away_odds,Sky Bet_draw_odds,...,Kwiff_draw_odds,Betfair Sportsbook_home_odds,Betfair Sportsbook_away_odds,Betfair Sportsbook_draw_odds,Grosvenor_home_odds,Grosvenor_away_odds,Grosvenor_draw_odds,Smarkets_home_odds,Smarkets_away_odds,Smarkets_draw_odds
0,2dd4a4f8663e6f835226a5209c614a60,2020-06-17T17:00:00Z,Aston Villa,Sheffield United,3.35,2.32,3.25,3.10,2.25,3.30,...,,,,,,,,,,
1,b1e029a0d989b4c11e843204003044f9,2020-06-17T19:15:00Z,Manchester City,Arsenal,1.35,8.50,5.60,1.36,7.50,5.25,...,,,,,,,,,,
2,59d68295dc2213634772cd941c91fa11,2020-06-19T19:15:00Z,Tottenham Hotspur,Manchester United,2.75,2.60,3.30,2.70,2.50,3.40,...,,,,,,,,,,
3,88352746f45f6beb4e2cb662d9414d0f,2020-06-20T11:30:00Z,Watford,Leicester City,3.40,2.15,3.45,3.25,2.20,3.40,...,,,,,,,,,,
4,065ae59da20562892de52b7f5598ecbf,2020-06-20T16:30:00Z,West Ham United,Wolverhampton Wanderers,3.50,2.15,3.35,3.30,2.20,3.30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29691,4cdaad5e33d4d13e8992d35e4e0bf7dc,2025-01-15T19:30:00Z,Leicester City,Crystal Palace,,,,3.20,2.05,3.60,...,,,,,,,,3.05,2.02,3.40
29692,c88ab8eae714e18c07dff785f0dce8f7,2025-01-15T19:30:00Z,Newcastle United,Wolverhampton Wanderers,1.49,6.25,4.60,1.53,5.25,4.33,...,,,,,1.48,6.25,4.6,1.50,5.00,4.00
29693,5688a6d82f46a249c51caa737b98227e,2025-01-15T20:00:00Z,Arsenal,Tottenham Hotspur,1.43,6.75,4.90,1.44,6.25,4.60,...,,,,,1.43,6.75,4.9,1.39,5.80,4.40
29694,1ccb012b593a7b3ef8390cc30674add9,2025-01-16T19:30:00Z,Ipswich Town,Brighton and Hove Albion,4.10,1.83,3.80,4.00,1.80,3.75,...,,,,,4.10,1.83,3.8,3.90,1.74,3.55


In [28]:
def load_and_concatenate_premier_league_data(directory: str) -> pd.DataFrame:
    """
    Loads multiple CSV files from a directory, extracts specific columns, and concatenates them into a single DataFrame.

    A file is picked up when its name ends with '.csv', its first two
    characters are digits, and it contains a '-'. Files are read in sorted
    filename order.

    Args:
        directory (str): The directory containing the CSV files.

    Returns:
        pd.DataFrame: A DataFrame containing concatenated data with only the
        required columns ('Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR') and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no file in the directory matches the naming pattern.
            pandas also raises ValueError when a matching file lacks one of
            the required columns.
    """
    # Define the columns to extract
    required_columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR']

    # Select the files that match the naming pattern, in sorted order
    matching_files = [
        filename for filename in sorted(os.listdir(directory))
        if filename.endswith('.csv') and filename[:2].isdigit() and '-' in filename
    ]

    # Fail early with a message that names the directory, instead of letting
    # pd.concat raise its generic "No objects to concatenate" error.
    if not matching_files:
        raise ValueError(
            f"No CSV files matching the expected naming pattern found in {directory!r}"
        )

    # Load each CSV file, keeping only the required columns
    dataframes = [
        pd.read_csv(os.path.join(directory, filename), usecols=required_columns)
        for filename in matching_files
    ]

    # Concatenate all DataFrames in the list
    return pd.concat(dataframes, ignore_index=True)

# Folder that is scanned for the season CSV files
directory_csv = "../data/CSV's/"

# Load every matching CSV and stack them into one results table
results_data = load_and_concatenate_premier_league_data(
    directory=directory_csv,
)

# Display the results table as the cell output
results_data

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR
0,09/08/2019,20:00,Liverpool,Norwich,H
1,10/08/2019,12:30,West Ham,Man City,A
2,10/08/2019,15:00,Bournemouth,Sheffield United,D
3,10/08/2019,15:00,Burnley,Southampton,H
4,10/08/2019,15:00,Crystal Palace,Everton,D
...,...,...,...,...,...
2044,08/12/2024,14:00,Fulham,Arsenal,D
2045,08/12/2024,14:00,Ipswich,Bournemouth,A
2046,08/12/2024,14:00,Leicester,Brighton,D
2047,08/12/2024,16:30,Tottenham,Chelsea,A
