In [31]:
import pandas as pd
import json
import os

In [32]:
def process_odds_data(json_data):
    """
    Flatten a list of match dicts into one wide row per match.

    Every match contributes its id, commence time and the two team names.
    For each bookmaker listed under the match, three columns are added:
    ``<title>_home_odds``, ``<title>_away_odds`` and ``<title>_draw_odds``.
    Prices are read from the bookmaker's ``h2h`` market only; a side for which
    no outcome is found is stored as ``None``.

    Args:
        json_data: Iterable of match dicts with the keys ``id``,
            ``commence_time``, ``home_team``, ``away_team`` and ``bookmakers``.

    Returns:
        pd.DataFrame: One row per distinct ``match_id``; the first occurrence
            of a repeated id is the one that is kept.
    """
    seen_ids = set()
    rows = []

    for event in json_data:
        event_id = event["id"]
        kickoff = event["commence_time"]
        home = event["home_team"]
        away = event["away_team"]

        # Only the first occurrence of a match id is turned into a row.
        if event_id in seen_ids:
            continue
        seen_ids.add(event_id)

        row = {
            "match_id": event_id,
            "commence_time": kickoff,
            "home_team": home,
            "away_team": away,
        }

        for bookie in event["bookmakers"]:
            title = bookie["title"]
            prices = {"home": None, "away": None, "draw": None}

            for market in bookie["markets"]:
                # Anything other than the head-to-head market is ignored.
                if market["key"] != "h2h":
                    continue
                for outcome in market["outcomes"]:
                    if outcome["name"] == home:
                        side = "home"
                    elif outcome["name"] == away:
                        side = "away"
                    elif outcome["name"] == "Draw":
                        side = "draw"
                    else:
                        continue
                    prices[side] = outcome["price"]

            # Column order per bookmaker is home, away, draw.
            for side in ("home", "away", "draw"):
                row[f"{title}_{side}_odds"] = prices[side]

        rows.append(row)

    # The id set already prevents repeats; the drop is kept as a second safeguard.
    return pd.DataFrame(rows).drop_duplicates(subset=["match_id"])

# Try the parser on a single month of events.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open("../data/CSV's/raw/grouped_events/events_2020-06.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

# One wide row per match, with three odds columns per bookmaker.
odds = process_odds_data(data)

odds

Unnamed: 0,match_id,commence_time,home_team,away_team,Unibet_home_odds,Unibet_away_odds,Unibet_draw_odds,Sky Bet_home_odds,Sky Bet_away_odds,Sky Bet_draw_odds,...,Ladbrokes_draw_odds,Paddy Power_home_odds,Paddy Power_away_odds,Paddy Power_draw_odds,Matchbook_home_odds,Matchbook_away_odds,Matchbook_draw_odds,William Hill_home_odds,William Hill_away_odds,William Hill_draw_odds
0,2dd4a4f8663e6f835226a5209c614a60,2020-06-17T17:00:00Z,Aston Villa,Sheffield United,3.35,2.32,3.25,3.1,2.25,3.3,...,3.2,3.2,2.25,3.25,3.2,2.3,3.3,,,
1,b1e029a0d989b4c11e843204003044f9,2020-06-17T19:15:00Z,Manchester City,Arsenal,1.35,8.5,5.6,1.36,7.5,5.25,...,5.2,1.33,7.5,5.5,1.36,8.2,5.7,,,
2,59d68295dc2213634772cd941c91fa11,2020-06-19T19:15:00Z,Tottenham Hotspur,Manchester United,2.75,2.6,3.3,2.7,2.5,3.4,...,3.3,2.7,2.5,3.4,2.76,2.58,3.38,,,
3,88352746f45f6beb4e2cb662d9414d0f,2020-06-20T11:30:00Z,Watford,Leicester City,3.4,2.15,3.45,3.25,2.2,3.4,...,3.4,3.25,2.1,3.4,3.1,2.15,3.42,,,
4,065ae59da20562892de52b7f5598ecbf,2020-06-20T16:30:00Z,West Ham United,Wolverhampton Wanderers,3.5,2.15,3.35,3.3,2.2,3.3,...,3.3,3.4,2.1,3.3,3.54,2.16,3.36,,,
5,18b2fce7810a08864683bfab670e43a7,2020-06-20T18:45:00Z,Bournemouth,Crystal Palace,2.5,2.95,3.25,2.5,2.9,3.2,...,3.1,2.4,2.88,3.2,2.57,2.93,3.28,,,
6,66af1dc708c86c1b0c8ac7d616c4df18,2020-06-21T15:15:00Z,Aston Villa,Chelsea,5.75,1.55,4.3,5.5,1.57,4.2,...,4.2,5.5,1.5,4.33,5.7,1.56,4.35,,,
7,1eae5766b3aae2f12b90d37892675e48,2020-06-22T19:00:00Z,Manchester City,Burnley,1.14,16.0,9.0,1.17,15.0,7.5,...,8.5,1.12,17.0,8.5,1.16,11.0,8.6,,,
8,8c2fa8b258421acfa67b357d3e5e10f7,2020-06-19T17:00:00Z,Norwich City,Southampton,3.2,2.2,3.55,3.25,2.15,3.5,...,3.5,3.1,2.2,3.4,3.18,2.16,3.54,,,
9,d90fa72383b4c47582c997713e8caccb,2020-06-21T13:00:00Z,Newcastle United,Sheffield United,3.45,2.28,3.1,3.3,2.3,3.1,...,3.1,3.3,2.3,3.0,3.46,2.33,3.18,,,


In [33]:

def process_all_files(directory_path, function):
    """
    Run a parser over every ``events_*.json`` file in a directory and stack the results.

    Files are visited in sorted filename order. Each file is decoded with
    ``json.load``, handed to ``function``, and the resulting DataFrame gets a
    ``source_file`` column holding the file name before everything is
    concatenated.

    Args:
        directory_path (str): Directory to scan for ``events_*.json`` files.
        function (callable): Receives the decoded JSON of one file and returns
            a ``pd.DataFrame``.

    Returns:
        pd.DataFrame: All per-file frames concatenated under a fresh 0..n-1
            index. An empty DataFrame is returned when no file matches.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist (raised by
            ``os.listdir``).
    """
    json_files = sorted(
        name for name in os.listdir(directory_path)
        if name.startswith('events_') and name.endswith('.json')
    )

    all_dataframes = []
    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)

        # JSON text is UTF-8 by specification; do not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # assign() builds a new frame, so the object returned by `function`
        # is never modified behind the caller's back.
        all_dataframes.append(function(json_data).assign(source_file=json_file))

    # pd.concat raises "No objects to concatenate" on an empty list.
    if not all_dataframes:
        return pd.DataFrame()

    return pd.concat(all_dataframes, ignore_index=True)

# Folder that holds the monthly events_*.json odds files
directory_path = "../data/CSV's/raw/grouped_events/"

# Parse every matching file with process_odds_data and stack the per-file
# frames into one DataFrame (each row is tagged with its source file name)
combined_dataframe = process_all_files(directory_path, process_odds_data)

combined_dataframe



Unnamed: 0,match_id,commence_time,home_team,away_team,Unibet_home_odds,Unibet_away_odds,Unibet_draw_odds,Sky Bet_home_odds,Sky Bet_away_odds,Sky Bet_draw_odds,...,Kwiff_draw_odds,Betfair Sportsbook_home_odds,Betfair Sportsbook_away_odds,Betfair Sportsbook_draw_odds,Grosvenor_home_odds,Grosvenor_away_odds,Grosvenor_draw_odds,Smarkets_home_odds,Smarkets_away_odds,Smarkets_draw_odds
0,2dd4a4f8663e6f835226a5209c614a60,2020-06-17T17:00:00Z,Aston Villa,Sheffield United,3.35,2.32,3.25,3.10,2.25,3.30,...,,,,,,,,,,
1,b1e029a0d989b4c11e843204003044f9,2020-06-17T19:15:00Z,Manchester City,Arsenal,1.35,8.50,5.60,1.36,7.50,5.25,...,,,,,,,,,,
2,59d68295dc2213634772cd941c91fa11,2020-06-19T19:15:00Z,Tottenham Hotspur,Manchester United,2.75,2.60,3.30,2.70,2.50,3.40,...,,,,,,,,,,
3,88352746f45f6beb4e2cb662d9414d0f,2020-06-20T11:30:00Z,Watford,Leicester City,3.40,2.15,3.45,3.25,2.20,3.40,...,,,,,,,,,,
4,065ae59da20562892de52b7f5598ecbf,2020-06-20T16:30:00Z,West Ham United,Wolverhampton Wanderers,3.50,2.15,3.35,3.30,2.20,3.30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,4cdaad5e33d4d13e8992d35e4e0bf7dc,2025-01-15T19:30:00Z,Leicester City,Crystal Palace,,,,3.20,2.05,3.60,...,,,,,,,,3.05,2.02,3.40
2016,c88ab8eae714e18c07dff785f0dce8f7,2025-01-15T19:30:00Z,Newcastle United,Wolverhampton Wanderers,1.49,6.25,4.60,1.53,5.25,4.33,...,,,,,1.48,6.25,4.6,1.50,5.00,4.00
2017,5688a6d82f46a249c51caa737b98227e,2025-01-15T20:00:00Z,Arsenal,Tottenham Hotspur,1.43,6.75,4.90,1.44,6.25,4.60,...,,,,,1.43,6.75,4.9,1.39,5.80,4.40
2018,1ccb012b593a7b3ef8390cc30674add9,2025-01-16T19:30:00Z,Ipswich Town,Brighton and Hove Albion,4.10,1.83,3.80,4.00,1.80,3.75,...,,,,,4.10,1.83,3.8,3.90,1.74,3.55


In [38]:
def load_and_concatenate_premier_league_data(directory: str):
    """
    Loads multiple CSV files from a directory, extracts specific columns, and concatenates them into a single DataFrame.

    A file is picked up when its name ends with ``.csv``, its first two
    characters are digits and it contains a ``-``. Files are read in sorted
    filename order.

    Args:
        directory (str): The directory containing the CSV files.

    Returns:
        pd.DataFrame: A DataFrame containing concatenated data with only the
            required columns (``Date``, ``Time``, ``HomeTeam``, ``AwayTeam``,
            ``FTR``) and a fresh 0..n-1 index. When no file matches the naming
            pattern, an empty DataFrame with those columns is returned.

    Raises:
        FileNotFoundError: If ``directory`` does not exist (raised by ``os.listdir``).
        ValueError: If a matching file lacks one of the required columns
            (raised by ``pd.read_csv`` through ``usecols``).
    """
    # Define the columns to extract
    required_columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR']

    dataframes = []
    for filename in sorted(os.listdir(directory)):
        # Check if the file matches the naming pattern
        if filename.endswith('.csv') and filename[:2].isdigit() and '-' in filename:
            filepath = os.path.join(directory, filename)
            dataframes.append(pd.read_csv(filepath, usecols=required_columns))

    # pd.concat raises "No objects to concatenate" on an empty list; hand back
    # an empty frame with the expected schema instead.
    if not dataframes:
        return pd.DataFrame(columns=required_columns)

    return pd.concat(dataframes, ignore_index=True)

# Folder that holds the per-season results CSV files
directory_csv = "../data/CSV's/"

# Read every matching season file and keep only the
# Date / Time / HomeTeam / AwayTeam / FTR columns
results_data = load_and_concatenate_premier_league_data(directory_csv)

results_data

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR
0,09/08/2019,20:00,Liverpool,Norwich,H
1,10/08/2019,12:30,West Ham,Man City,A
2,10/08/2019,15:00,Bournemouth,Sheffield United,D
3,10/08/2019,15:00,Burnley,Southampton,H
4,10/08/2019,15:00,Crystal Palace,Everton,D
...,...,...,...,...,...
2044,08/12/2024,14:00,Fulham,Arsenal,D
2045,08/12/2024,14:00,Ipswich,Bournemouth,A
2046,08/12/2024,14:00,Leicester,Brighton,D
2047,08/12/2024,16:30,Tottenham,Chelsea,A


In [35]:
# Team names as they appear in the odds data (`home_team` / `away_team`),
# mapped to three-letter club codes used to build fixture ids.
# The codes are the same set as in `premier_league_teams_csv`, so ids built
# from either source line up for the same fixture.
# NOTE(review): "Luton" is the only promoted-club key without a suffix such as
# "Town" — confirm it matches the spelling used in the odds files.
premier_league_teams_api = {
    "Aston Villa": "AVL",
    "Manchester United": "MUN",
    "Tottenham Hotspur": "TOT",
    "Arsenal": "ARS",
    "Everton": "EVE",
    "Chelsea": "CHE",
    "West Ham United": "WHU",
    "Newcastle United": "NEW",
    "Brighton and Hove Albion": "BHA",
    "Wolverhampton Wanderers": "WOL",
    "Manchester City": "MCI",
    "Crystal Palace": "CRY",
    "Liverpool": "LIV",
    "Southampton": "SOU",
    "Leicester City": "LEI",
    "Fulham": "FUL",
    "Burnley": "BUR",
    "Brentford": "BRE",
    "Leeds United": "LEE",
    "Bournemouth": "BOU",
    "Sheffield United": "SHU",
    "Nottingham Forest": "NFO",
    "Watford": "WAT",
    "Norwich City": "NOR",
    "West Bromwich Albion": "WBA",
    "Luton": "LUT",
    "Ipswich Town": "IPS"
}

# Short team names as they appear in the results CSVs (`HomeTeam` / `AwayTeam`),
# mapped to three-letter club codes used to build fixture ids.
# The codes are the same set as in `premier_league_teams_api`, so ids built
# from either source line up for the same fixture.
premier_league_teams_csv = {
    "Aston Villa": "AVL",
    "Crystal Palace": "CRY",
    "Tottenham": "TOT",
    "West Ham": "WHU",
    "Man United": "MUN",
    "Liverpool": "LIV",
    "Wolves": "WOL",
    "Chelsea": "CHE",
    "Man City": "MCI",
    "Brighton": "BHA",
    "Everton": "EVE",
    "Arsenal": "ARS",
    "Newcastle": "NEW",
    "Leicester": "LEI",
    "Southampton": "SOU",
    "Burnley": "BUR",
    "Fulham": "FUL",
    "Brentford": "BRE",
    "Bournemouth": "BOU",
    "Sheffield United": "SHU",
    "Leeds": "LEE",
    "Nott'm Forest": "NFO",
    "Norwich": "NOR",
    "Watford": "WAT",
    "West Brom": "WBA",
    "Luton": "LUT",
    "Ipswich": "IPS"
}

In [None]:
def generate_fixture_id_api(row, team_dict):
    """
    Build a fixture id of the form ``<home code><away code><DDMMYY>`` for an odds row.

    Args:
        row: Mapping-like record with ``home_team``, ``away_team`` and
            ``commence_time`` entries.
        team_dict (dict): Team name -> club code lookup; a name that is not
            present is replaced by ``"UNK"``.

    Returns:
        str: The two codes followed by the date of the parsed
            ``commence_time`` (no timezone conversion is applied).
    """
    codes = [team_dict.get(row[column], "UNK") for column in ('home_team', 'away_team')]
    kickoff = pd.to_datetime(row['commence_time'])
    return "{}{}{}".format(codes[0], codes[1], kickoff.strftime('%d%m%y'))


# Add a `fixture_id` column to the odds frame (row-wise, using the API team-name lookup).
combined_dataframe['fixture_id'] = combined_dataframe.apply(generate_fixture_id_api, axis=1, team_dict=premier_league_teams_api)

Unnamed: 0,match_id,commence_time,home_team,away_team,Unibet_home_odds,Unibet_away_odds,Unibet_draw_odds,Sky Bet_home_odds,Sky Bet_away_odds,Sky Bet_draw_odds,...,Betfair Sportsbook_home_odds,Betfair Sportsbook_away_odds,Betfair Sportsbook_draw_odds,Grosvenor_home_odds,Grosvenor_away_odds,Grosvenor_draw_odds,Smarkets_home_odds,Smarkets_away_odds,Smarkets_draw_odds,fixture_id
0,2dd4a4f8663e6f835226a5209c614a60,2020-06-17T17:00:00Z,Aston Villa,Sheffield United,3.35,2.32,3.25,3.10,2.25,3.30,...,,,,,,,,,,AVLSHU170620
1,b1e029a0d989b4c11e843204003044f9,2020-06-17T19:15:00Z,Manchester City,Arsenal,1.35,8.50,5.60,1.36,7.50,5.25,...,,,,,,,,,,MCIARS170620
2,59d68295dc2213634772cd941c91fa11,2020-06-19T19:15:00Z,Tottenham Hotspur,Manchester United,2.75,2.60,3.30,2.70,2.50,3.40,...,,,,,,,,,,TOTMUN190620
3,88352746f45f6beb4e2cb662d9414d0f,2020-06-20T11:30:00Z,Watford,Leicester City,3.40,2.15,3.45,3.25,2.20,3.40,...,,,,,,,,,,WATLEI200620
4,065ae59da20562892de52b7f5598ecbf,2020-06-20T16:30:00Z,West Ham United,Wolverhampton Wanderers,3.50,2.15,3.35,3.30,2.20,3.30,...,,,,,,,,,,WHUWOL200620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,4cdaad5e33d4d13e8992d35e4e0bf7dc,2025-01-15T19:30:00Z,Leicester City,Crystal Palace,,,,3.20,2.05,3.60,...,,,,,,,3.05,2.02,3.40,LEICRY150125
2016,c88ab8eae714e18c07dff785f0dce8f7,2025-01-15T19:30:00Z,Newcastle United,Wolverhampton Wanderers,1.49,6.25,4.60,1.53,5.25,4.33,...,,,,1.48,6.25,4.6,1.50,5.00,4.00,NEWWOL150125
2017,5688a6d82f46a249c51caa737b98227e,2025-01-15T20:00:00Z,Arsenal,Tottenham Hotspur,1.43,6.75,4.90,1.44,6.25,4.60,...,,,,1.43,6.75,4.9,1.39,5.80,4.40,ARSTOT150125
2018,1ccb012b593a7b3ef8390cc30674add9,2025-01-16T19:30:00Z,Ipswich Town,Brighton and Hove Albion,4.10,1.83,3.80,4.00,1.80,3.75,...,,,,4.10,1.83,3.8,3.90,1.74,3.55,IPSBHA160125


In [41]:
def generate_fixture_id_csv(row, team_dict):
    """
    Build a fixture id of the form ``<home code><away code><DDMMYY>`` for a results row.

    Args:
        row: Mapping-like record with ``HomeTeam``, ``AwayTeam`` and ``Date``
            entries.
        team_dict (dict): Team name -> club code lookup; a name that is not
            present is replaced by ``"UNK"``.

    Returns:
        str: The two codes followed by the match date, where ``Date`` is
            parsed day-first.
    """
    codes = [team_dict.get(row[column], "UNK") for column in ('HomeTeam', 'AwayTeam')]
    played_on = pd.to_datetime(row['Date'], dayfirst=True)
    return "{}{}{}".format(codes[0], codes[1], played_on.strftime('%d%m%y'))

# Add a `fixture_id` column to the results frame (row-wise, using the CSV team-name lookup).
results_data['fixture_id'] = results_data.apply(generate_fixture_id_csv, axis=1, team_dict=premier_league_teams_csv)

results_data



Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR,fixture_id
0,09/08/2019,20:00,Liverpool,Norwich,H,LIVNOR090819
1,10/08/2019,12:30,West Ham,Man City,A,WHUMCI100819
2,10/08/2019,15:00,Bournemouth,Sheffield United,D,BOUSHU100819
3,10/08/2019,15:00,Burnley,Southampton,H,BURSOU100819
4,10/08/2019,15:00,Crystal Palace,Everton,D,CRYEVE100819
...,...,...,...,...,...,...
2044,08/12/2024,14:00,Fulham,Arsenal,D,FULARS081224
2045,08/12/2024,14:00,Ipswich,Bournemouth,A,IPSBOU081224
2046,08/12/2024,14:00,Leicester,Brighton,D,LEIBHA081224
2047,08/12/2024,16:30,Tottenham,Chelsea,A,TOTCHE081224
