# Data Processing:

Authors: Matthew Thoomkuzhy, Xinyan Liao and Noah Salehi

Date Last edited: 13/12/2024

Objective: Using the data collected and the data from ['Premier League Matches'](https://www.football-data.co.uk/englandm.php) to create a database for storing betting data and outcome data in a database

---

### 1. Cleaning the raw historical odds data

In [3]:
import json
import pandas as pd

def process_historical_odds(filepath):
    """
    Process a JSON file containing historical odds data to extract the highest odds for each event.

    Parameters:
        filepath (str): Path to the JSON file containing the historical odds data.

    Returns:
        pd.DataFrame: A DataFrame containing the highest odds for each event and their corresponding bookmakers.
    """
    # Load the historical odds data
    with open(filepath, 'r') as file:
        data = json.load(file)

    # Initialize a list to store the processed rows
    rows = []

    # Process each event in the dataset
    for event in data:
        event_id = event.get('id')
        home_team = event.get('home_team')
        away_team = event.get('away_team')
        commence_time = event.get('commence_time')

        # Track the highest odds and corresponding bookmakers for each outcome
        highest_home = {"bookie": None, "odds": 0}
        highest_away = {"bookie": None, "odds": 0}
        highest_draw = {"bookie": None, "odds": 0}

        for bookmaker in event.get('bookmakers', []):
            bookie_name = bookmaker.get('title')
            for market in bookmaker.get('markets', []):
                if market.get('key') == 'h2h':  # Only process head-to-head markets
                    outcomes = market.get('outcomes', [])
                    if len(outcomes) == 3:  # Ensure there are three outcomes (Win/Draw/Loss)
                        home_odds = outcomes[1].get('price', 0)
                        away_odds = outcomes[0].get('price', 0)
                        draw_odds = outcomes[2].get('price', 0)

                        # Update the highest odds and bookmakers
                        if home_odds > highest_home["odds"]:
                            highest_home = {"bookie": bookie_name, "odds": home_odds}
                        if away_odds > highest_away["odds"]:
                            highest_away = {"bookie": bookie_name, "odds": away_odds}
                        if draw_odds > highest_draw["odds"]:
                            highest_draw = {"bookie": bookie_name, "odds": draw_odds}

        # Add the event details with highest odds to the rows
        rows.append({
            "EventID": event_id,
            "Home Team": home_team,
            "Away Team": away_team,
            "Bookie Home": highest_home["bookie"],
            "Odds Home": highest_home["odds"],
            "Bookie Away": highest_away["bookie"],
            "Odds Away": highest_away["odds"],
            "Bookie Draw": highest_draw["bookie"],
            "Odds Draw": highest_draw["odds"],
            "Commence Time": commence_time
        })

    # Create a DataFrame from the rows
    return pd.DataFrame(rows)

# Example usage
file_path = '../data/raw/historical_odds_data.json'
historical_odds_df = process_historical_odds(file_path)
historical_odds_df


Unnamed: 0,EventID,Home Team,Away Team,Bookie Home,Odds Home,Bookie Away,Odds Away,Bookie Draw,Odds Draw,Commence Time
0,900cea9d210f2cd5a07a2fa20d331ec0,Manchester United,Everton,Unibet,1.54,Betfair,7.80,Marathon Bet,4.55,2021-10-02T11:30:00Z
1,a8261c8c9ddd409a0709fe341e47578c,Burnley,Norwich City,Marathon Bet,4.50,Matchbook,1.95,Matchbook,3.75,2021-10-02T14:00:00Z
2,eaffae3bf4c300d0c24cab4a3b797624,Chelsea,Southampton,Matchbook,10.00,Matchbook,1.41,Matchbook,5.20,2021-10-02T14:00:00Z
3,c20a07f1415454e7870171c1ebbcd853,Leeds United,Watford,Betfair,4.90,Matchbook,1.83,Betfair,4.00,2021-10-02T14:00:00Z
4,0b146cd6a7d664ec6a930b68716839ba,Brighton and Hove Albion,Arsenal,Matchbook,3.15,Matchbook,2.62,Marathon Bet,3.32,2021-10-02T16:30:00Z
...,...,...,...,...,...,...,...,...,...,...
190,24d6800c10627ea9ba06b9eaf6ea43a8,Crystal Palace,Newcastle United,Ladbrokes,4.00,888sport,2.05,Marathon Bet,3.74,2021-10-23T14:00:00Z
191,10db580c738d6f77345b2b06b1e6c88d,Brighton and Hove Albion,Manchester City,Marathon Bet,1.48,William Hill,7.50,Marathon Bet,4.70,2021-10-23T16:30:00Z
192,29f7901a3d56ef634fa069e77de3f928,Brentford,Leicester City,Bet Victor,2.55,Marathon Bet,2.89,Marathon Bet,3.50,2021-10-24T13:00:00Z
193,5c2f70543e3856a9b5ccbbd7c19a66da,West Ham United,Tottenham Hotspur,Sky Bet,2.45,Marathon Bet,2.93,Marathon Bet,3.68,2021-10-24T13:00:00Z


### 2. Obtaining the outcome data in a dataframe

In [8]:
import pandas as pd

# Paths to the uploaded CSV files
file_paths = [
    "../data/CSV's/19-20.csv",
    "../data/CSV's/20-21.csv",
    "../data/CSV's/21-22.csv",
    "../data/CSV's/22-23.csv",
    "../data/CSV's/23-24.csv",
    "../data/CSV's/24-25.csv"
]

# Read each CSV into a DataFrame
dataframes = [pd.read_csv(file_path) for file_path in file_paths]

# Concatenate all DataFrames chronologically
combined_df = pd.concat(dataframes, ignore_index=True)

# Keep only the specified columns
selected_columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR']
temp_df = combined_df[selected_columns]

# Convert the 'Date' column to datetime for filtering
temp_df.loc[:, 'Date'] = pd.to_datetime(temp_df['Date'], format='%d/%m/%Y', errors='coerce')

# Filter rows where 'Date' is on or after 01/07/2020
outcomes_df = temp_df[temp_df['Date'] >= pd.Timestamp('2020-07-01')]

# Display the filtered DataFrame
outcomes_df


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR
314,2020-07-01 00:00:00,18:00,Arsenal,Norwich,H
315,2020-07-01 00:00:00,18:00,Bournemouth,Newcastle,A
316,2020-07-01 00:00:00,18:00,Everton,Leicester,H
317,2020-07-01 00:00:00,20:15,West Ham,Chelsea,H
318,2020-07-02 00:00:00,18:00,Sheffield United,Tottenham,H
...,...,...,...,...,...
2044,2024-12-08 00:00:00,14:00,Fulham,Arsenal,D
2045,2024-12-08 00:00:00,14:00,Ipswich,Bournemouth,A
2046,2024-12-08 00:00:00,14:00,Leicester,Brighton,D
2047,2024-12-08 00:00:00,16:30,Tottenham,Chelsea,A
