---
# Data Collection:

**Author:** Matthew Thoomkuzhy

**Date last edited:** 4/2/2025

**Objective:** Collecting historical match data regarding all English Premier League matches over the past 4 years from the Odds API.

---

In [7]:
import requests
import json
import os
from datetime import datetime , timedelta
from dotenv import load_dotenv

# 1.1 Getting API  Key

In [8]:
# Load variables from a local .env file (if one exists) into the process
# environment. Without this call the key is only found when it has already
# been exported in the shell that started the notebook.
load_dotenv()

# os.getenv returns None when API_KEY is not set.
API_key = os.getenv('API_KEY')
# Upper-case alias so that both spellings of the name resolve to the same key.
API_KEY = API_key

---

# 1.2 Odds-API In-built function

In [9]:
class HistoricalOddsAPIClient:
    """Small client for the historical odds endpoint of the Odds API (v4).

    Each request returns the data for a single ``date`` value; the client does
    not iterate over date ranges itself.

    :param api_key: API key sent as the ``apiKey`` query parameter.
    :param base_url: Root URL of the historical API.
    :param timeout: Seconds to wait for the server before giving up on a
        request. A timed-out request is reported like any other request error.
    """

    def __init__(self, api_key, base_url="https://api.the-odds-api.com/v4/historical", timeout=30):
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

    def construct_historical_url(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Return the full request URL (including the API key) for one query.

        :param sport: Sport key placed in the URL path (e.g. ``"soccer_epl"``).
        :param date: Timestamp string sent as the ``date`` query parameter.
        :param regions: Value of the ``regions`` query parameter.
        :param markets: Value of the ``markets`` query parameter.
        :param odds_format: Value of the ``oddsFormat`` query parameter.
        """
        return f"{self.base_url}/sports/{sport}/odds/?apiKey={self.api_key}&regions={regions}&markets={markets}&oddsFormat={odds_format}&date={date}"

    def fetch_historical_odds(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Fetch the historical odds for one ``date`` and return the decoded JSON.

        Parameters are the same as for :meth:`construct_historical_url`.

        :return: The decoded JSON response, or ``None`` when the request fails
            (the error is printed, not raised).
        """
        url = self.construct_historical_url(sport, date, regions, markets, odds_format)
        try:
            # A timeout prevents one stalled connection from blocking a long
            # run of consecutive requests indefinitely.
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # The exception text usually contains the request URL, which
            # carries the API key, so the key is masked before printing.
            print(f"Error fetching data: {self._redact(e)}")
            return None

    def _redact(self, error):
        """Return ``str(error)`` with the API key replaced by a placeholder."""
        message = str(error)
        if self.api_key:
            message = message.replace(str(self.api_key), "<redacted>")
        return message

##### The key problem is that the in-built function only returns data for one point in time, not all data within a date range
---

# 1.3 Functions needed for NB01

I have written 2 more functions:

1. save_to_json: saves the data returned by a request to a JSON file in the raw data folder

2. fetch_odds: takes a start date, an end date and a daily timestamp. It collects data once per day at that timestamp, then concatenates the results


In [10]:
def save_to_json(data, file_name, folder="../data/raw/"):
    """Write ``data`` as indented JSON to ``<folder>/<file_name>.json``.

    The folder is created when it does not exist yet. A failure while opening
    or writing the file is printed instead of being raised.

    :param data: JSON-serialisable object to store.
    :param file_name: Name of the output file, without the ``.json`` extension.
    :param folder: Directory the file is written to.
    """
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, file_name + ".json")
    try:
        with open(target, "w") as handle:
            json.dump(data, handle, indent=4)
        print(f"Data saved to {target}.")
    except IOError as err:
        print(f"Error saving data: {err}")

def fetch_odds(client, sport, start_date, end_date, regions, markets, odds_format, daily_time="12:00:00Z"):
    """Collect historical odds once per day over an inclusive date range.

    For every day from ``start_date`` to ``end_date`` the client is queried at
    ``daily_time`` and the entries under the response's "data" key are
    appended to one combined list. Days without a usable response are reported
    with a message and skipped.

    :param client: Object providing ``fetch_historical_odds(sport, date, regions, markets, odds_format)``.
    :param sport: Sport key passed through to the client.
    :param start_date: First day of the range as an ISO-format string.
    :param end_date: Last day of the range (inclusive) as an ISO-format string.
    :param regions: Regions value passed through to the client.
    :param markets: Markets value passed through to the client.
    :param odds_format: Odds format passed through to the client.
    :param daily_time: Time-of-day suffix appended after the "T" of each date.
    :return: List with the "data" entries of all successful responses, in date order.
    """
    day = datetime.fromisoformat(start_date)
    last_day = datetime.fromisoformat(end_date)
    one_day = timedelta(days=1)
    stamp_format = f"%Y-%m-%dT{daily_time}"
    collected = []

    while day <= last_day:
        stamp = day.strftime(stamp_format)
        print(f"Fetching historical odds for {sport} on {stamp}...")
        payload = client.fetch_historical_odds(sport, stamp, regions, markets, odds_format)

        # Advance first so the skip branch below can simply continue.
        day = day + one_day

        if not payload or "data" not in payload:
            print(f"No data fetched for {stamp}.")
            continue

        # Keep only the match data of the response.
        collected.extend(payload["data"])

    return collected

---
# 1.4 Data Collection

In [None]:

    # Parameters, change according to what you want to find
    sport = "soccer_epl"
    start_date = "2020-06-10"
    end_date = "2024-12-30"
    regions = "uk"
    markets = "h2h"
    odds_format = "decimal"

    # API_key is the name the key was bound to when it was read from the
    # environment; the previous spelling API_KEY was never defined.
    client = HistoricalOddsAPIClient(API_key)

    # Fetch data for the date range (one request per day)
    all_data = fetch_odds(client, sport, start_date, end_date, regions, markets, odds_format)

    # Only write a file when at least one request returned data
    if all_data:
        file_name = f"{sport}_odds_{start_date}_to_{end_date}"
        save_to_json(all_data, file_name)
    else:
        print("No data fetched for the specified date range.")


---
# 1.5 Code For Splitting Large File

- The data collected was far too large to be pushed to github (300MB)

- So I have written code to split the large JSON file into several smaller JSON files categorised by year and month, e.g. 2020-06

- It saves the categorised events in a folder within raw called 'grouped_events'

In [None]:

def split_json_by_month(file_path, output_dir="../data/raw/grouped_events"):
    """
    Splits a large JSON file into smaller files, grouping by the commence time's month.

    Every event is assigned to the year and month of its "commence_time" value
    and each group is written to ``events_<YYYY-MM>.json`` inside ``output_dir``.
    Events without a "commence_time" are left out. Any error raised while
    reading, grouping or writing is printed instead of being propagated.

    :param file_path: Path to the large JSON file (expected to contain a list of event dicts).
    :param output_dir: Directory that receives the per-month files; created if missing.
    """
    try:
        # Load the large JSON file
        with open(file_path, 'r') as file:
            data = json.load(file)

        # Maps "YYYY-MM" -> list of events commencing in that month
        grouped_data = {}

        for event in data:
            commence_time = event.get("commence_time")
            if not commence_time:
                continue

            # The trailing "Z" is dropped before parsing; only the year and
            # month of the timestamp are kept (e.g. "2020-06").
            month_key = datetime.fromisoformat(commence_time.replace("Z", "")).strftime("%Y-%m")
            grouped_data.setdefault(month_key, []).append(event)

        # Save grouped events into separate files
        os.makedirs(output_dir, exist_ok=True)

        for month, events in grouped_data.items():
            output_file = os.path.join(output_dir, f"events_{month}.json")
            with open(output_file, 'w') as out_file:
                json.dump(events, out_file, indent=4)

        print(f"Files saved in directory: {output_dir}")

    except Exception as e:
        # Best-effort notebook helper: report the problem rather than abort the cell.
        print(f"An error occurred: {e}")

# Example usage: split the combined odds file produced by the data-collection
# step into one file per month.
file_path = (
    "../data/raw/"
    "soccer_epl_odds_2020-06-10_to_2024-12-30.json"
)
split_json_by_month(file_path)
