# Data Collection:

**Author:** Matthew Thoomkuzhy

**Date last edited:** 13/12/2024

**Objective:** Collect historical match odds data for all Premier League matches over the past 4 years

---

In [7]:
import json
import os
from datetime import datetime, timedelta
from urllib.parse import urlencode

import requests
from dotenv import load_dotenv

# 1.1 Getting the API Key

In [8]:
if __name__ == "__main__":
    # Pull settings from the local .env file before reading the key.
    load_dotenv()
    API_KEY = os.getenv("API_KEY")  # Load the API key from the .env file

    if not API_KEY:
        print("API key not found. Please set it in the .env file.")
        # `exit()` is the interactive helper installed by `site`; inside an
        # IPython/Jupyter kernel it requests a kernel shutdown instead of just
        # stopping this cell. Raising SystemExit halts execution with the same
        # exit status and leaves the kernel (and its state) alive.
        raise SystemExit(1)

---

# 1.2 Odds-API In-built function

In [9]:
class HistoricalOddsAPIClient:
    """Small client for the historical odds endpoint of The Odds API (v4).

    Nothing is sent over the network until ``fetch_historical_odds`` is called.
    """

    def __init__(self, api_key, base_url="https://api.the-odds-api.com/v4/historical", timeout=30):
        """Store the credentials and connection settings.

        Args:
            api_key: Value sent as the ``apiKey`` query parameter.
            base_url: Root URL of the historical API.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block a long collection loop forever.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

    def construct_historical_url(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Build the request URL for one historical snapshot.

        Args:
            sport: Sport key placed in the URL path (e.g. ``"soccer_epl"``).
            date: Snapshot timestamp sent as the ``date`` query parameter.
            regions: Value of the ``regions`` query parameter.
            markets: Value of the ``markets`` query parameter.
            odds_format: Value of the ``oddsFormat`` query parameter.

        Returns:
            The full URL as a string, with query values percent-encoded.
        """
        # ":" and "," are kept literal so ISO timestamps and comma-separated
        # lists look exactly as before; anything else unsafe (spaces, "&", "=")
        # is encoded instead of silently corrupting the query string.
        query = urlencode(
            {
                "apiKey": self.api_key,
                "regions": regions,
                "markets": markets,
                "oddsFormat": odds_format,
                "date": date,
            },
            safe=":,",
        )
        return f"{self.base_url}/sports/{sport}/odds/?{query}"

    def fetch_historical_odds(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Request one historical snapshot and return the decoded JSON body.

        Args:
            sport, date, regions, markets, odds_format: Forwarded unchanged to
                ``construct_historical_url``.

        Returns:
            The parsed JSON response, or ``None`` if the request failed, the
            server returned an error status, or the body was not valid JSON.
            Failures are printed rather than raised.
        """
        url = self.construct_historical_url(sport, date, regions, markets, odds_format)
        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Error fetching data: {self._redact(e)}")
            return None

    def _redact(self, text):
        """Return ``str(text)`` with the API key masked.

        Exception messages from ``requests`` embed the full URL, which would
        otherwise write the key into the notebook's saved output.
        """
        text = str(text)
        if self.api_key:
            text = text.replace(str(self.api_key), "***")
        return text

##### The key problem is that the in-built function only returns data for a single point in time, not all data within a date range
---

# 1.3 Functions needed for NB01

I have written 2 more functions:

1. `save_to_json`: creates the raw data folder if needed and dumps the data from a request into a JSON file there
2. `fetch_odds`: takes a start date, end date and daily timestamp. It collects data for each day at that timestamp, then concatenates the results


In [10]:


def save_to_json(data, file_name, folder="../data/raw/"):
    """Write ``data`` as indented JSON to ``<folder>/<file_name>.json``.

    Args:
        data: JSON-serialisable object to store.
        file_name: Base name of the output file, without the ``.json`` suffix.
        folder: Destination directory; created (with parents) if missing.

    Returns:
        None. The outcome is reported with ``print``; an ``IOError`` raised
        while writing is caught and printed instead of propagating.
    """
    # Make sure the destination exists before building the target path.
    os.makedirs(folder, exist_ok=True)
    destination = os.path.join(folder, f"{file_name}.json")

    try:
        with open(destination, "w") as out_handle:
            json.dump(data, out_handle, indent=4)
        print(f"Data saved to {destination}.")
    except IOError as write_error:
        print(f"Error saving data: {write_error}")

def fetch_odds(client, sport, start_date, end_date, regions, markets, odds_format, daily_time="12:00:00Z"):
    """Collect one historical snapshot per day and concatenate the results.

    Args:
        client: Object exposing ``fetch_historical_odds(sport, date, regions,
            markets, odds_format)``.
        sport: Sport key forwarded to the client.
        start_date: First day to request, as an ISO-format string.
        end_date: Last day to request (inclusive), as an ISO-format string.
        regions, markets, odds_format: Forwarded unchanged to the client.
        daily_time: Time-of-day suffix appended verbatim after ``T`` to each
            day's date.

    Returns:
        A flat list built by extending with every response's ``"data"`` value,
        in chronological request order. Days whose response is falsy or has no
        ``"data"`` key are reported with ``print`` and skipped. The list is
        empty when ``start_date`` is after ``end_date``.
    """
    current_day = datetime.fromisoformat(start_date)
    last_day = datetime.fromisoformat(end_date)
    all_data = []

    while current_day <= last_day:
        # Only the date part goes through strftime-style formatting; the time
        # suffix is appended as-is so a "%" in `daily_time` is never read as a
        # format directive.
        date_with_time = f"{current_day:%Y-%m-%d}T{daily_time}"
        print(f"Fetching historical odds for {sport} on {date_with_time}...")
        response = client.fetch_historical_odds(sport, date_with_time, regions, markets, odds_format)

        if response and "data" in response:
            all_data.extend(response["data"])  # Extract only the match data
        else:
            print(f"No data fetched for {date_with_time}.")

        current_day += timedelta(days=1)

    return all_data

# 1.4 Code run to collect data

In [13]:
if __name__ == "__main__":
    # Pull settings from the local .env file before reading the key.
    load_dotenv()
    API_KEY = os.getenv("API_KEY")  # Load the API key from the .env file

    if not API_KEY:
        print("API key not found. Please set it in the .env file.")
        # `exit()` is the interactive helper installed by `site`; inside an
        # IPython/Jupyter kernel it requests a kernel shutdown instead of just
        # stopping this cell. Raising SystemExit halts execution with the same
        # exit status and leaves the kernel (and its state) alive.
        raise SystemExit(1)

    # Parameters, change according to what you want to find
    sport = "soccer_epl"
    start_date = "2020-06-10"
    end_date = "2024-12-30"
    regions = "uk"
    markets = "h2h"
    odds_format = "decimal"

    client = HistoricalOddsAPIClient(API_KEY)

    # Fetch data for the date range (one request per day, so this cell is slow
    # for multi-year ranges).
    all_data = fetch_odds(client, sport, start_date, end_date, regions, markets, odds_format)

    # Only write a file when at least one record came back.
    if all_data:
        file_name = f"{sport}_odds_{start_date}_to_{end_date}"
        save_to_json(all_data, file_name)
    else:
        print("No data fetched for the specified date range.")


Fetching historical odds for soccer_epl on 2020-06-10T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-11T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-12T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-13T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-14T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-15T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-16T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-17T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-18T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-19T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-20T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-21T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-22T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-23T12:00:00Z...
Fetching historical odds for soccer_epl on 2020-06-24T12:00:00