# Data Collection:

Authors: Matthew Thoomkuzhy, Xinyan Liao and Noah Salehi

Date Last edited: 13/12/2024

Objective: collect historical match data for all Premier League matches over the past 4 years.

---

### **Importing necessary libraries**

In [1]:
import requests
import json
import os
from datetime import datetime
from dotenv import load_dotenv

# Collecting historical odds data from The Odds API

The first timestamp is in mid-June.

In [None]:
import requests
import json
import os
from datetime import datetime, timedelta
from dotenv import load_dotenv

class HistoricalOddsAPIClient:
    """Small client for the historical odds endpoint at ``base_url``.

    Attributes:
        api_key: Key sent as the ``apiKey`` query parameter on every request.
        base_url: Root of the historical API, without a trailing slash.
        timeout: Seconds to wait for the server before a request is abandoned.
    """

    def __init__(self, api_key, base_url="https://api.the-odds-api.com/v4/historical", timeout=30):
        """Store the credentials and connection settings.

        Args:
            api_key: API key used to authenticate requests.
            base_url: Root URL of the historical endpoint.
            timeout: Per-request timeout in seconds, passed to ``requests.get``.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

    def construct_historical_url(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Build the request URL for one historical odds snapshot.

        Args:
            sport: Sport key placed in the URL path (e.g. ``"soccer_epl"``).
            date: Snapshot timestamp sent as the ``date`` query parameter.
            regions: Value of the ``regions`` query parameter.
            markets: Value of the ``markets`` query parameter.
            odds_format: Value of the ``oddsFormat`` query parameter.

        Returns:
            The full URL as a string, with query values percent-encoded.
        """
        # Imported locally so this class does not depend on an extra
        # top-of-file import.
        from urllib.parse import urlencode

        # Percent-encode the values so characters such as '&', '=' or '#'
        # cannot corrupt the query string. ':' and ',' are kept literal so
        # ordinary timestamps and comma-separated lists produce the same URL
        # as plain string interpolation would.
        query = urlencode(
            {
                "apiKey": self.api_key,
                "regions": regions,
                "markets": markets,
                "oddsFormat": odds_format,
                "date": date,
            },
            safe=":,",
        )
        return f"{self.base_url}/sports/{sport}/odds/?{query}"

    def fetch_historical_odds(self, sport, date, regions="us", markets="h2h", odds_format="american"):
        """Request one historical odds snapshot and return the decoded JSON.

        Args:
            sport: Sport key placed in the URL path.
            date: Snapshot timestamp sent as the ``date`` query parameter.
            regions: Value of the ``regions`` query parameter.
            markets: Value of the ``markets`` query parameter.
            odds_format: Value of the ``oddsFormat`` query parameter.

        Returns:
            The decoded JSON body, or ``None`` if the request failed, timed
            out, returned an error status, or the body was not valid JSON.
        """
        url = self.construct_historical_url(sport, date, regions, markets, odds_format)
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException subclass.
            print(f"Error fetching data: {e}")
            return None

def save_to_json(data, file_name, folder="../data/raw/"):
    """Write ``data`` to ``<folder>/<file_name>.json`` with 4-space indentation.

    The target folder is created if it does not already exist. The outcome is
    reported by printing a message; an ``IOError`` raised while writing is
    caught and printed rather than propagated.

    Args:
        data: JSON-serialisable object to write.
        file_name: Name of the output file, without the ``.json`` extension.
        folder: Directory that receives the file.
    """
    os.makedirs(folder, exist_ok=True)
    destination = os.path.join(folder, "{}.json".format(file_name))
    try:
        with open(destination, "w") as handle:
            json.dump(data, handle, indent=4)
        print(f"Data saved to {destination}.")
    except IOError as err:
        print(f"Error saving data: {err}")

def fetch_daily_odds(client, sport, start_date, end_date, regions, markets, odds_format, daily_time="12:00:00Z"):
    """Fetch one odds snapshot per day and concatenate the results.

    For each day from ``start_date`` to ``end_date`` (inclusive) a timestamp
    of the form ``YYYY-MM-DDT<daily_time>`` is built and passed to
    ``client.fetch_historical_odds``. The ``"data"`` entry of every response
    that has one is appended to the result. Days with no usable response are
    reported by a printed message and skipped.

    Args:
        client: Object exposing ``fetch_historical_odds(sport, date, regions,
            markets, odds_format)``.
        sport: Sport key forwarded to the client.
        start_date: First day to query, as an ISO-format date string.
        end_date: Last day to query, as an ISO-format date string.
        regions: Regions value forwarded to the client.
        markets: Markets value forwarded to the client.
        odds_format: Odds format value forwarded to the client.
        daily_time: Time-of-day suffix appended to each day's date.

    Returns:
        A list with the items of every fetched ``"data"`` entry, in day order.
    """
    first_day = datetime.fromisoformat(start_date)
    last_day = datetime.fromisoformat(end_date)
    stamp_format = "%Y-%m-%dT" + daily_time
    collected = []

    # Number of whole-day steps that still land on or before last_day;
    # a reversed range gives a non-positive count and the loop is skipped.
    day_count = (last_day - first_day).days + 1

    for offset in range(day_count):
        snapshot_time = (first_day + timedelta(days=offset)).strftime(stamp_format)
        print(f"Fetching historical odds for {sport} on {snapshot_time}...")
        payload = client.fetch_historical_odds(sport, snapshot_time, regions, markets, odds_format)

        if not payload or "data" not in payload:
            print(f"No data fetched for {snapshot_time}.")
            continue

        collected.extend(payload["data"])  # Keep only the match data

    return collected

if __name__ == "__main__":
    # Script entry point: read the API key from the environment, fetch one
    # odds snapshot per day over the configured date range, and save the
    # combined result as a single JSON file.
    load_dotenv()
    API_KEY = os.getenv("API_KEY")  # Load the API key from the .env file

    if not API_KEY:
        print("API key not found. Please set it in the .env file.")
        # Raise SystemExit directly instead of calling exit(): that helper is
        # injected by the site module for interactive use and is not
        # guaranteed to exist in every interpreter.
        raise SystemExit(1)

    # Parameters, change according to what you want to find
    sport = "soccer_epl"
    start_date = "2021-10-01"
    end_date = "2021-10-10"
    regions = "uk"
    markets = "h2h"
    odds_format = "decimal"

    client = HistoricalOddsAPIClient(API_KEY)

    # Fetch data for the date range
    all_data = fetch_daily_odds(client, sport, start_date, end_date, regions, markets, odds_format)

    if all_data:
        # The file name records the sport and the date range it covers.
        file_name = f"{sport}_odds_{start_date}_to_{end_date}"
        save_to_json(all_data, file_name)
    else:
        print("No data fetched for the specified date range.")


Fetching historical odds for soccer_epl on 2019-10-01T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-02T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-03T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-04T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-05T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-06T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-07T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-08T12:00:00Z...
Fetching historical odds for soccer_epl on 2019-10-09T12:00:00Z...


KeyboardInterrupt: 