In [41]:
import pandas as pd
from dotenv import load_dotenv

from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI

from api_helpers.clients import get_postgres_client


# Load settings from the api-helpers .env file before anything reads them.
# The location can be overridden with the API_HELPERS_ENV_FILE environment
# variable so the notebook is not tied to a single developer's machine; the
# original absolute path is kept as the fallback so existing runs are unchanged.
import os

load_dotenv(
    dotenv_path=os.environ.get(
        "API_HELPERS_ENV_FILE",
        "/Users/tomwattley/App/racing-api-project/racing-api-project/libraries/api-helpers/src/api_helpers/.env",
    )
)

# Imported after load_dotenv() on purpose — presumably `config` reads the
# environment at import time (TODO confirm against api_helpers.config).
from api_helpers.config import config

# Shared Postgres client used by the query cells further down.
pg = get_postgres_client()


In [42]:
# Matchbook REST client: session-token login plus one automatic re-login on auth failure.
import requests
import json
from typing import Any

class MatchbookClient:
    def __init__(
        self,
        username: str,
        password: str,
        base_urls: dict | None = None,
        user_agent: str = "api-doc-test-client",
        default_service: str = "bpapi",
        timeout: float | None = 15.0,
    ):
        self.username = username
        self.password = password
        self.base_urls = base_urls or {
            "bpapi": "https://api.matchbook.com/bpapi/rest",
            "edge": "https://api.matchbook.com/edge/rest",
        }
        self.default_service = default_service
        self.timeout = timeout

        self.session = requests.Session()
        self.session.headers.update({
            "accept": "application/json",
            "User-Agent": user_agent,
            "content-type": "application/json;charset=UTF-8",
        })
        self.token = None

    def _build_url(self, service: str, path: str) -> str:
        base = self.base_urls[service].rstrip("/")
        return f"{base}/{path.lstrip('/')}"

    def login(self):
        r = self.session.post(
            self._build_url("bpapi", "security/session"),
            json={"username": self.username, "password": self.password},
            timeout=self.timeout,
        )
        r.raise_for_status()
        self.token = r.json()["session-token"]
        self.session.headers["session-token"] = self.token
        return self.token

    def logout(self):
        try:
            self.session.delete(
                self._build_url("bpapi", "security/session"),
                timeout=self.timeout,
            )
        finally:
            self.token = None
            self.session.headers.pop("session-token", None)

    def _needs_reauth(self, r):
        if r.status_code in (401, 403):
            return True
        try:
            body = r.json()
            if isinstance(body, dict):
                text = json.dumps(body)
                if "AUTHENTICATION_REQUIRED" in text or "INVALID_SESSION" in text:
                    return True
        except Exception:
            if "AUTHENTICATION_REQUIRED" in r.text or "INVALID_SESSION" in r.text:
                return True
        return False

    def request(
        self,
        method: str,
        path: str,
        *,
        service: str | None = None,
        retries: int = 1,
        ensure_login: bool = True,
        params: dict | None = None,
        json: Any = None,
        data: Any = None,
        headers: dict | None = None,
        files: dict | None = None,
        **kwargs,
    ):
        if ensure_login and not self.token:
            self.login()

        url = path if path.startswith("http") else self._build_url(service or self.default_service, path)
        if "timeout" not in kwargs and self.timeout is not None:
            kwargs["timeout"] = self.timeout

        r = self.session.request(
            method.upper(),
            url,
            params=params,
            json=json,
            data=data,
            headers=headers,  # merged with session headers by requests
            files=files,
            **kwargs,
        )
        if self._needs_reauth(r) and retries > 0:
            self.login()
            return self.request(
                method,
                path,
                service=service,
                retries=retries - 1,
                ensure_login=False,
                params=params,
                json=json,
                data=data,
                headers=headers,
                files=files,
                **kwargs,
            )
        return r

    def get(self, path: str, *, service: str | None = None, params: dict | None = None, **kwargs):
        return self.request("GET", path, service=service, params=params, **kwargs)

    def post(self, path: str, *, service: str | None = None, json: Any = None, data: Any = None, **kwargs):
        return self.request("POST", path, service=service, json=json, data=data, **kwargs)

    def delete(self, path: str, *, service: str | None = None, **kwargs):
        return self.request("DELETE", path, service=service, **kwargs)

# Usage: build the shared client from the credentials on `config`.
# No network call happens here — __init__ only prepares the session; the
# first request() performs the login because no token is held yet.
client = MatchbookClient(config.mb_username, config.mb_password)


In [62]:
# Fetch Matchbook horse-racing events and flatten WIN/place prices into a DataFrame.
import pandas as pd
from zoneinfo import ZoneInfo

# Sport id sent as the "sport-ids" filter when listing horse-racing events.
HORSE_RACING_SPORT_ID = "24735152712200"


def fetch_horseracing_events_by_tag(client, tag_url_names: str = "uk", include_prices: bool = True) -> list[dict]:
    """Fetch every open/suspended horse-racing event for a tag, following pagination.

    Args:
        client: Object exposing ``get(path, service=..., params=...)`` that
            returns a response with ``raise_for_status()`` and ``json()``.
        tag_url_names: Value for the ``tag-url-names`` filter.
        include_prices: Whether to ask the API to embed prices in each runner.

    Returns:
        The concatenated ``events`` lists from all pages.

    Raises:
        Whatever ``raise_for_status()`` raises for a failed page request.
    """
    base_params = {
        "sport-ids": HORSE_RACING_SPORT_ID,
        "tag-url-names": tag_url_names,
        "states": "open,suspended",   # include suspended to see more markets
        # Sent as a lowercase literal rather than Python's "True"/"False".
        "include-prices": "true" if include_prices else "false",
        "odds-type": "DECIMAL",
        "price-depth": 3,
        "per-page": 200,
    }
    events: list[dict] = []
    offset = 0
    while True:
        # A fresh dict per call so the params of earlier requests are not mutated.
        r = client.get("events", service="edge", params={**base_params, "offset": offset})
        r.raise_for_status()
        data = r.json()
        page = data.get("events", []) or []
        events.extend(page)
        total = int(data.get("total") or 0)  # tolerate a missing or null total
        if not page or len(events) >= total:
            break
        # Advance by what was actually returned: if the server caps the page
        # size below the requested "per-page", stepping by 200 would skip events.
        offset += len(page)
    return events


def create_market_data(market_map: dict) -> pd.DataFrame:

    events = fetch_horseracing_events_by_tag(client, tag_url_names="uk")

    rows = []
    for e in events:
        course = next((t.get("name") for t in e.get("meta-tags", []) or [] if t.get("type") == "LOCATION"), None)
        race_time = pd.to_datetime(e["start"], utc=True).tz_convert(ZoneInfo("Europe/London")).tz_localize(None)
        for m in e.get("markets", []) or []:
            mname = (m.get("name") or "").strip()
            is_win = mname.upper() == "WIN"
            is_place = mname.lower().startswith("place")
            if not (is_win or is_place):
                continue
            for r in m.get("runners", []) or []:
                prices = r.get("prices", []) or []
                backs = [p for p in prices if p.get("side") == "back"]
                lays  = [p for p in prices if p.get("side") == "lay"]
                best_back = max(backs, key=lambda p: p["decimal-odds"]) if backs else None
                best_lay  = min(lays,  key=lambda p: p["decimal-odds"]) if lays  else None
                rows.append({
                    "event_id": e["id"],
                    "course": course,
                    "race_time": race_time,
                    "market_id": m["id"],
                    "market_name": mname,  # "WIN" or "Place (n)"
                    "runner_id": r["id"],
                    "runner_name": r["name"],
                    "best_back_odds": best_back["decimal-odds"] if best_back else None,
                    "best_back_available": best_back["available-amount"] if best_back else None,
                    "best_lay_odds": best_lay["decimal-odds"] if best_lay else None,
                    "best_lay_available": best_lay["available-amount"] if best_lay else None,
                })

    df = pd.DataFrame(rows)
    df["runner_name"] = df["runner_name"].str.replace(r"^\s*\d+\s*\.?\s*", "", regex=True).str.strip()

    df['market_win_place'] = df['market_name'].map(market_map).astype(int)

    df['best_back_available'] = df['best_back_available'].fillna(0).round(0).astype(int)
    df['best_lay_available'] = df['best_lay_available'].fillna(0).round(0).astype(int)

    final_df = df[
        [
            "event_id","course","race_time","market_id", 'market_win_place',
            "runner_id","runner_name","best_back_odds","best_back_available","best_lay_odds","best_lay_available",
        ]
    ].rename(
        columns={'runner_name': 'horse_name'}
    ).sort_values(["race_time","course","event_id","horse_name"]).reset_index(drop=True)

    return final_df
# display(final_df.head(30))
# ...existing code...

In [63]:
# Market name -> numeric code: the WIN market is 1 and each "Place (n)"
# market is n, for n from 2 to 10.
MARKET_MAP = dict(
    zip(
        (
            "WIN",
            "Place (2)",
            "Place (3)",
            "Place (4)",
            "Place (5)",
            "Place (6)",
            "Place (7)",
            "Place (8)",
            "Place (9)",
            "Place (10)",
        ),
        range(1, 11),
    )
)

# Build the Matchbook price table. This performs live HTTP requests through
# the module-level `client` (create_market_data fetches the events itself).
df = create_market_data(MARKET_MAP)

In [64]:
df

Unnamed: 0,event_id,course,race_time,market_id,market_win_place,runner_id,horse_name,best_back_odds,best_back_available,best_lay_odds,best_lay_available
0,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927579701058,1,31231927580100058,Beyond The Bar,5.60,2,13.50,2
1,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927583600058,2,31231927584100058,Beyond The Bar,2.24,2,3.30,2
2,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927585700058,3,31231927586202058,Beyond The Bar,1.57,2,1.94,2
3,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927579701058,1,31231927580303058,Maxwellcan,13.50,2,21.00,1
4,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927583600058,2,31231927584306058,Maxwellcan,3.55,2,6.40,2
...,...,...,...,...,...,...,...,...,...,...,...
715,31232518611801061,Southwell,2025-09-21 18:45:00,31232518749200061,2,31232518749504061,Sax Appeal,1.83,2,3.55,2
716,31232518611801061,Southwell,2025-09-21 18:45:00,31232518746700061,1,31232518747506061,Simiyann,16.00,2,65.00,3
717,31232518611801061,Southwell,2025-09-21 18:45:00,31232518749200061,2,31232518750002061,Simiyann,2.00,5,13.00,2
718,31232518611801061,Southwell,2025-09-21 18:45:00,31232518746700061,1,31232518747205061,Zealandia,3.65,3,10.00,2


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   event_id             720 non-null    int64         
 1   course               720 non-null    object        
 2   race_time            720 non-null    datetime64[ns]
 3   market_id            720 non-null    int64         
 4   market_win_place     720 non-null    int64         
 5   runner_id            720 non-null    int64         
 6   horse_name           720 non-null    object        
 7   best_back_odds       716 non-null    float64       
 8   best_back_available  720 non-null    int64         
 9   best_lay_odds        716 non-null    float64       
 10  best_lay_available   720 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(6), object(2)
memory usage: 62.0+ KB


In [39]:
# Save the price table as CSV (without the index column) for offline inspection.
# NOTE(review): the output path is hard-coded to the current user's Desktop.
df.to_csv('~/Desktop/matchbook.csv', index=False)

In [40]:
# Scratch check: compares two similar-looking id strings for equality.
'31223512809500061' == '31223512809901061'

False

In [None]:
# Load the runner list from public.todays_data via the Postgres client.
# Presumably fetch_data returns a DataFrame (later cells index it by a list of
# column names) — TODO confirm against api_helpers.
td = pg.fetch_data(
    """
        SELECT 
            race_time, 
            horse_name,
            race_id,
            horse_id,
            meeting_id,
            course_id,
            course 
        FROM 
            public.todays_data
    """
)

In [55]:
def format_horse_name(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with a ``filtered_horse_name`` column added.

    The new column is ``horse_name`` with apostrophes and spaces removed, any
    parenthesised part (such as a country suffix) stripped, and the result
    trimmed and lower-cased. The input frame is not modified.
    """
    names = data["horse_name"]
    names = names.str.replace("'", "")
    names = names.str.replace(" ", "")
    names = names.str.replace(r"\(.*?\)", "", regex=True)
    normalised = names.str.strip().str.lower()
    return data.assign(filtered_horse_name=normalised)

In [56]:
# Keep only the identifier columns needed to match runners against Matchbook.
# These are the same columns the query selects, in a different order.
sf = td[
        [
            'race_time',
            "horse_name",
            "race_id",
            "horse_id",
            "meeting_id",
            "course_id",
            "course",
        ]
    ]


In [66]:
# Add the normalised `filtered_horse_name` join key to both frames.
# Both names are rebound to the new frames that format_horse_name returns.
sf = format_horse_name(sf)
df = format_horse_name(df)

In [67]:
sf

Unnamed: 0,race_time,horse_name,race_id,horse_id,meeting_id,course_id,course,filtered_horse_name
0,2025-09-21 15:50:00,Howth,901711,164511,734e878e07fadf0d593ab548b05d6a9a41f15eb36eaedb...,63,Plumpton,howth
1,2025-09-21 14:43:00,Stormy Pearl,901663,176537,029aecee69daa4d3ff7accd92ddd44bdb552739e5eaa9c...,33,Hamilton,stormypearl
2,2025-09-21 14:20:00,Fine By Me,901715,170097,734e878e07fadf0d593ab548b05d6a9a41f15eb36eaedb...,63,Plumpton,finebyme
3,2025-09-21 16:37:00,Alhather,901811,189187,78236d107b68763e15829ff74fed1e189b7f6b0324ba40...,73,Southwell Aw,alhather
4,2025-09-21 16:13:00,Arkenstaar,901660,178654,029aecee69daa4d3ff7accd92ddd44bdb552739e5eaa9c...,33,Hamilton,arkenstaar
...,...,...,...,...,...,...,...,...
191,2025-09-21 16:20:00,Square Du Roule,901716,188356,734e878e07fadf0d593ab548b05d6a9a41f15eb36eaedb...,63,Plumpton,squareduroule
192,2025-09-21 15:43:00,Iris Dancer,901657,156768,029aecee69daa4d3ff7accd92ddd44bdb552739e5eaa9c...,33,Hamilton,irisdancer
193,2025-09-21 14:43:00,Bring Her Home,901663,6680948,029aecee69daa4d3ff7accd92ddd44bdb552739e5eaa9c...,33,Hamilton,bringherhome
194,2025-09-21 16:43:00,Parisiac,901659,170660,029aecee69daa4d3ff7accd92ddd44bdb552739e5eaa9c...,33,Hamilton,parisiac


In [68]:
df

Unnamed: 0,event_id,course,race_time,market_id,market_win_place,runner_id,horse_name,best_back_odds,best_back_available,best_lay_odds,best_lay_available,filtered_horse_name
0,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927579701058,1,31231927580100058,Beyond The Bar,5.60,2,13.50,2,beyondthebar
1,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927583600058,2,31231927584100058,Beyond The Bar,2.24,2,3.30,2,beyondthebar
2,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927585700058,3,31231927586202058,Beyond The Bar,1.57,2,1.94,2,beyondthebar
3,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927579701058,1,31231927580303058,Maxwellcan,13.50,2,21.00,1,maxwellcan
4,31231927517200058,Hamilton,2025-09-21 14:13:00,31231927583600058,2,31231927584306058,Maxwellcan,3.55,2,6.40,2,maxwellcan
...,...,...,...,...,...,...,...,...,...,...,...,...
715,31232518611801061,Southwell,2025-09-21 18:45:00,31232518749200061,2,31232518749504061,Sax Appeal,1.83,2,3.55,2,saxappeal
716,31232518611801061,Southwell,2025-09-21 18:45:00,31232518746700061,1,31232518747506061,Simiyann,16.00,2,65.00,3,simiyann
717,31232518611801061,Southwell,2025-09-21 18:45:00,31232518749200061,2,31232518750002061,Simiyann,2.00,5,13.00,2,simiyann
718,31232518611801061,Southwell,2025-09-21 18:45:00,31232518746700061,1,31232518747205061,Zealandia,3.65,3,10.00,2,zealandia


In [74]:
# Map each (race_id, horse_id) to a Matchbook (event_id, runner_id) by joining
# on the normalised horse name and the race time.
# NOTE(review): `df` has one row per runner per market, and the displayed data
# shows a different runner_id for the same horse in the WIN and place markets.
# The left merge therefore fans out, and drop_duplicates keeps whichever
# market's runner_id comes first. Consider filtering `df` to a single market
# (e.g. market_win_place == 1) before merging — confirm intended behaviour.
sf.merge(df, on=["filtered_horse_name", "race_time"], how="left")[
    ["race_id", "horse_id", "event_id", "runner_id"]
].drop_duplicates(subset=["race_id", "horse_id"])

Unnamed: 0,race_id,horse_id,event_id,runner_id
0,901711,164511,31231925485802061,31231925540600061
2,901663,176537,31231927517402058,31231927580901058
6,901715,170097,31231925485000061,31231925540703061
10,901811,189187,31232518611102061,31232518720020061
14,901660,178654,31231927518002058,31231927580005058
...,...,...,...,...
697,901716,188356,31231925486001061,31231925541905061
701,901657,156768,31231927517801058,31231927583606058
705,901663,6680948,31231927517402058,31231927582204058
709,901659,170660,31231927518201058,31231927581909058
