In [1]:
import pandas as pd
from dotenv import load_dotenv
import numpy as np
from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
import re 
from api_helpers.clients import get_postgres_client


load_dotenv(
    dotenv_path="/Users/tomwattley/App/racing-api-project/racing-api-project/libraries/api-helpers/src/api_helpers/.env"
)

pg = get_postgres_client()

2026-02-10T20:32:38Z | INFO - Logging configuration initialized with level: INFO


In [2]:

def convert_distance_to_float(distance_str):
    text_code_to_numeric = {
        "dht": 0,
        "nse": 0.01,
        "shd": 0.1,
        "sht-hd": 0.1,
        "hd": 0.2,
        "sht-nk": 0.3,
        "snk": 0.3,
        "nk": 0.5,
        "dist": 999,
    }
    if pd.isna(distance_str) or not distance_str:
        return 0.0
    clean_str = distance_str.strip("[]")

    if clean_str in text_code_to_numeric:
        return text_code_to_numeric[clean_str]

    if not clean_str:
        return 0.0

    match = re.match(r"(\d+)?(?:\s*)?([½¼¾⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞])?", clean_str)
    whole_number, fraction = match[1], match[2]

    whole_number_part = float(whole_number) if whole_number else 0.0

    fraction_to_decimal = {
        "½": 0.5,
        "⅓": 0.33,
        "⅔": 0.66,
        "¼": 0.25,
        "¾": 0.75,
        "⅕": 0.2,
        "⅖": 0.4,
        "⅗": 0.6,
        "⅘": 0.8,
        "⅙": 0.167,
        "⅚": 0.833,
        "⅛": 0.125,
        "⅜": 0.375,
        "⅝": 0.625,
        "⅞": 0.875,
    }
    fraction_part = fraction_to_decimal.get(fraction, 0.0)

    return whole_number_part + fraction_part

def get_adj_total_distance_beaten(df):
    # fmt: off

    df = df.copy()
    if len(df) == 1 and df["finishing_position"].iloc[0] == "1":
        df["adj_total_distance_beaten"] = "WO"
        return df

    if df["total_distance_beaten"].unique().tolist() == [""]:
        df["adj_total_distance_beaten"] = "FOG"
        return df

    if len(df) == 2 and df["finishing_position"].tolist() == ["1", "1"]:
        df["adj_total_distance_beaten"] = "0"
        return df

    if len(df) == 2 and len(df[df["finishing_position"] == "1"]) == 1:
        df["adj_total_distance_beaten"] = np.select(
            [df["finishing_position"] == "1"],
            ["0"],
            df["finishing_position"].astype(str),
        )
        return df

    df = df.assign(
        float_total_distance_beaten=df["total_distance_beaten"].apply(
            convert_distance_to_float
        ),
    )
    df = df.assign(
        float_total_distance_beaten=df["float_total_distance_beaten"].round(2)
    )

    second_place = "2"

    if len(df[df["finishing_position"] == "1"]) > 1:
        finishing_positions = df["finishing_position"].unique()
        numeric_finishing_positions = pd.to_numeric(
            finishing_positions, errors="coerce"
        )
        second_place = str(
            numeric_finishing_positions[numeric_finishing_positions > 1].min()
        ).replace(".0", "")

    dsq_df = df[
        (pd.to_numeric(df["finishing_position"], errors="coerce") > 1)
        & (df["total_distance_beaten"] == "")
    ]
    if not dsq_df.empty:
        winning_distance = df[df["finishing_position"] == "1"][
            "float_total_distance_beaten"
        ].iloc[0]
        df["float_total_distance_beaten"] = (
            df["float_total_distance_beaten"] - winning_distance
        )

    # Handle case where second_place doesn't exist
    second_place_df = df[df["finishing_position"] == second_place]
    if second_place_df.empty:
        winner_adj_value = 0.0
    else:
        winner_adj_value = -second_place_df["float_total_distance_beaten"].iloc[0]

    df = df.assign(
        adj_total_distance_beaten=np.select(
            [
                df["finishing_position"].str.contains(r"[A-Za-z]", na=False),
                (df["finishing_position"] == "0") & (df["total_distance_beaten"] == ""),
                (df["finishing_position"] != "1") & (df["total_distance_beaten"] == ""),
                (df["finishing_position"] == "1"),
            ],
            [
                df["finishing_position"],
                "UND",
                "(DSQ)",
                winner_adj_value,
            ],
            df["float_total_distance_beaten"],
        )
    ).drop(columns=["float_total_distance_beaten"])
    df = df.assign(
        adj_total_distance_beaten=np.where(
            df["adj_total_distance_beaten"] == "(DSQ)",
            "(DSQ) " + df["finishing_position"],
            df["adj_total_distance_beaten"],
        )
    )
    df["adj_total_distance_beaten"] = df["adj_total_distance_beaten"].astype(str)
    return df

# fmt: on

In [3]:
df = pg.fetch_data("""
            select 
                unique_id, 
                race_time, 
                race_id,
                finishing_position, 
                total_distance_beaten
            from rp_raw.results_data 
            where race_time > '2020-01-01'
                
            """)

In [4]:
df

Unnamed: 0,unique_id,race_time,race_id,finishing_position,total_distance_beaten
0,5bfddbc66e4fd86a41305d33661a487ffcb4e1e751a8c3...,2024-12-09 14:30:00,881297,7,[12]
1,c6763801c79aa64b80255bcadcf7c1730f6baa248b3c3d...,2024-12-17 12:45:00,882349,6,[21¼]
2,0fed0179cf47501fe29ebb1ce5ff992a83c0e2d91ba9fe...,2024-12-21 15:40:00,884247,15,[29¾]
3,e50e63b31fdad943c174d49dfdaf25f7f2e46068310002...,2024-10-27 14:12:00,877731,10,[28½]
4,48ae029bf65491aad061a7bd8032c6d8e5e2c06b49eec2...,2024-12-29 14:17:00,884709,PU,
...,...,...,...,...,...
725633,c54e1481a4c132b7eff4bf7ef96ca69a12593c96a29e6e...,2024-10-29 13:20:00,877743,7,[37¼]
725634,954257460ad2eb835323104a8a8640b882f1322632f7d9...,2024-10-29 13:20:00,877743,6,[32¼]
725635,900b8c8e4d7daec38e3804930dd60a53d9ee7b06cbf319...,2024-10-30 15:30:00,877760,PU,
725636,da3638319ae81ed8268eba82ebe63a1db5ee1e099984e6...,2024-10-31 15:15:00,877771,13,[47¼]


In [18]:
sp = df[(df['finishing_position'] == '2')][['race_id']]

In [7]:
ttf = pd.merge(df, sp, how='inner', on='race_id')

In [19]:
ttf = df[~df['race_id'].isin(sp['race_id'])]

In [22]:
get_adj_total_distance_beaten(ttf[ttf['race_id'] == '879182'])

Unnamed: 0,unique_id,race_time,race_id,finishing_position,total_distance_beaten,adj_total_distance_beaten
1361,bf7d871dfef2a91db6ced83c33b20ab876b7b844151adb...,2024-11-07 11:40:00,879182,5,[½],0.5
1362,209e6d714ab26a4aeedef3e7de0b4c5218bb8781546e8a...,2024-11-07 11:40:00,879182,6,[2],2.0
1363,0a4645690e7cec6bf10a3bd626544b778c69f15114708a...,2024-11-07 11:40:00,879182,7,[2½],2.5
1364,0e284358a9c034a43f5b38a4fcb921aafadcbfe2428bf5...,2024-11-07 11:40:00,879182,8,[3],3.0
1365,d19a0a6fa036689a9b2b208d4b9db832967f96d8b14eae...,2024-11-07 11:40:00,879182,9,[4¼],4.25
1366,7349cce27978ff42817372408ec4785c42fdb83fda0c9c...,2024-11-07 11:40:00,879182,10,[6½],6.5
167140,cad71129442d7a24700d08991971ffbce59f470eda5f9f...,2024-11-07 11:40:00,879182,12,[7¾],7.75
398893,0c6f09c8a10fb368f0fa09af95980e4952b1330218e6cc...,2024-11-07 11:40:00,879182,1,,-0.01
398894,64184bea3ff5aa241e087d21885823a693bb2e33003bbd...,2024-11-07 11:40:00,879182,1,dht,-0.01
415025,e086a980715318b1cd434869d099864b7132c05682127b...,2024-11-07 11:40:00,879182,3,[nse],0.01


In [8]:
# Apply the function to all races grouped by race_id
result_df = ttf.groupby('race_id', group_keys=False).apply(get_adj_total_distance_beaten)

# Prepare the update data
update_df = result_df[['unique_id', 'adj_total_distance_beaten']].copy()

  result_df = ttf.groupby('race_id', group_keys=False).apply(get_adj_total_distance_beaten)


In [9]:
update_df

Unnamed: 0,unique_id,adj_total_distance_beaten
0,5bfddbc66e4fd86a41305d33661a487ffcb4e1e751a8c3...,12.0
1,c6763801c79aa64b80255bcadcf7c1730f6baa248b3c3d...,21.25
2,0fed0179cf47501fe29ebb1ce5ff992a83c0e2d91ba9fe...,29.75
3,e50e63b31fdad943c174d49dfdaf25f7f2e46068310002...,28.5
4,48ae029bf65491aad061a7bd8032c6d8e5e2c06b49eec2...,PU
...,...,...
725632,c54e1481a4c132b7eff4bf7ef96ca69a12593c96a29e6e...,37.25
725633,954257460ad2eb835323104a8a8640b882f1322632f7d9...,32.25
725634,900b8c8e4d7daec38e3804930dd60a53d9ee7b06cbf319...,PU
725635,da3638319ae81ed8268eba82ebe63a1db5ee1e099984e6...,47.25


In [11]:
pg.store_data(update_df, 'results_data_distance', 'rp_raw')