In [1]:
import pandas as pd
from dotenv import load_dotenv

from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI

# Load environment variables from an explicit .env file before any client is
# created (presumably the api_helpers clients read their settings from the
# environment — TODO confirm).
# NOTE(review): this is an absolute path on one developer's machine, so the cell
# only runs there; consider a repo-relative or configurable location.
load_dotenv(
    dotenv_path="/Users/tomwattley/App/racing-api-project/racing-api-project/libraries/api-helpers/src/api_helpers/.env"
)
# Disabled wildcard imports, kept for reference only:
# from trader.fetch_requests import *
# from trader.prepare_requests import *
from api_helpers.clients import get_betfair_client, get_s3_client, get_postgres_client

# Notebook-level handle to whatever get_postgres_client() returns.
postgres_client = get_postgres_client()

2025-08-22T07:03:03Z | INFO - Logging configuration initialized with level: INFO


In [5]:
# Read the sample data from a CSV in the current user's Desktop folder.
# NOTE(review): the location is machine-specific, so this cell cannot be re-run
# from a fresh checkout without that file in place.
df = pd.read_csv('~/Desktop/test.csv')

In [6]:
df.columns

Index(['race_id', 'horse_id', 'horse_name', 'age', 'race_time', 'race_date',
       'race_title', 'race_type', 'race_class', 'distance', 'distance_yards',
       'distance_meters', 'distance_kilometers', 'conditions', 'going',
       'betfair_win_sp', 'number_of_runners', 'hcap_range', 'age_range',
       'surface', 'total_prize_money', 'first_place_prize_money', 'course_id',
       'course', 'data_type'],
      dtype='object')

In [17]:
def add_min_sp_skip_flag(df, threshold=4):
    """Flag every runner of a race whose lowest ``betfair_win_sp`` exceeds ``threshold``.

    The per-race result is OR-ed into an existing ``skip_flag`` column, so rows
    that are already flagged stay flagged. When the frame has no ``skip_flag``
    column yet, the column is created from the per-race result alone.

    Args:
        df: DataFrame with ``race_id`` and ``betfair_win_sp`` columns and,
            optionally, a boolean ``skip_flag`` column.
        threshold: A race is flagged when its minimum ``betfair_win_sp`` is
            strictly greater than this value. Defaults to 4.

    Returns:
        The same DataFrame object that was passed in; ``skip_flag`` is written
        onto it in place.
    """
    # transform("min") broadcasts each race's minimum SP back onto that race's
    # rows, yielding a boolean Series aligned with df after the comparison.
    # A race whose SPs are all missing has a NaN minimum, and NaN > threshold
    # is False, so such a race is not flagged.
    race_min_sp = df.groupby("race_id")["betfair_win_sp"].transform("min")
    exceeds_threshold = race_min_sp > threshold

    if "skip_flag" in df.columns:
        df["skip_flag"] = df["skip_flag"] | exceeds_threshold
    else:
        # No earlier flag to merge with: the per-race result is the flag.
        df["skip_flag"] = exceeds_threshold
    return df

In [18]:
# Synthetic fixture: two races of three runners each. Race 1 contains a runner
# priced at 2.5, while the lowest price in race 2 is 4.5. No row starts flagged.
test_data = dict(
    race_id=[1] * 3 + [2] * 3,
    race_title=['Test Race 1'] * 3 + ['Test Race 2'] * 3,
    betfair_win_sp=[2.5, 6.0, 3.2, 5.5, 8.0, 4.5],
    skip_flag=[False] * 6,
)
test_df = pd.DataFrame(data=test_data)


In [19]:
test_df

Unnamed: 0,race_id,race_title,betfair_win_sp,skip_flag
0,1,Test Race 1,2.5,False
1,1,Test Race 1,6.0,False
2,1,Test Race 1,3.2,False
3,2,Test Race 2,5.5,False
4,2,Test Race 2,8.0,False
5,2,Test Race 2,4.5,False


In [20]:
# Run the flag on the synthetic frame. Race 1 has a runner at 2.5, so its
# minimum SP is not above 4 and it stays unflagged; race 2's lowest SP is 4.5,
# so all three of its rows become flagged.
test_df = add_min_sp_skip_flag(test_df)
test_df

Unnamed: 0,race_id,race_title,betfair_win_sp,skip_flag
0,1,Test Race 1,2.5,False
1,1,Test Race 1,6.0,False
2,1,Test Race 1,3.2,False
3,2,Test Race 2,5.5,True
4,2,Test Race 2,8.0,True
5,2,Test Race 2,4.5,True


In [6]:
# Preview of the identifying, race-context and rating columns of `df`.
# NOTE(review): "official_rating", "rating", "speed_figure" and "unique_id" are
# not among the columns listed earlier for the frame read from test.csv, and this
# cell's execution count (6) repeats an earlier cell's — `df` was presumably a
# different frame when this ran. Confirm or remove: against the test.csv frame
# this selection would raise KeyError.
df[
    [
        "horse_name",
        "official_rating",
        "race_id",
        "horse_id",
        "race_date",
        "race_class",
        "race_type",
        "distance",
        "going",
        "surface",
        "course",
        "rating",
        "speed_figure",
        "unique_id",
    ]
]

Unnamed: 0,horse_name,official_rating,race_id,horse_id,race_date,race_class,race_type,distance,going,surface,course,rating,speed_figure,unique_id
0,Boom The Groom,62.0,813646,99370,2022-06-21,6,Flat,6f,Good To Firm,turf,Brighton,67,61,023889d994a650fe57c48e6d8a67678e1f12f51e92a964...
1,Boom The Groom,62.0,812445,99370,2022-06-07,6,Flat,6f,Good To Soft,turf,Brighton,66,46,a26d25841959cb254b9ffdff2f6444bb125fbac8f14e37...
2,Boom The Groom,64.0,811183,99370,2022-05-24,6,Flat,5f,Good To Soft,turf,Bath,56,48,d36b8702e6d37dfc9efc4198f26da6098593e0c855de88...
3,Boom The Groom,67.0,808547,99370,2022-04-25,5,Flat,5f,Good To Firm,turf,Windsor,41,29,36f2a95b661bd384165121ba9ce47060d81bd13b6e07eb...
4,Boom The Groom,70.0,804316,99370,2022-03-07,5,Flat,6f,Standard,tapeta,Wolverhampton,44,36,0d3f0d39c8315ddb7d5fe3032a12f7818f64b776df469d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Desert Team,73.0,811288,170057,2022-05-28,5,Flat,5f,Good To Firm,turf,Catterick,87,64,4a277c92154597700a0ccc3d21d09f06e88644472f5554...
123,Desert Team,73.0,808701,170057,2022-05-02,5,Flat,5f,Good,turf,Beverley,76,62,f4edb217d990770b6fac91d45f48a3fadc3f7f1f0451b9...
124,Desert Team,,807712,170057,2022-04-23,5,Flat,5f,Standard,tapeta,Wolverhampton,79,73,6715faa3af6aca2de6236742e732a04238211dc0d5ca2e...
125,Desert Team,,806511,170057,2022-04-06,5,Flat,6f,Standard To Slow,polytrack,Kempton,70,45,659c1089f94edcfc1dca17ed9cfd810e4dd8d3970048a2...


In [26]:
def add_all_skip_flags(df, races_to_ignore=None, max_fav_sp=4, max_runners=10):
    """Return a copy of ``df`` with a ``skip_flag`` column combining all skip rules.

    A row is flagged when ANY of the following holds for its race:

    1. The lower-cased ``race_title`` contains one of ``races_to_ignore``
       (substring match; rows with a missing title are not flagged by this rule).
    2. The race's minimum ``betfair_win_sp`` is strictly greater than
       ``max_fav_sp``.
    3. The race's maximum ``number_of_runners`` is strictly greater than
       ``max_runners`` AND rule 2 holds.

    The input frame is left unmodified: the intermediate flags are kept as local
    Series instead of being written onto ``df`` as scratch columns.

    Args:
        df: DataFrame with ``race_id``, ``race_title``, ``betfair_win_sp`` and
            ``number_of_runners`` columns.
        races_to_ignore: Iterable of words to look for in the race title.
            ``None`` (default) uses the built-in list; an empty iterable
            disables rule 1.
        max_fav_sp: SP threshold for rules 2 and 3. Defaults to 4.
        max_runners: Field-size threshold for rule 3. Defaults to 10.

    Returns:
        A new DataFrame equal to ``df`` plus a boolean ``skip_flag`` column
        (an existing ``skip_flag`` column is overwritten).
    """
    import re

    if races_to_ignore is None:
        races_to_ignore = [
            "shergar",
            "maiden",
            "novice",
            "hurdle",
            "chase",
            "hunter",
            "heritage",
        ]
    keywords = list(races_to_ignore)

    # Rule 1: race title contains an ignored word.
    if keywords:
        # Escape so each entry is matched literally; lower-case to match the
        # lower-cased title.
        pattern = "|".join(re.escape(word.lower()) for word in keywords)
        skip_race_type = df["race_title"].str.lower().str.contains(pattern, na=False)
    else:
        # An empty pattern would match every title, so disable the rule instead.
        skip_race_type = pd.Series(False, index=df.index)

    races = df.groupby("race_id")

    # Rule 2: the shortest-priced runner of the race is above the SP threshold.
    # transform() broadcasts the per-race value back onto each row of that race.
    skip_min_sp = races["betfair_win_sp"].transform("min") > max_fav_sp

    # Rule 3: large field AND rule 2.
    # NOTE(review): as written this rule can only be True where rule 2 already
    # is, so it never changes the combined flag — confirm the intended
    # condition (e.g. a different SP threshold for large fields).
    large_field = races["number_of_runners"].transform("max") > max_runners
    skip_runners_fav = large_field & skip_min_sp

    # Final flag: True if ANY rule fired.
    return df.assign(skip_flag=skip_race_type | skip_min_sp | skip_runners_fav)


# Apply the combined skip rules to the loaded frame and preview the columns
# that drive the flag. Note that this rebinds `df` to the frame returned by
# add_all_skip_flags.
df = add_all_skip_flags(df)
df[
    [
        "race_id",
        "race_title",
        "betfair_win_sp",
        "number_of_runners",
        "skip_flag",
    ]
]

Unnamed: 0,race_id,race_title,betfair_win_sp,number_of_runners,skip_flag
0,816799,Dubai Duty Free Shergar Cup Curtain Raiser Cla...,36.00,10,True
1,816799,Dubai Duty Free Shergar Cup Curtain Raiser Cla...,7.60,10,True
2,816799,Dubai Duty Free Shergar Cup Curtain Raiser Cla...,8.80,10,True
3,816799,Dubai Duty Free Shergar Cup Curtain Raiser Cla...,8.10,10,True
4,816799,Dubai Duty Free Shergar Cup Curtain Raiser Cla...,4.40,10,True
...,...,...,...,...,...
330,816811,Watch Race Replays On Racing TV Handicap,7.40,11,True
331,816811,Watch Race Replays On Racing TV Handicap,17.50,11,True
332,816811,Watch Race Replays On Racing TV Handicap,9.00,11,True
333,816811,Watch Race Replays On Racing TV Handicap,4.90,11,True
