In [202]:
import pandas as pd
from dotenv import load_dotenv

from pydantic import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI

from api_helpers.clients import get_postgres_client


load_dotenv(
    dotenv_path="/Users/tomwattley/App/racing-api-project/racing-api-project/libraries/api-helpers/src/api_helpers/.env"

)

# pg = get_postgres_client()


True

In [203]:
import numpy as np
import pandas as pd
from numba import njit

@njit(cache=True)
def _simulate_loop(base_probs, n_sims, n_places):
    """JIT-compiled inner loop - no seed setting per call."""
    n_horses = len(base_probs)
    win_counts = np.zeros(n_horses, dtype=np.int32)
    place_counts = np.zeros(n_horses, dtype=np.int32)
    
    # Pre-allocate arrays once (reuse in loop)
    ps = np.empty(n_horses, dtype=np.float64)
    indices = np.empty(n_horses, dtype=np.int32)
    cumsum_buf = np.empty(n_horses, dtype=np.float64)
    
    for _ in range(n_sims):
        # Reset to full field
        ps[:] = base_probs
        for idx in range(n_horses):
            indices[idx] = idx
        n_remaining = n_horses
        
        K = min(n_places, n_horses)
        for i in range(K):
            # Normalize in-place
            ps_sum = 0.0
            for j in range(n_remaining):
                ps_sum += ps[j]
            if ps_sum > 0:
                for j in range(n_remaining):
                    ps[j] /= ps_sum
            
            # Cumsum and sample
            cumsum_buf[0] = ps[0]
            for j in range(1, n_remaining):
                cumsum_buf[j] = cumsum_buf[j-1] + ps[j]
            
            rand_val = np.random.rand()
            idx = 0
            for j in range(n_remaining):
                if rand_val <= cumsum_buf[j]:
                    idx = j
                    break
            
            horse_idx = indices[idx]
            
            # Record win/place
            if i == 0:
                win_counts[horse_idx] += 1
            place_counts[horse_idx] += 1
            
            # Remove selected horse (shift arrays left)
            for j in range(idx, n_remaining - 1):
                indices[j] = indices[j + 1]
                ps[j] = ps[j + 1]
            n_remaining -= 1
    
    return win_counts, place_counts


def simulate_place_counts(
    df, price_col="betfair_win_sp", horse_col='horse_name', n_places=3, n_sims=10000, seed=42
):
    
    map_bf = dict(zip(df['horse_name'], df['betfair_place_sp']))
    """Optimized - assumes unique horses in df."""
    horses = df[horse_col].values
    prices = df[price_col].values
    
    # Compute implied probabilities
    base_probs = 1.0 / prices
    base_probs = base_probs / base_probs.sum()

    np.random.seed(seed)
    win_counts, place_counts = _simulate_loop(base_probs, n_sims, n_places)

    out = pd.DataFrame({
        "horse": horses,
        "win_prob": win_counts / n_sims,
        "place_prob_topN": place_counts / n_sims,
    }).sort_values(["place_prob_topN", "win_prob"], ascending=False).reset_index(drop=True)
    
    out["sim_place_sp"] = 1 / out["place_prob_topN"]
    return out

In [204]:
import pandas as pd
import numpy as np
# df = pd.DataFrame(
#     {
#         "horse": ['horse_a',
#                 'horse_b',
#                 'horse_c',
#                 'horse_d',
#                 'horse_e',
#                 'horse_f',
#                 'horse_g',
#                 'horse_h',
#                 'horse_i',
#                    ],
#         "betfair_win_sp": [2.6, 5.3, 9.4, 10.0, 14.0, 18.0, 20.0, 30.0, 75.0],

#     }
# )
df =  pd.read_csv('~/Desktop/test.csv')
df['proba'] = ((1/df['betfair_win_sp']).astype(float) / (1/df['betfair_win_sp']).astype(float).sum()).round(2)
df = df[[
    'horse_name',
    'betfair_win_sp',
    'betfair_place_sp',
    'proba'
]].astype(
    {
        'betfair_win_sp': 'float',
        'betfair_place_sp': 'float'
    }
)


In [192]:
p

Unnamed: 0,horse,win_prob,place_prob_topN,sim_place_sp,horse_name,betfair_win_sp,betfair_place_sp,proba,sim_proba
0,Twilight Madness,0.3441,0.7733,1.293159,Twilight Madness,2.9,1.5,0.35,0.77
1,Khabib,0.149,0.4695,2.129925,Khabib,7.0,2.2,0.14,0.47
2,High Velocity,0.1249,0.4141,2.414876,High Velocity,8.1,2.5,0.12,0.41
3,Be Proud,0.1173,0.3871,2.583312,Be Proud,8.7,2.7,0.12,0.39
4,Ramon Di Loria,0.1089,0.3781,2.644803,Ramon Di Loria,9.0,2.9,0.11,0.38
5,Army Of India,0.0688,0.2483,4.027386,Army Of India,14.7,3.4,0.07,0.25
6,Ballyare,0.0584,0.2172,4.604052,Ballyare,16.6,4.1,0.06,0.22
7,Primos Comet,0.0286,0.1124,8.896797,Primos Comet,33.7,6.7,0.03,0.11


In [205]:
res = simulate_place_counts(df, n_places=3, n_sims=10000, seed=7)
p = res.merge(df, left_on='horse', right_on='horse_name')
p['sim_proba'] = (1/p['sim_place_sp']).round(2)
p['place_proba'] = (1/p['betfair_place_sp']).round(2)
p['diff_proba'] = p['sim_proba'] - p['place_proba']
# p['inverse_lay'] = 1/

In [206]:
p[
    [
        "horse_name",
        "betfair_place_sp",
        "sim_place_sp",
        "place_proba",
        "sim_proba",
        "diff_proba"
    ]
]

Unnamed: 0,horse_name,betfair_place_sp,sim_place_sp,place_proba,sim_proba,diff_proba
0,Twilight Madness,1.5,1.278936,0.67,0.78,0.11
1,Khabib,2.2,2.185792,0.45,0.46,0.01
2,High Velocity,2.5,2.43843,0.4,0.41,0.01
3,Be Proud,2.7,2.515723,0.37,0.4,0.03
4,Ramon Di Loria,2.9,2.644803,0.34,0.38,0.04
5,Army Of India,3.4,4.140787,0.29,0.24,-0.05
6,Ballyare,4.1,4.44247,0.24,0.23,-0.01
7,Primos Comet,6.7,9.23361,0.15,0.11,-0.04


In [23]:
sum(res['place_prob_topN'])

3.0

In [199]:
(1/6.7) + 1

1.1492537313432836

In [4]:
res

Unnamed: 0,horse,win_prob,place_prob_topN,sim_place_sp
0,horse_a,0.388,0.822,1.216545
1,horse_b,0.1803,0.5826,1.716444
2,horse_c,0.1064,0.3776,2.648305
3,horse_d,0.0992,0.3544,2.82167
4,horse_e,0.0757,0.2687,3.721623
5,horse_f,0.0584,0.2198,4.549591
6,horse_g,0.0489,0.1913,5.227392
7,horse_h,0.0321,0.1314,7.61035
8,horse_i,0.011,0.0522,19.157088


In [119]:
df = pg.fetch_data("""           
        SELECT 
            race_date,
            race_id, 
            horse_name, 
            betfair_win_sp, 
            betfair_place_sp, 
            finishing_position,
            number_of_runners,
            race_class,
            race_type,
            weeks_since_last_ran
        FROM public.unioned_results_data 
        WHERE race_id IN (
            SELECT race_id
            FROM public.unioned_results_data
            WHERE betfair_win_sp > 2 
            AND betfair_win_sp < 3
            AND race_date > '2020-01-01'
            AND EXTRACT(MONTH FROM race_date) NOT IN (4, 10)
        )""")

In [120]:
df

Unnamed: 0,race_date,race_id,horse_name,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran
0,2020-06-27,759652,Brando,9.99,3.25,2,11,1.0,Flat,3
1,2020-08-22,761933,Brando,30.90,5.61,6,8,1.0,Flat,6
2,2020-07-29,763775,Blakeney Point,2.06,1.32,1,11,4.0,Hurdle,33
3,2020-12-30,772721,Moidore,7.40,2.07,4,9,5.0,Hurdle,5
4,2020-07-23,763379,Moscato,2.70,,1,8,,Hurdle,44
...,...,...,...,...,...,...,...,...,...,...
152457,2025-09-30,904681,Almeiyda,23.33,5.50,11,16,,Flat,0
152458,2025-09-30,904681,Itsyouitsyouitsyou,137.45,26.00,15,16,,Flat,0
152459,2025-09-30,904681,Lamberella,597.87,150.00,16,16,,Flat,0
152460,2025-09-30,904681,Keepsgettingbetter,75.00,12.50,14,16,,Flat,0


In [121]:
prices = []
for i, race_id in enumerate(df['race_id'].unique()):
    print(f"running {i} iteration of {len(df['race_id'].unique())} sims")
    sub_df = df[df['race_id'] == race_id]
    if sub_df['betfair_place_sp'].isna().sum() > 0:
        continue
    if sub_df['betfair_win_sp'].isna().sum() > 0:
        continue
    res = simulate_place_counts(sub_df, n_places=3, n_sims=10000, seed=7)

    res = res.rename(columns={
        'horse': 'horse_name'
    })
    res['race_id'] = race_id
    nf = res.merge(
        df,
        on=['horse_name', 'race_id'],
        how='left'
    )
    
    prices.append(
        nf.round(2)
    )

running 0 iteration of 19118 sims
running 1 iteration of 19118 sims
running 2 iteration of 19118 sims
running 3 iteration of 19118 sims
running 4 iteration of 19118 sims
running 5 iteration of 19118 sims
running 6 iteration of 19118 sims
running 7 iteration of 19118 sims
running 8 iteration of 19118 sims
running 9 iteration of 19118 sims
running 10 iteration of 19118 sims
running 11 iteration of 19118 sims
running 12 iteration of 19118 sims
running 13 iteration of 19118 sims
running 14 iteration of 19118 sims
running 15 iteration of 19118 sims
running 16 iteration of 19118 sims
running 17 iteration of 19118 sims
running 18 iteration of 19118 sims
running 19 iteration of 19118 sims
running 20 iteration of 19118 sims
running 21 iteration of 19118 sims
running 22 iteration of 19118 sims
running 23 iteration of 19118 sims
running 24 iteration of 19118 sims
running 25 iteration of 19118 sims
running 26 iteration of 19118 sims
running 27 iteration of 19118 sims
running 28 iteration of 19118 

In [122]:
res['race_id'] = race_id
res.merge(
    df,
    on=['horse_name', 'race_id'],
    how='left'
)

Unnamed: 0,horse_name,win_prob,place_prob_topN,sim_place_sp,race_id,race_date,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran
0,Morshdi,0.4902,0.9706,1.030291,901600,2025-09-17,2.04,1.25,2,5,4.0,Flat,0
1,Esna,0.3408,0.9402,1.063603,901600,2025-09-17,2.96,1.43,1,5,4.0,Flat,3
2,Montague Menace,0.105,0.6756,1.480166,901600,2025-09-17,9.41,3.25,4,5,4.0,Flat,0
3,Shadowmere,0.0429,0.2701,3.702332,901600,2025-09-17,25.0,7.4,3,5,4.0,Flat,3
4,Midnight Bandit,0.0211,0.1435,6.968641,901600,2025-09-17,48.0,11.69,5,5,4.0,Flat,2


In [123]:
sub_df

Unnamed: 0,race_date,race_id,horse_name,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran
152105,2025-09-17,901600,Shadowmere,25.0,7.4,3,5,4.0,Flat,3
152107,2025-09-17,901600,Esna,2.96,1.43,1,5,4.0,Flat,3
152160,2025-09-17,901600,Midnight Bandit,48.0,11.69,5,5,4.0,Flat,2
152312,2025-09-17,901600,Montague Menace,9.41,3.25,4,5,4.0,Flat,0
152320,2025-09-17,901600,Morshdi,2.04,1.25,2,5,4.0,Flat,0


In [125]:
tf

Unnamed: 0,horse_name,win_prob,place_prob_topN,sim_place_sp,race_id,race_date,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran
0,Mubakker,0.34,0.77,1.31,759652,2020-06-27,2.93,1.43,11,11,1.0,Flat,4
1,Major Jumbo,0.15,0.47,2.15,759652,2020-06-27,6.62,2.20,3,11,1.0,Flat,3
2,Brando,0.10,0.34,2.96,759652,2020-06-27,9.99,3.25,2,11,1.0,Flat,3
3,Chiefofchiefs,0.08,0.29,3.43,759652,2020-06-27,11.47,3.50,7,11,1.0,Flat,1
4,Judicial,0.07,0.27,3.74,759652,2020-06-27,13.00,3.19,1,11,1.0,Flat,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Morshdi,0.49,0.97,1.03,901600,2025-09-17,2.04,1.25,2,5,4.0,Flat,0
1,Esna,0.34,0.94,1.06,901600,2025-09-17,2.96,1.43,1,5,4.0,Flat,3
2,Montague Menace,0.10,0.68,1.48,901600,2025-09-17,9.41,3.25,4,5,4.0,Flat,0
3,Shadowmere,0.04,0.27,3.70,901600,2025-09-17,25.00,7.40,3,5,4.0,Flat,3


In [124]:
tf = pd.concat(prices)

In [149]:
# After running your original query
tf = tf[tf['number_of_runners'].isin([5, 8, 9, 10])]

In [150]:
tf

Unnamed: 0,horse_name,win_prob,place_prob_topN,sim_place_sp,race_id,race_date,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran
0,One Master,0.36,0.80,1.25,761933,2020-08-22,2.84,1.42,2,8,1.0,Flat,3
1,Safe Voyage,0.18,0.56,1.80,761933,2020-08-22,5.60,1.59,1,8,1.0,Flat,4
2,San Donato,0.15,0.50,2.01,761933,2020-08-22,6.66,1.96,8,8,1.0,Flat,3
3,Beat Le Bon,0.11,0.39,2.58,761933,2020-08-22,9.17,3.05,5,8,1.0,Flat,2
4,Threat,0.08,0.29,3.45,761933,2020-08-22,12.43,3.55,4,8,1.0,Flat,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Morshdi,0.49,0.97,1.03,901600,2025-09-17,2.04,1.25,2,5,4.0,Flat,0
1,Esna,0.34,0.94,1.06,901600,2025-09-17,2.96,1.43,1,5,4.0,Flat,3
2,Montague Menace,0.10,0.68,1.48,901600,2025-09-17,9.41,3.25,4,5,4.0,Flat,0
3,Shadowmere,0.04,0.27,3.70,901600,2025-09-17,25.00,7.40,3,5,4.0,Flat,3


In [151]:
test = tf[(tf['betfair_win_sp'] > 2 ) & (tf['betfair_win_sp'] <=3 )]

In [152]:
test['actual_place_prob'] = 1 / test['betfair_place_sp']
test['sim_place_prob'] = 1 / test['sim_place_sp']
test['edge'] = test['sim_place_prob']  - test['actual_place_prob'] 
test['bet'] = np.where(test['edge'] > 0, True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['actual_place_prob'] = 1 / test['betfair_place_sp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['sim_place_prob'] = 1 / test['sim_place_sp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['edge'] = test['sim_place_prob']  - test['actual_place_prob']
A value is trying to be set on 

In [153]:
test[test['bet'] == True]

Unnamed: 0,horse_name,win_prob,place_prob_topN,sim_place_sp,race_id,race_date,betfair_win_sp,betfair_place_sp,finishing_position,number_of_runners,race_class,race_type,weeks_since_last_ran,actual_place_prob,sim_place_prob,edge,bet
0,One Master,0.36,0.80,1.25,761933,2020-08-22,2.84,1.42,2,8,1.0,Flat,3,0.704225,0.800000,0.095775,True
0,Falberto,0.41,0.86,1.17,772721,2020-12-30,2.55,1.27,2,9,5.0,Hurdle,4,0.787402,0.854701,0.067299,True
0,Schnabel,0.35,0.79,1.27,761828,2020-08-17,2.84,1.39,1,9,5.0,Chase,1,0.719424,0.787402,0.067977,True
0,Armattiekan,0.43,0.87,1.16,769438,2020-11-20,2.36,1.33,1,10,5.0,Chase,1,0.751880,0.862069,0.110189,True
0,Eskylane,0.42,0.86,1.16,771194,2020-11-08,2.38,1.30,5,8,,Hurdle,4,0.769231,0.862069,0.092838,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Lost Boys,0.45,0.88,1.13,902370,2025-09-27,2.24,1.28,1,9,4.0,Flat,3,0.781250,0.884956,0.103706,True
0,Act Of Kindness,0.41,0.85,1.17,902357,2025-09-27,2.41,1.43,3,10,2.0,Flat,0,0.699301,0.854701,0.155400,True
0,Lost Boys,0.35,0.82,1.21,900739,2025-09-06,2.90,1.72,3,5,2.0,Flat,3,0.581395,0.826446,0.245051,True
0,Morshdi,0.49,0.97,1.03,901600,2025-09-17,2.04,1.25,2,5,4.0,Flat,0,0.800000,0.970874,0.170874,True


In [129]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19919 entries, 0 to 1
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   horse_name            19919 non-null  object 
 1   win_prob              19919 non-null  float64
 2   place_prob_topN       19919 non-null  float64
 3   sim_place_sp          19919 non-null  float64
 4   race_id               19919 non-null  int64  
 5   race_date             19919 non-null  object 
 6   betfair_win_sp        19919 non-null  float64
 7   betfair_place_sp      19919 non-null  float64
 8   finishing_position    19919 non-null  object 
 9   number_of_runners     19919 non-null  int64  
 10  race_class            15939 non-null  float64
 11  race_type             19919 non-null  object 
 12  weeks_since_last_ran  19919 non-null  int64  
 13  actual_place_prob     19919 non-null  float64
 14  sim_place_prob        19919 non-null  float64
 15  edge                  19919 

In [154]:
def calculate_win_place_flags(data: pd.DataFrame) -> pd.DataFrame:
    data["win"] = data["finishing_position"] == "1"
    data["place"] = (
        (data["number_of_runners"] < 8)
        & (data["finishing_position"].isin(["1", "2"]))
    ) | (
        (data["number_of_runners"] >= 8)
        & (data["finishing_position"].isin(["1", "2", "3"]))
    )
    return data

In [155]:
test = calculate_win_place_flags(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["win"] = data["finishing_position"] == "1"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["place"] = (


In [164]:
test = test[(test['bet'] == True) & (test['edge'] >  0.2) ]

In [165]:
test["profit"] = np.where(
    (test["place"] == True) & (test["bet"] == True),
    (test["betfair_place_sp"] - 1) * 0.8,
    -1,
)

In [167]:
test .to_csv('~/Desktop/test.csv', index=False)

In [168]:
test['profit'].sum()

np.float64(-134.816)

In [166]:
test['profit'].sum() / len(test['profit'])

np.float64(-0.05936415675913694)

In [158]:
back_bank = 100
lay_bank = 100

for i in test.itertuples():
    if i.bet:
        size = 100 * i.edge
        
        if i.place:  # Horse places
            # BACK: Win (odds - 1) * size, minus 5% commission on profit
            profit = size * (i.betfair_place_sp - 1)
            back_bank += profit * 0.95  # Keep 95% after commission
            
            # LAY: Lose liability (we pay out the profit to the backer)
            lay_bank -= profit
        
        else:  # Horse loses
            # BACK: Lose stake
            back_bank -= size
            
            # LAY: Win stake, minus 5% commission on profit
            lay_bank += size * 0.95  # Keep 95% after commission

In [159]:
back_bank

-723.5119063203862

In [160]:
lay_bank

-2606.634749415216

In [162]:
back_bank / len(test)

-0.08567340512970825