In [None]:
import pandas as pd
import os
# Download and unzip the Kaggle dataset into the working directory.
# os.system returns the command's exit status; the original ignored it, so a
# failed download silently fell through to whatever CSV happened to be on disk.
CSV_PATH = "live_atp_tennis.csv"
download_status = os.system("kaggle datasets download -d dissfya/atp-tennis-2000-2023daily-pull --unzip")
if download_status != 0:
    if not os.path.exists(CSV_PATH):
        raise RuntimeError(
            f"kaggle download failed (exit status {download_status}) and {CSV_PATH} is not present"
        )
    print(f"Warning: kaggle download failed (exit status {download_status}); using existing {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

# Positional offset of the first row to keep. Judging by the head() output of
# this notebook, the retained rows start at 2010-01-04 -- TODO confirm the
# offset still matches after each daily refresh of the dataset.
FIRST_ROW_TO_KEEP = 27526
df = df.iloc[FIRST_ROW_TO_KEEP:].reset_index(drop=True)

#print(df.head())
#print(df.columns)

               Tournament        Date  Series    Court Surface      Round  \
0  Brisbane International  2010-01-04  ATP250  Outdoor    Hard  1st Round   
1  Brisbane International  2010-01-04  ATP250  Outdoor    Hard  1st Round   
2  Brisbane International  2010-01-04  ATP250  Outdoor    Hard  1st Round   
3  Brisbane International  2010-01-04  ATP250  Outdoor    Hard  1st Round   
4  Brisbane International  2010-01-04  ATP250  Outdoor    Hard  1st Round   

   Best of       Player_1    Player_2       Winner  Rank_1  Rank_2  Pts_1  \
0        3     Odesnik W.  Clement A.   Odesnik W.     105      63    521   
1        3  Petzschner P.  Gicquel M.   Gicquel M.      80      58    587   
2        3       Falla A.   Chardy J.     Falla A.      81      32    587   
3        3      Llodra M.     Levy H.      Levy H.      67     119    649   
4        3    Bellucci T.  Chela J.I.  Bellucci T.      36      73   1021   

   Pts_2  Odd_1  Odd_2        Score  
0    667   2.25   1.57      6-4 7-6 

In [8]:
# Keep only matches where every field used later is present...
required_columns = ['Player_1', 'Player_2', 'Winner', 'Score', 'Rank_1', 'Rank_2', 'Surface']
df = df.dropna(subset=required_columns)

# ...and where both players have a strictly positive ranking.
both_ranked = df['Rank_1'].gt(0) & df['Rank_2'].gt(0)
df = df[both_ranked]


In [None]:
# label: 1 when Player_1 won the match, 0 otherwise.
# rank_diff: Rank_2 - Rank_1.
#   positive -> Player_1 is the higher-ranked player (smaller rank number)
#   negative -> Player_2 is the higher-ranked player
# assign() returns a new frame instead of writing columns into a frame that
# was produced by boolean filtering, which avoids SettingWithCopyWarning.
df = df.assign(
    label=(df['Winner'] == df['Player_1']).astype(int),
    rank_diff=df['Rank_2'] - df['Rank_1'],
)

In [None]:
# Track each player's previous matches using a loop
from collections import defaultdict
from datetime import datetime
# Parse the Date column into real timestamps. assign() builds a new frame
# rather than writing into a filtered one (avoids SettingWithCopyWarning).
df = df.assign(Date=pd.to_datetime(df['Date']))

# Sort chronologically. The data should already be in date order, but make
# sure. A stable algorithm is requested explicitly: the default quicksort may
# reorder rows that share the same date, which would scramble the original
# within-day order before histories are built.
df = df.sort_values('Date', kind='mergesort')

# Per-player history store. The defaultdict factory gives every new player an
# empty 'matches' list, so the loop never has to check whether a player is
# already in the dictionary.
player_stats = defaultdict(lambda: {'matches': []})



In [None]:
# Start from an empty history so that re-running this cell does not append
# every match a second time.
player_stats.clear()

# Loop through each match row in the DataFrame and record it in the history
# of both participants.
for idx, row in df.iterrows():
    winner = row['Winner']
    player_1, player_2 = row['Player_1'], row['Player_2']

    # The printed columns of the dataset contain no 'Loser' column, so the
    # loser is derived as whichever of the two players is not the winner.
    if winner == player_1:
        loser = player_2
    elif winner == player_2:
        loser = player_1
    else:
        # Winner matches neither player (inconsistent row): nothing reliable
        # to record.
        continue

    # Fields shared by both players' entries. 'surface' and 'tournament' are
    # stored because calculate_score filters history on them.
    shared = {
        'date': row['Date'],
        'surface': row['Surface'],
        'tournament': row['Tournament'],
        'row_index': idx,  # keeps a link back to the source row
    }

    player_stats[winner]['matches'].append({'opponent': loser, 'result': 'win', **shared})
    player_stats[loser]['matches'].append({'opponent': winner, 'result': 'loss', **shared})

In [None]:
def get_decay_weight(match_date, current_date, half_life_days=180):
    """Return an exponential-decay weight for a match of a given age.

    The age is the whole-day difference ``current_date - match_date``; the
    weight halves every ``half_life_days`` days (1.0 for a same-day match).
    """
    elapsed = current_date - match_date
    half_lives_elapsed = elapsed.days / half_life_days
    return pow(0.5, half_lives_elapsed)

In [None]:
def calculate_score(p1, p2, row):
    """Score both players of a match using only matches played before it.

    Parameters
    ----------
    p1, p2 : hashable
        Player identifiers; used as keys of ``player_stats`` and of the
        returned dict.
    row : mapping (e.g. a DataFrame row)
        Must provide 'Date', 'Tournament', 'Surface', 'Rank_1' and 'Rank_2'.

    Returns
    -------
    dict
        ``{p1: score, p2: score}``. Points are awarded as follows:
        +10 for the better (numerically lower) rank,
        +5 * decay per earlier head-to-head win,
        +2 * decay per win among the player's last five earlier matches,
        +5 for at least one earlier win on the same surface,
        +10 for at least one earlier win at the same tournament.

    Only history entries dated strictly before ``row['Date']`` are used.
    Scoring from the full history would include the match being predicted
    (and later ones), leaking the result into the score.
    """
    current_date = row["Date"]
    tournament = row["Tournament"]
    surface = row["Surface"]

    score = {p1: 0, p2: 0}

    # Rank difference: a smaller rank number is the better rank.
    r1, r2 = row['Rank_1'], row['Rank_2']
    if pd.notna(r1) and pd.notna(r2):
        if r1 < r2:
            score[p1] += 10
        elif r2 < r1:
            score[p2] += 10

    # Collect each player's earlier matches once. .get() is used so that
    # looking up an unknown player does not insert an entry into player_stats.
    history = {}
    for player in (p1, p2):
        record = player_stats.get(player)
        all_matches = record['matches'] if record else []
        history[player] = [m for m in all_matches if m['date'] < current_date]

    # Previous matchups (with decay)
    for player, opponent in ((p1, p2), (p2, p1)):
        for m in history[player]:
            if m['opponent'] == opponent and m['result'] == 'win':
                score[player] += 5 * get_decay_weight(m['date'], current_date)

    # Recent form: decayed wins among the last five earlier matches
    for player in (p1, p2):
        for m in history[player][-5:]:
            if m['result'] == 'win':
                score[player] += 2 * get_decay_weight(m['date'], current_date)

    # Surface performance and tournament history are flat bonuses for having
    # at least one earlier win. m.get() tolerates history entries that do not
    # carry 'surface' / 'tournament' keys (they simply earn no bonus).
    for player in (p1, p2):
        wins = [m for m in history[player] if m['result'] == 'win']
        if any(m.get('surface') == surface for m in wins):
            score[player] += 5
        if any(m.get('tournament') == tournament for m in wins):
            score[player] += 10

    return score

In [None]:
def _has_earlier_match(player, before):
    """Return True if `player` has a recorded match dated strictly before `before`."""
    record = player_stats.get(player)  # .get(): do not create entries for unknown players
    if not record:
        return False
    return any(m['date'] < before for m in record['matches'])


correct = 0
total = 0
predictions = []

for idx, row in df.iterrows():
    p1 = row['Player_1']
    p2 = row['Player_2']
    winner = row['Winner']

    # Need match history for both players. Membership in player_stats is not
    # enough: it is built from the whole frame, so every player is in it.
    if not (_has_earlier_match(p1, row['Date']) and _has_earlier_match(p2, row['Date'])):
        continue

    scores = calculate_score(p1, p2, row)
    # On a tie max() returns the first maximal key, i.e. p1.
    predicted = max(scores, key=scores.get)
    confidence = abs(scores[p1] - scores[p2])

    predictions.append({
        'index': idx,
        'p1': p1,
        'p2': p2,
        'winner': winner,
        'predicted': predicted,
        'confidence': confidence
    })

    if predicted == winner:
        correct += 1
    total += 1

# Guard the ratio: with no scorable match, correct/total would raise
# ZeroDivisionError.
if total:
    print(f"Prediction Accuracy: {correct}/{total} = {correct/total:.2%}")
else:
    print("Prediction Accuracy: no matches with prior history for both players")

In [None]:
# Rank every prediction from most to least confident and keep the top five.
ranked_by_confidence = sorted(predictions, key=lambda entry: entry['confidence'], reverse=True)
parlay_slip = ranked_by_confidence[:5]

# One line per selected match.
for pick in parlay_slip:
    matchup = f"{pick['p1']} vs {pick['p2']}"
    print(f"{matchup} → Predicted: {pick['predicted']} (Confidence: {pick['confidence']:.2f})")

In [None]:
# Number of most-confident predictions to put on the slip.
top_n = 5

# Sort a copy in place (descending confidence) so `predictions` keeps its order.
predictions_sorted = list(predictions)
predictions_sorted.sort(key=lambda entry: entry['confidence'], reverse=True)

parlay_slip = predictions_sorted[:top_n]

In [None]:
# Header, then one line per match on the slip.
print("\n Recommended Parlay Slip:")
for pick in parlay_slip:
    matchup = f"{pick['p1']} vs {pick['p2']}"
    print(f"{matchup} → Predicted: {pick['predicted']} | Confidence: {pick['confidence']:.2f}")