In [54]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Seed NumPy's global random number generator so that any code drawing from
# np.random is repeatable across runs.
np.random.seed(42)

In [14]:
# Notebook-wide plotting defaults: matplotlib style sheet, seaborn colour
# palette, and inline rendering of figures in the cell output.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")
%matplotlib inline

In [None]:
# Roland-Garros singles champions by year.
# The three lists are positionally aligned: index i of each winner list belongs
# to historical_winners['year'][i]. The last entry (2025) is the edition this
# notebook is about, so its winner is the placeholder 'TBD'.
historical_winners = {
    'year': list(range(2000, 2026)),
    'men_winner': [
        'Gustavo Kuerten', 'Gustavo Kuerten', 'Albert Costa', 'Juan Carlos Ferrero',  # 2000-2003
        'Gaston Gaudio', 'Rafael Nadal', 'Rafael Nadal', 'Rafael Nadal',              # 2004-2007
        'Rafael Nadal', 'Roger Federer', 'Rafael Nadal', 'Rafael Nadal',              # 2008-2011
        'Rafael Nadal', 'Rafael Nadal', 'Rafael Nadal', 'Stanislas Wawrinka',         # 2012-2015
        'Novak Djokovic', 'Rafael Nadal', 'Rafael Nadal', 'Rafael Nadal',             # 2016-2019
        'Rafael Nadal', 'Novak Djokovic', 'Rafael Nadal', 'Novak Djokovic',           # 2020-2023
        'Carlos Alcaraz', 'TBD',                                                      # 2024-2025
    ],
    'women_winner': [
        'Mary Pierce', 'Jennifer Capriati', 'Serena Williams', 'Justine Henin-Hardenne',       # 2000-2003
        'Anastasia Myskina', 'Justine Henin-Hardenne', 'Justine Henin-Hardenne',               # 2004-2006
        'Justine Henin', 'Ana Ivanovic', 'Svetlana Kuznetsova', 'Francesca Schiavone',         # 2007-2010
        'Li Na', 'Maria Sharapova', 'Serena Williams', 'Maria Sharapova',                      # 2011-2014
        'Serena Williams', 'Garbine Muguruza', 'Jelena Ostapenko', 'Simona Halep',             # 2015-2018
        'Ashleigh Barty', 'Iga Swiatek', 'Barbora Krejcikova', 'Iga Swiatek',                  # 2019-2022
        'Iga Swiatek', 'Iga Swiatek', 'TBD',                                                   # 2023-2025
    ],
}

In [24]:
# Per-player summary of the 2025 draw so far, keyed first by draw
# ('men_results' / 'women_results') and then by player name.
# Each record holds the round-by-round score strings plus hand-entered totals
# and 0-1 form/dominance ratings; some records carry an extra boolean flag.
# NOTE(review): the set/game totals are typed in, not derived from the score
# strings (e.g. Sinner's listed scorelines add up to 40 games lost, not 35) -
# confirm against the source data before relying on them.
tournament_results_2025 = {
    'men_results': {
        'Jannik Sinner': dict(
            path_to_sf=[
                'R1: 6-3, 6-2, 6-1',
                'R2: 6-4, 6-2, 6-3',
                'R3: 6-1, 6-2, 6-4',
                'R4: 6-2, 6-4, 6-3',
                'QF: 6-4, 6-3, 6-2 (vs Bublik)',
            ],
            sets_won=15, sets_lost=0,
            games_won=90, games_lost=35,
            tournament_form=0.95, dominance_score=0.98,
        ),
        'Carlos Alcaraz': dict(
            path_to_sf=[
                'R1: 6-4, 6-2, 6-3',
                'R2: 6-3, 6-4, 6-2',
                'R3: 6-2, 6-4, 6-1',
                'R4: 6-3, 6-4, 6-2',
                'QF: 6-4, 6-3, 6-2',
            ],
            sets_won=15, sets_lost=0,
            games_won=88, games_lost=42,
            tournament_form=0.92, dominance_score=0.95,
        ),
        'Novak Djokovic': dict(
            path_to_sf=[
                'R1: 6-4, 6-2, 6-3',
                'R2: 6-3, 6-4, 6-4',
                'R3: 6-2, 6-3, 6-4',
                'R4: 6-3, 6-4, 6-2',
                'QF: 6-4, 6-3, 6-4',
            ],
            sets_won=15, sets_lost=0,
            games_won=85, games_lost=48,
            tournament_form=0.88, dominance_score=0.90,
        ),
        'Lorenzo Musetti': dict(
            path_to_sf=[
                'R1: 6-4, 4-6, 6-3, 6-2',
                'R2: 6-3, 6-4, 6-2',
                'R3: 6-4, 6-3, 6-4',
                'R4: 6-2, 6-4, 6-3',
                'QF: 6-4, 6-3, 6-2',
            ],
            sets_won=14, sets_lost=1,
            games_won=82, games_lost=52,
            tournament_form=0.85, dominance_score=0.82,
        ),
    },
    'women_results': {
        # Finalists use the key 'path_to_final'; beaten semifinalists use 'path_to_sf'.
        'Aryna Sabalenka': dict(
            path_to_final=[
                'R1: 6-2, 6-3',
                'R2: 6-4, 6-2',
                'R3: 6-3, 6-4',
                'R4: 6-2, 6-4',
                'QF: 7-6(7), 6-3 (vs Zheng)',
                'SF: 7-6(1), 4-6, 6-0 (vs Swiatek)',
            ],
            sets_won=11, sets_lost=1,
            games_won=75, games_lost=42,
            tournament_form=0.96, dominance_score=0.92,
            beat_swiatek=True,
        ),
        'Coco Gauff': dict(
            path_to_final=[
                'R1: 6-1, 6-2',
                'R2: 6-3, 6-4',
                'R3: 6-2, 6-1',
                'R4: 6-4, 6-3',
                'QF: 6-4, 6-2 (vs Keys)',
                'SF: 6-1, 6-2 (vs Boisson)',
            ],
            sets_won=12, sets_lost=0,
            games_won=70, games_lost=28,
            tournament_form=0.94, dominance_score=0.95,
            perfect_tournament=True,
        ),
        'Iga Swiatek': dict(
            path_to_sf=[
                'R1: 6-1, 6-2',
                'R2: 6-3, 6-2',
                'R3: 6-2, 6-4',
                'R4: 6-1, 6-2',
                'QF: 6-1, 7-5 (vs Svitolina)',
                'SF: 6-7(1), 6-4, 0-6 (LOST to Sabalenka)',
            ],
            sets_won=9, sets_lost=3,
            games_won=65, games_lost=41,
            tournament_form=0.82, dominance_score=0.75,
            upset_loss=True,
        ),
        'Lois Boisson': dict(
            path_to_sf=[
                'R1: 6-3, 6-4',
                'R2: 6-2, 6-4',
                'R3: 6-4, 6-3',
                'R4: 6-2, 6-3',
                'QF: 7-6(8), 6-3 (vs Andreeva)',
                'SF: 1-6, 2-6 (LOST to Gauff)',
            ],
            sets_won=10, sets_lost=2,
            games_won=60, games_lost=42,
            tournament_form=0.88, dominance_score=0.78,
            french_wildcard=True,
        ),
    },
}

In [36]:
# Hand-entered player profiles, keyed by draw ('men' / 'women') and player name.
# create_features() reads the keys shared by every record plus the optional
# 'beat_swiatek', 'perfect_tournament' and 'final_experience'; the remaining
# extras (h2h_vs_*, defending_champion, experience_factor, ...) are not read by it.
# The h2h_vs_* values are written as fractions - presumably wins / meetings; confirm.
current_players = {
    'men': {
        'Jannik Sinner': dict(
            current_ranking=1, age=23,
            clay_titles=2, grand_slam_titles=3, rg_best_result='SF',
            recent_form=0.85, h2h_vs_top10=0.72, clay_win_percentage=0.78,
            years_pro=7, fitness_score=0.92,
            tournament_performance=0.98, sets_dropped=0, momentum=0.95,
            h2h_vs_djokovic=3/7,
        ),
        'Carlos Alcaraz': dict(
            current_ranking=2, age=22,
            clay_titles=5, grand_slam_titles=4, rg_best_result='W',
            recent_form=0.88, h2h_vs_top10=0.75, clay_win_percentage=0.85,
            years_pro=6, fitness_score=0.95,
            tournament_performance=0.95, sets_dropped=0, momentum=0.92,
            defending_champion=True, h2h_vs_musetti=5/6,
        ),
        'Novak Djokovic': dict(
            current_ranking=6, age=38,
            clay_titles=18, grand_slam_titles=24, rg_best_result='W',
            recent_form=0.75, h2h_vs_top10=0.68, clay_win_percentage=0.82,
            years_pro=20, fitness_score=0.88,
            tournament_performance=0.90, sets_dropped=0, momentum=0.85,
            experience_factor=0.98, h2h_vs_sinner=4/7,
        ),
        'Lorenzo Musetti': dict(
            current_ranking=8, age=22,
            clay_titles=1, grand_slam_titles=0, rg_best_result='SF',
            recent_form=0.80, h2h_vs_top10=0.45, clay_win_percentage=0.72,
            years_pro=5, fitness_score=0.85,
            tournament_performance=0.82, sets_dropped=1, momentum=0.80,
            breakthrough_factor=0.85, h2h_vs_alcaraz=1/6,
        ),
    },
    'women': {
        'Aryna Sabalenka': dict(
            current_ranking=1, age=26,
            clay_titles=1, grand_slam_titles=3, rg_best_result='F',
            recent_form=0.85, h2h_vs_top10=0.70, clay_win_percentage=0.68,
            years_pro=9, fitness_score=0.88,
            tournament_performance=0.96, sets_dropped=1, momentum=0.98,
            beat_swiatek=True, final_experience=0.90, h2h_vs_gauff=5/10,
        ),
        'Coco Gauff': dict(
            current_ranking=2, age=21,
            clay_titles=0, grand_slam_titles=1, rg_best_result='F',
            recent_form=0.88, h2h_vs_top10=0.65, clay_win_percentage=0.70,
            years_pro=6, fitness_score=0.92,
            tournament_performance=0.94, sets_dropped=0, momentum=0.95,
            perfect_tournament=True, us_open_champion=True, h2h_vs_sabalenka=5/10,
        ),
    },
}
current_players

{'men': {'Jannik Sinner': {'current_ranking': 1,
   'age': 23,
   'clay_titles': 2,
   'grand_slam_titles': 3,
   'rg_best_result': 'SF',
   'recent_form': 0.85,
   'h2h_vs_top10': 0.72,
   'clay_win_percentage': 0.78,
   'years_pro': 7,
   'fitness_score': 0.92,
   'tournament_performance': 0.98,
   'sets_dropped': 0,
   'momentum': 0.95,
   'h2h_vs_djokovic': 0.42857142857142855},
  'Carlos Alcaraz': {'current_ranking': 2,
   'age': 22,
   'clay_titles': 5,
   'grand_slam_titles': 4,
   'rg_best_result': 'W',
   'recent_form': 0.88,
   'h2h_vs_top10': 0.75,
   'clay_win_percentage': 0.85,
   'years_pro': 6,
   'fitness_score': 0.95,
   'tournament_performance': 0.95,
   'sets_dropped': 0,
   'momentum': 0.92,
   'defending_champion': True,
   'h2h_vs_musetti': 0.8333333333333334},
  'Novak Djokovic': {'current_ranking': 6,
   'age': 38,
   'clay_titles': 18,
   'grand_slam_titles': 24,
   'rg_best_result': 'W',
   'recent_form': 0.75,
   'h2h_vs_top10': 0.68,
   'clay_win_percentage'

In [44]:
def create_features(player_data, gender):
    """Turn a ``{player_name: stats}`` mapping into a numeric feature matrix.

    Parameters
    ----------
    player_data : dict
        Maps each player name to a dict of stats. Required keys:
        ``current_ranking``, ``age``, ``clay_titles``, ``grand_slam_titles``,
        ``rg_best_result``, ``recent_form``, ``h2h_vs_top10``,
        ``clay_win_percentage``, ``years_pro`` and ``fitness_score``.
        Optional keys (with the fallback used when absent):
        ``tournament_performance`` (0.5), ``sets_dropped`` (5), ``momentum`` (0.5)
        and, for the women's draw only, ``beat_swiatek``, ``perfect_tournament``
        and ``final_experience`` (all 0).
    gender : str
        ``'women'`` enables the three women-specific columns; any other value
        fills those columns with zeros so both draws share one column layout.

    Returns
    -------
    (numpy.ndarray, list)
        A ``(n_players, 19)`` array and the player names in matching row order.
        Empty input yields an array of shape ``(0, 19)`` and an empty list.

    Raises
    ------
    KeyError
        If a required stat is missing from a player's record.
    """
    # Best Roland-Garros result mapped to a score; anything else scores 0.25.
    rg_result_scores = {'W': 1, 'F': 0.75, 'SF': 0.5}
    n_columns = 19  # 16 shared columns + 3 draw-specific columns

    features = []
    names = []

    for player, stats in player_data.items():
        ranking = stats['current_ranking']
        clay_titles = stats['clay_titles']
        slam_titles = stats['grand_slam_titles']
        clay_win_pct = stats['clay_win_percentage']
        years_pro = stats['years_pro']

        feature_vector = [
            # Raw profile values
            ranking,
            stats['age'],
            clay_titles,
            slam_titles,
            rg_result_scores.get(stats['rg_best_result'], 0.25),
            stats['recent_form'],
            stats['h2h_vs_top10'],
            clay_win_pct,
            years_pro,
            stats['fitness_score'],
            # Derived values. Both ratios clamp the denominator at 1 so that a
            # ranking or career length of 0 cannot raise ZeroDivisionError.
            1 / max(1, ranking),
            clay_titles / max(1, years_pro),
            slam_titles * clay_win_pct,
            # Current-tournament values
            stats.get('tournament_performance', 0.5),
            1 - (stats.get('sets_dropped', 5) / 15),
            stats.get('momentum', 0.5),
        ]

        if gender == 'women':
            feature_vector.extend([
                stats.get('beat_swiatek', 0) * 0.2,
                stats.get('perfect_tournament', 0) * 0.15,
                stats.get('final_experience', 0) * 0.1,
            ])
        else:
            # Zero padding keeps the column count identical across draws.
            feature_vector.extend([0, 0, 0])

        features.append(feature_vector)
        names.append(player)

    if not features:
        # np.array([]) would be 1-D; keep the 2-D (rows, columns) contract.
        return np.empty((0, n_columns)), names

    return np.array(features), names

# One feature matrix (plus the matching list of player names) per draw.
men_features, men_names = create_features(player_data=current_players['men'], gender='men')
women_features, women_names = create_features(player_data=current_players['women'], gender='women')

In [62]:
def make_synthetic_data(n_samples=1000, n_features=20, n_informative=15, random_state=42):
    """Generate a synthetic binary-classification dataset.

    Thin wrapper around ``sklearn.datasets.make_classification``. With the
    default arguments it produces 20 columns (15 informative, 5 redundant)
    with a fixed seed.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Total number of columns.
    n_informative : int
        Number of informative columns; every remaining column is generated as
        a redundant (linear-combination) column.
    random_state : int or None
        Seed forwarded to ``make_classification``.

    Returns
    -------
    (X, y)
        Whatever ``make_classification`` returns for these arguments.

    Raises
    ------
    ValueError
        If ``n_informative`` is not in the range ``1..n_features``.
    """
    from sklearn.datasets import make_classification

    if not 0 < n_informative <= n_features:
        raise ValueError(
            f"n_informative must be between 1 and n_features={n_features}, got {n_informative}"
        )

    return make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_features - n_informative,
        n_classes=2,
        random_state=random_state,
    )

# Generate the synthetic training set.
X, y = make_synthetic_data(n_samples=5000)

# Hold out 20% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scale -> keep the 19 best-scoring columns -> soft-voting ensemble of a random
# forest and a gradient-boosting model.
# NOTE(review): k=19 equals the number of columns create_features() builds per
# player, while the synthetic data has 20 columns - confirm this is intended.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=19)),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=200, max_depth=7, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, random_state=42)),
        ],
        voting='soft',
    )),
])

# 5-fold cross-validated ROC AUC on the training split.
cv_scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=5)
print(f"Cross-validated AUC: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

# Fit on the full training split, then score the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict_proba(X_test)[:, 1]  # probability of the positive class
print(f"Test AUC: {roc_auc_score(y_test, y_pred):.3f}")

Cross-validated AUC: 0.974 (±0.008)
Test AUC: 0.978


In [64]:
def predict_with_confidence(model, features, names):
    """Score players with every base estimator of the ensemble and summarise the spread.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``steps`` and ``named_steps`` and have a final step named
        ``'classifier'`` with a fitted ``estimators_`` list (e.g. a
        ``VotingClassifier``).
    features : array-like of shape (n_players, n_columns)
        One row per player.
    names : sequence
        Player labels, one per row of ``features``.

    Returns
    -------
    pandas.DataFrame
        Columns ``player``, ``win_probability`` (mean positive-class
        probability across the base estimators) and ``confidence_interval``,
        sorted by ``win_probability`` in descending order.

    Notes
    -----
    * The base estimators are fitted on the output of the pipeline's earlier
      steps, so the rows are passed through those fitted steps first. This
      happens when ``features`` has the column count the pipeline was fitted
      on (or when that count is unknown). Otherwise the rows cannot be
      pipeline inputs; they are handed to the base estimators unchanged, as
      this function historically did, and a ``RuntimeWarning`` is emitted.
    * ``confidence_interval`` is 1.96 times the standard deviation of the
      base estimators' probabilities. With only a handful of estimators this
      measures how much they disagree; it is not a calibrated 95% interval.
    """
    ensemble = model.named_steps['classifier']
    model_inputs = features

    expected_width = getattr(model, 'n_features_in_', None)
    width_matches = (
        np.ndim(features) == 2 and np.shape(features)[1] == expected_width
    )

    if expected_width is None or width_matches:
        # Replay the fitted preprocessing steps in order (everything before
        # the classifier), skipping None / 'passthrough' placeholders.
        for _, step in model.steps[:-1]:
            if step is None or isinstance(step, str):
                continue
            model_inputs = step.transform(model_inputs)
    else:
        warnings.warn(
            f"features have shape {np.shape(features)} but the pipeline was fitted on "
            f"{expected_width} columns; skipping the preprocessing steps and passing "
            "the rows to the base estimators unchanged",
            RuntimeWarning,
            stacklevel=2,
        )

    # Positive-class probability from each base estimator: (n_estimators, n_players).
    probas = [
        estimator.predict_proba(model_inputs)[:, 1]
        for estimator in ensemble.estimators_
    ]

    mean_proba = np.mean(probas, axis=0)
    std_proba = np.std(probas, axis=0)

    results = pd.DataFrame({
        'player': names,
        'win_probability': mean_proba,
        'confidence_interval': std_proba * 1.96,  # spread across estimators, see Notes
    }).sort_values('win_probability', ascending=False)

    return results

# Score both draws with the fitted pipeline.
men_results = predict_with_confidence(model=pipeline, features=men_features, names=men_names)
women_results = predict_with_confidence(model=pipeline, features=women_features, names=women_names)

# Print each table under its heading, without the DataFrame index.
for heading, table in (
    ("\nMEN'S SEMIFINAL PREDICTIONS:", men_results),
    ("\nWOMEN'S FINAL PREDICTION:", women_results),
):
    print(heading)
    print(table.to_string(index=False))


MEN'S SEMIFINAL PREDICTIONS:
         player  win_probability  confidence_interval
Lorenzo Musetti         0.545168             0.108447
  Jannik Sinner         0.528393             0.026465
 Novak Djokovic         0.481081             0.100291
 Carlos Alcaraz         0.475672             0.130461

WOMEN'S FINAL PREDICTION:
         player  win_probability  confidence_interval
     Coco Gauff         0.720037             0.151486
Aryna Sabalenka         0.596277             0.082732
