# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Match table; the loop further down reads winner_name / loser_name and the
# w_* / l_* break-point columns from every row of it.
df = pd.read_csv(filepath_or_buffer='masters&GS_2000-2024.csv')

# Running per-player totals keyed by player name; populated by updateStats().
playerStats = dict()

def updateStats(name, won, bpSaved, bpFaced, oppBpSaved, oppBpFaced):
    """Fold one match into the running totals stored in ``playerStats``.

    Args:
        name: Player the match is credited to (key into ``playerStats``).
        won: Truthy when this player won the match.
        bpSaved: Break points this player saved on serve.
        bpFaced: Break points this player faced on serve.
        oppBpSaved: Break points the opponent saved on serve.
        oppBpFaced: Break points the opponent faced on serve.

    The ``matches`` counter (and ``wins`` when ``won`` is truthy) is always
    updated. A break-point pair is only accumulated when *both* of its values
    are present, so a half-missing record cannot turn a player's totals into
    NaN.
    """
    stats = playerStats.setdefault(name, {
        'matches': 0,
        'wins': 0,
        'bpFacedServe': 0,
        'bpSavedServe': 0,
        'bpOppServe': 0,
        'bpConvertedReturn': 0,
    })

    stats['matches'] += 1
    if won:
        stats['wins'] += 1

    # pd.isna (unlike np.isnan) also copes with None / pd.NA. Checking the
    # "saved" value as well as the "faced" value keeps a lone NaN out of the
    # sums; one NaN would otherwise poison the total for good.
    if not (pd.isna(bpFaced) or pd.isna(bpSaved)):
        stats['bpFacedServe'] += bpFaced
        stats['bpSavedServe'] += bpSaved

    if not (pd.isna(oppBpFaced) or pd.isna(oppBpSaved)):
        stats['bpOppServe'] += oppBpFaced
        # Break points the opponent faced but did not save were converted
        # by this player on return.
        stats['bpConvertedReturn'] += (oppBpFaced - oppBpSaved)

# Credit every match to both participants: once from the winner's point of
# view and once from the loser's, with the serve/return columns swapped.
# itertuples() replaces iterrows() because it does not build a Series per row
# (considerably faster on a large match table) and does not upcast the row's
# values to a common dtype.
for match in df.itertuples(index=False):
    updateStats(match.winner_name, True, match.w_bpSaved, match.w_bpFaced, match.l_bpSaved, match.l_bpFaced)
    updateStats(match.loser_name, False, match.l_bpSaved, match.l_bpFaced, match.w_bpSaved, match.w_bpFaced)

# One row per player, indexed by player name.
playersDf = pd.DataFrame.from_dict(playerStats, orient='index')
# Keep only players with at least 10 recorded matches.
playersDf = playersDf[playersDf['matches'] >= 10].copy()

playersDf['winRate'] = playersDf['wins'] / playersDf['matches']
playersDf['bpSaveRate'] = playersDf['bpSavedServe'] / playersDf['bpFacedServe']
playersDf['bpConvertRate'] = playersDf['bpConvertedReturn'] / playersDf['bpOppServe']
# A zero denominator yields NaN (0/0) above; drop those players so the
# downstream scaler and clustering never see missing values.
playersDf = playersDf.dropna()

# Cluster players on their two break-point rates, standardised so that both
# features contribute on the same scale.
X = playersDf[['bpSaveRate', 'bpConvertRate']]
scaler = StandardScaler()
XScaled = scaler.fit_transform(X)

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit makes the number of restarts (and
# potentially the resulting clusters) depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(XScaled)
playersDf['cluster'] = kmeans.labels_

# Name the clusters by their mean win rate, best first.
clusterMeans = playersDf.groupby('cluster')['winRate'].mean().sort_values(ascending=False)
names = {clusterMeans.index[0]: "Elite Clutch", clusterMeans.index[1]: "Moderate Clutch", clusterMeans.index[2]: "Low Clutch"}
playersDf['clusterName'] = playersDf['cluster'].map(names)

# Euclidean distance (in scaled feature space) from each player to the centre
# of the cluster it was assigned to, computed in one vectorised step.
centers = kmeans.cluster_centers_
dists = np.linalg.norm(XScaled - centers[kmeans.labels_], axis=1)
playersDf['dist'] = dists
# The 8 players farthest from their own cluster centre.
outliers = playersDf.sort_values('dist', ascending=False).head(8)

# Scatter of save rate against conversion rate, one marker per player,
# coloured and shaped by cluster name.
segFig, segAx = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=playersDf,
    x='bpSaveRate',
    y='bpConvertRate',
    hue='clusterName',
    style='clusterName',
    s=150,
    palette='viridis',
    alpha=0.85,
    ax=segAx,
)

# Annotate every player in `outliers` in red, nudged slightly right of the marker.
for name, xPos, yPos in zip(outliers.index, outliers['bpSaveRate'], outliers['bpConvertRate']):
    segAx.text(xPos + 0.002, yPos, name, fontsize=9, fontweight='bold', color='#c0392b')

# Annotate a fixed list of well-known players in black, skipping anyone who is
# absent from the table or already labelled as an outlier.
famous = ['Novak Djokovic', 'Rafael Nadal', 'Roger Federer', 'Daniil Medvedev', 'Carlos Alcaraz']
for name in famous:
    if name not in playersDf.index or name in outliers.index:
        continue
    xPos = playersDf.loc[name, 'bpSaveRate']
    yPos = playersDf.loc[name, 'bpConvertRate']
    segAx.text(xPos + 0.002, yPos, name, fontsize=10, fontweight='bold', color='black')

segAx.set_title('Player Segmentation: Clutch Profiles & Outliers', fontsize=16)
segAx.set_xlabel('Break Point Save Rate (Defense)', fontsize=12)
segAx.set_ylabel('Break Point Conversion Rate (Offense)', fontsize=12)
segAx.legend(loc='upper right')
segAx.grid(True, linestyle='--', alpha=0.5)
# Write the image before show().
segFig.savefig('clustering_plot_clean.png')
plt.show()

# Composite score: the plain average of the save rate and the conversion rate.
playersDf['clutchScore'] = playersDf['bpSaveRate'].add(playersDf['bpConvertRate']).div(2)
# Correlation between the composite score and win rate, shown in the legend.
corr = playersDf['clutchScore'].corr(playersDf['winRate'])

regFig, regAx = plt.subplots(figsize=(10, 8))
pointStyle = {'s': 100, 'alpha': 0.6, 'color': '#2980b9', 'edgecolor':'w'}
trendStyle = {'color': '#e74c3c', 'linewidth': 3, 'label': f'Trend Line (Corr: {corr:.2f})'}
sns.regplot(
    data=playersDf,
    x='clutchScore',
    y='winRate',
    scatter_kws=pointStyle,
    line_kws=trendStyle,
    ax=regAx,
)

regAx.set_title('Impact Analysis: Clutch Score vs Career Win Rate', fontsize=16)
regAx.set_xlabel('Composite Clutch Score', fontsize=12)
regAx.set_ylabel('Career Win Rate', fontsize=12)
# The legend entry comes from the label attached to the regression line.
regAx.legend()
regAx.grid(True, linestyle='--', alpha=0.5)
# Write the image before show().
regFig.savefig('regression_plot_clean.png')
plt.show()