<a href="https://colab.research.google.com/github/yehyifan/About_me/blob/main/NBA_Rookie_Retention_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBA Rookie Retention Prediction  
Developed a predictive analysis pipeline using rookie season statistics to estimate NBA player retention after five years. Performed data cleaning, exploratory analysis, and feature selection using **SelectKBest** and **RFE** to uncover key performance indicators. Built classification models with **Logistic Regression** and evaluated them using **precision-recall analysis** and **threshold tuning**. Visualised feature importance and model results with **Plotly** for clear, interactive insights into early-career success factors.


## Module Import and Data Loading

In [1]:
import numpy as np
import string
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm import tqdm
from urllib import request
# Source URL(s) and the local file name each one is saved under (paired by position).
module_url = ["https://drive.google.com/uc?export=view&id=1DGhCMqrxRy_oRZDR7cy2OFDqjI_OHydR"]
name = ['nba.csv']
for url, local_path in zip(module_url, name):
    # The payload is decoded as ISO-8859-1 and re-encoded explicitly as UTF-8 so the
    # file matches pandas' default read encoding on every platform (the platform
    # default encoding is not UTF-8 everywhere). newline='' keeps the original line
    # endings instead of letting text mode translate them.
    with request.urlopen(url) as response, \
            open(local_path, 'w', encoding='utf-8', newline='') as outf:
        outf.write(response.read().decode('ISO-8859-1'))

## Regression Analysis & Visualisation

### 1. Load Data

In [2]:
# Load the CSV written by the download cell into a DataFrame, then print a
# plain-text preview of its first rows.
df = pd.read_csv('nba.csv')
print(df.head())

              name  games_played  minutes_played  points_scored  goals_scored  \
0   Brandon Ingram          36.0            27.4            7.4           2.6   
1  Andrew Harrison          35.0            26.9            7.2           2.0   
2   JaKarr Sampson          74.0            15.3            5.2           2.0   
3      Malik Sealy          58.0            11.6            5.7           2.3   
4      Matt Geiger          48.0            11.5            4.5           1.6   

   goals_attempted  3_point_scored  3_point_attempted  free_throws_scored  \
0              7.6             0.5                2.1                 1.6   
1              6.7             0.7                2.8                 2.6   
2              4.7             0.4                1.7                 0.9   
3              5.5             0.1                0.5                 0.9   
4              3.0             0.0                0.1                 1.3   

   free_throws_attempted  offensive_rebounds  defe

### 2. Data Preprocessing

2.1 Handle Missing Values

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# A row without the '5yrs' label cannot be used for supervised learning: keep labelled rows only.
labelled_rows = df['5yrs'].notna()
df = df[labelled_rows].copy()

# Every column except the player name and the label is treated as a numeric feature.
numerical_cols = df.columns.drop(['name', '5yrs'])

# Replace remaining gaps in the feature columns with the column mean.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[numerical_cols])
df[numerical_cols] = imputer.transform(df[numerical_cols])

2.2 Standardise Numerical Features

In [4]:
# Standardise every feature column (zero mean, unit variance), overwriting it in df.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

print(df.head())

              name  games_played  minutes_played  points_scored  goals_scored  \
0   Brandon Ingram     -1.519189        1.124828       0.081639     -0.069180   
1  Andrew Harrison     -1.579021        1.065452       0.038162     -0.405243   
2   JaKarr Sampson      0.754431       -0.312066      -0.396606     -0.405243   
3      Malik Sealy     -0.202883       -0.751447      -0.287914     -0.237211   
4      Matt Geiger     -0.801204       -0.763323      -0.548775     -0.629285   

   goals_attempted  3_point_scored  3_point_attempted  free_throws_scored  \
0         0.367169        0.609371           1.233772            0.212367   
1         0.131245        1.171516           1.941910            1.136649   
2        -0.393030        0.328298           0.829122           -0.434631   
3        -0.183320       -0.514919          -0.384828           -0.434631   
4        -0.838664       -0.795992          -0.789479           -0.064918   

   free_throws_attempted  offensive_rebounds  defe

### 3. Feature Selection

3.1 SelectKBest and RFE

In [5]:
# Target as integers; feature matrix is everything except the name and the label.
y = df['5yrs'].astype(int)
X = df.drop(columns=['name', '5yrs'])
features = X.columns
X_scaled = X.copy()  # df's feature columns were already standardised in the scaling cell

# NOTE(review): both selectors are fitted on the full dataset, i.e. before the
# train/test split performed in the prediction cells — confirm this is intended.

# 1. SelectKBest (top 50% F-score)
# Only the fitted scores_ are read; the selector's own k is not used for the cut.
skb = SelectKBest(score_func=f_classif)
skb.fit(X_scaled, y)
skb_scores = skb.scores_
top_half_count = len(features) // 2
ranking_desc = np.argsort(skb_scores)[::-1]
skb_top = set(features[ranking_desc[:top_half_count]])

# 2. RFE (top 5)
rfe_estimator = LogisticRegression(solver='liblinear', max_iter=200, random_state=42)
rfe = RFE(rfe_estimator, n_features_to_select=5).fit(X_scaled, y)
rfe_top = set(features[rfe.support_])

# Final selected features: those chosen by both methods, in alphabetical order.
best_features = sorted(skb_top.intersection(rfe_top))

print("Best combination of features (SelectKBest & RFE):")
print(best_features)

Best combination of features (SelectKBest & RFE):
['free_throws_attempted', 'games_played', 'goals_attempted', 'points_scored']


3.2 Feature Selection Visualisation

In [6]:
# Express each F-score relative to the largest one so the bars share a 0-1 axis
# with the binary RFE flag.
skb_scores_norm = skb_scores / np.max(skb_scores)
rfe_selected = np.zeros(len(features))
rfe_selected[rfe.support_] = 1

# One row per feature: normalised F-score, RFE flag, and whether both methods agree.
bar_df = pd.DataFrame({
    'Feature': features,
    'SelectKBest': skb_scores_norm,
    'RFE': rfe_selected,
    'Selected_by_Both': [feat in best_features for feat in features]
})

# Sort by the sum of both signals so the strongest features come first.
bar_df['Combined'] = bar_df['SelectKBest'] + bar_df['RFE']
bar_df = bar_df.sort_values(by='Combined', ascending=False).reset_index(drop=True)

# Use Plotly pastel palette
pastel_colors = px.colors.qualitative.Pastel

# Hover payload shared by both bar traces: [feature name, selected-by-both flag].
hover_data = np.stack([bar_df['Feature'], bar_df['Selected_by_Both']], axis=-1)

fig = go.Figure()

# SelectKBest bars (the plotted value is the normalised score, so the hover says so).
fig.add_trace(go.Bar(
    x=bar_df['Feature'],
    y=bar_df['SelectKBest'],
    name='SelectKBest',
    marker_color=pastel_colors[0],
    opacity=0.9,
    text=bar_df['SelectKBest'].round(2),
    customdata=hover_data,
    hovertemplate="Feature: %{customdata[0]}<br>" +
                  "Method: SelectKBest<br>" +
                  "Normalised F-score: %{y:.2f}<br>" +
                  "Selected by Both: %{customdata[1]}<extra></extra>",
    showlegend=True
))

# RFE bars
fig.add_trace(go.Bar(
    x=bar_df['Feature'],
    y=bar_df['RFE'],
    name='RFE',
    marker_color=pastel_colors[1],
    opacity=0.9,
    text=bar_df['RFE'],
    customdata=hover_data,
    hovertemplate="Feature: %{customdata[0]}<br>" +
                  "Method: RFE<br>" +
                  "Selected (1=Yes, 0=No): %{y}<br>" +
                  "Selected by Both: %{customdata[1]}<extra></extra>",
    showlegend=True
))

# Star annotations just above the taller bar of every feature chosen by both methods.
for _idx, row in bar_df[bar_df['Selected_by_Both']].iterrows():
    fig.add_annotation(
        text="★",
        x=row['Feature'],
        y=max(row['SelectKBest'], row['RFE']) + 0.05,
        showarrow=False,
        font=dict(color="#9370DB", size=16)
    )

# Dummy trace so the star appears in the legend (annotations have no legend entry).
fig.add_trace(go.Scatter(
    x=[None],
    y=[None],
    mode='markers',
    name='Selected by Both',
    marker=dict(symbol='star', size=10, color='#9370DB'),
    showlegend=True
))

# Layout
# Axis titles use title=dict(text=..., font=...): the legacy `titlefont` axis
# attribute is rejected by newer Plotly releases, while this form works on both.
fig.update_layout(
    template='plotly_white',
    title="Feature Selection Comparison: SelectKBest vs RFE",
    title_font=dict(size=18, color='#333'),
    font=dict(family="Arial", size=14, color='#4F4F4F'),
    xaxis=dict(
        title=dict(text="Features", font=dict(size=16, color='#4F4F4F')),
        tickfont=dict(size=14, color='#666'),
        tickangle=45
    ),
    yaxis=dict(
        title=dict(text="Score / Selection", font=dict(size=16, color='#4F4F4F')),
        tickfont=dict(size=14, color='#666'),
        showgrid=False
    ),
    barmode='group',
    width=1000,
    height=500,
    margin=dict(t=60, b=60)
)

fig.show()

### 3. Prediction

3.1 My Model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# -- My Model --
# Integer target and the feature matrix restricted to the consensus features.
y = df['5yrs'].astype(int)
X_selected = df[best_features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Logistic regression trained on the training portion only.
regressor = LogisticRegression(solver='liblinear', max_iter=200, random_state=42).fit(X_train, y_train)

# Positive-class probabilities and hard class labels for the held-out rows.
y_prob = regressor.predict_proba(X_test)[:, 1]
y_pred = regressor.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print("Classification Report (Logistic Regression with Selected Features):")
print(report)

Classification Report (Logistic Regression with Selected Features):
              precision    recall  f1-score   support

           0       0.64      0.53      0.58       216
           1       0.71      0.80      0.75       319

    accuracy                           0.69       535
   macro avg       0.68      0.66      0.66       535
weighted avg       0.68      0.69      0.68       535



3.2 Comparison Models

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report

# -- Model 1 --
# Baseline: a single feature ('games_played') fed to ordinary linear regression.
# Define features and target
X1 = df[['games_played']]
y1 = df['5yrs'].astype(int)

# Train-test split (same test_size and random_state as the other models' splits)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Fit linear regression
regressor1 = LinearRegression()
regressor1.fit(X1_train, y1_train)

# Predict a continuous score and apply a 0.5 threshold.
# NOTE: LinearRegression.predict returns a regression output, not a probability;
# the name y1_prob is kept because later cells use it as a ranking score.
y1_prob = regressor1.predict(X1_test)
y1_pred = (y1_prob >= 0.5).astype(int)

# Evaluate
print("Model 1: Linear Regression with 'games_played'")
print(classification_report(y1_test, y1_pred))


# -- Model 2 --
# Logistic regression on every feature column (no feature selection).
# Define features and target
X2 = df.drop(columns=['name', '5yrs'])
y2 = df['5yrs'].astype(int)

# Train-test split (same test_size and random_state as the other models' splits)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Fit logistic regression (library defaults, unlike "My Model" which sets the
# solver, max_iter and random_state explicitly)
regressor2 = LogisticRegression()
regressor2.fit(X2_train, y2_train)

# Predict positive-class probabilities and apply a 0.5 threshold
y2_prob = regressor2.predict_proba(X2_test)[:, 1]
y2_pred = (y2_prob >= 0.5).astype(int)

# Evaluate
print("Model 2: Logistic Regression with All Features")
print(classification_report(y2_test, y2_pred))


# -- Model 3 --
# Random forest regressor on the features chosen by both selection methods.
# Define features and target
X3 = df[best_features]
y3 = df['5yrs'].astype(int)

# Train-test split (same test_size and random_state as the other models' splits)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

# Fit Random Forest Regressor
regressor3 = RandomForestRegressor(n_estimators=100, random_state=42)
regressor3.fit(X3_train, y3_train)

# Predict a continuous score and apply a 0.5 threshold.
# NOTE: this is a regressor's output used as a score, not a classifier probability.
y3_prob = regressor3.predict(X3_test)
y3_pred = (y3_prob >= 0.5).astype(int)

# Evaluate
print("Model 3: Random Forest Regression with Selected Features")
print(classification_report(y3_test, y3_pred))

Model 1: Linear Regression with 'games_played'
              precision    recall  f1-score   support

           0       0.58      0.43      0.49       216
           1       0.67      0.79      0.72       319

    accuracy                           0.64       535
   macro avg       0.62      0.61      0.61       535
weighted avg       0.63      0.64      0.63       535

Model 2: Logistic Regression with All Features
              precision    recall  f1-score   support

           0       0.66      0.56      0.60       216
           1       0.73      0.80      0.76       319

    accuracy                           0.70       535
   macro avg       0.69      0.68      0.68       535
weighted avg       0.70      0.70      0.70       535

Model 3: Random Forest Regression with Selected Features
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       216
           1       0.70      0.70      0.70       319

    accuracy                   

### 4. Precision-Recall Curve Analysis

4.1 Define _PR() Function

In [9]:
def _PR(y_true, y_pred):
    """
    Manually calculates precision and recall from binary predictions.

    Args:
        y_true: Ground truth labels (0 or 1)
        y_pred: Predicted labels (0 or 1)

    Returns:
        precision, recall
    """
    TP = sum((y_true == 1) & (y_pred == 1))
    FP = sum((y_true == 0) & (y_pred == 1))
    FN = sum((y_true == 1) & (y_pred == 0))

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall    = TP / (TP + FN) if (TP + FN) > 0 else 0

    return precision, recall

4.2 Identify Optimal Thresholds

In [10]:
# Calculate precision and recall at different thresholds
def calculate_pr_curve(y_true, y_scores, threshold_count=100):
    """
    Calculates precision and recall values across a range of thresholds
    for plotting a Precision-Recall (PR) curve.

    Inputs are converted to NumPy arrays first, so lists, arrays and pandas
    Series are all accepted.

    Args:
        y_true: Binary ground truth labels (0 or 1).
        y_scores: Continuous model output scores.
        threshold_count: Number of thresholds to test between the min and max of y_scores.

    Returns:
        tuple:
            precisions: Array of precision values at each threshold.
            recalls: Array of recall values at each threshold.
            thresholds: Threshold values used for evaluation.

    Raises:
        ValueError: If y_scores is empty.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_scores)
    if scores.size == 0:
        raise ValueError("y_scores must contain at least one score")

    # Evenly spaced thresholds spanning the observed score range (both ends included).
    thresholds = np.linspace(scores.min(), scores.max(), threshold_count)

    precisions = []
    recalls = []

    for threshold in thresholds:
        # A sample is predicted positive when its score reaches the threshold.
        y_pred_binary = (scores >= threshold).astype(int)

        # Calculate precision and recall using the _PR function
        precision, recall = _PR(labels, y_pred_binary)

        precisions.append(precision)
        recalls.append(recall)

    return np.array(precisions), np.array(recalls), thresholds

# Calculate PR curves for each model
# Every call sweeps the default number of thresholds over that model's own score
# range, so the threshold arrays of different models are on different scales.
# My Model (scores: logistic-regression positive-class probabilities)
precisions, recalls, thresholds = calculate_pr_curve(y_test, y_prob)

# Model 1 (scores: linear-regression outputs)
precisions1, recalls1, thresholds1 = calculate_pr_curve(y1_test, y1_prob)

# Model 2 (scores: logistic-regression positive-class probabilities)
precisions2, recalls2, thresholds2 = calculate_pr_curve(y2_test, y2_prob)

# Model 3 (scores: random-forest-regressor outputs)
precisions3, recalls3, thresholds3 = calculate_pr_curve(y3_test, y3_prob)

# Find optimal thresholds (where precision = recall)
def find_optimal_threshold(precisions, recalls, thresholds):
    """
    Identifies the threshold where precision and recall are closest to each other,
    often used as a balanced trade-off point for classification.

    Points where precision and recall are both 0 (no true positives at that
    threshold) are skipped: their difference is 0, so they would otherwise be
    picked as "optimal" despite being useless. They are only considered when
    every point is of that kind.

    Args:
        precisions: Precision values corresponding to each threshold.
        recalls: Recall values corresponding to each threshold.
        thresholds: Threshold values evaluated.

    Returns:
        tuple:
            optimal_threshold: Threshold where the absolute difference between precision and recall is minimized.
            optimal_precision: Precision value at the optimal threshold.
            optimal_recall: Recall value at the optimal threshold.
    """
    precision_arr = np.asarray(precisions, dtype=float)
    recall_arr = np.asarray(recalls, dtype=float)
    differences = np.abs(precision_arr - recall_arr)

    # Mask out degenerate (0, 0) points unless nothing else is available.
    informative = (precision_arr > 0) | (recall_arr > 0)
    if informative.any():
        differences = np.where(informative, differences, np.inf)

    optimal_idx = int(np.argmin(differences))
    return thresholds[optimal_idx], precisions[optimal_idx], recalls[optimal_idx]

# Balanced operating point (threshold, precision, recall) for each model, taken
# from that model's own PR curve: no suffix = My Model, 1/2/3 = comparison models.
optimal_thresh, optimal_prec, optimal_rec = find_optimal_threshold(precisions, recalls, thresholds)
optimal_thresh1, optimal_prec1, optimal_rec1 = find_optimal_threshold(precisions1, recalls1, thresholds1)
optimal_thresh2, optimal_prec2, optimal_rec2 = find_optimal_threshold(precisions2, recalls2, thresholds2)
optimal_thresh3, optimal_prec3, optimal_rec3 = find_optimal_threshold(precisions3, recalls3, thresholds3)

4.3 Precision-Recall Curves Visualisation

In [11]:
# Use Plotly pastel palette
pastel_colors = px.colors.qualitative.Pastel

# One entry per model; list order fixes both the palette colour and the legend order.
pr_models = [
    dict(curve_name='My Model: Logistic Regression (Selected Features)', short_name='My Model',
         recalls=recalls, precisions=precisions,
         opt_rec=optimal_rec, opt_prec=optimal_prec, opt_thresh=optimal_thresh),
    dict(curve_name='Model 1: Linear Regression (games_played)', short_name='Model 1',
         recalls=recalls1, precisions=precisions1,
         opt_rec=optimal_rec1, opt_prec=optimal_prec1, opt_thresh=optimal_thresh1),
    dict(curve_name='Model 2: Logistic Regression (All Features)', short_name='Model 2',
         recalls=recalls2, precisions=precisions2,
         opt_rec=optimal_rec2, opt_prec=optimal_prec2, opt_thresh=optimal_thresh2),
    dict(curve_name='Model 3: Random Forest (Selected Features)', short_name='Model 3',
         recalls=recalls3, precisions=precisions3,
         opt_rec=optimal_rec3, opt_prec=optimal_prec3, opt_thresh=optimal_thresh3),
]

fig = go.Figure()

# PR curves (all lines are added before any marker so the legend lists curves first)
for color, pr_model in zip(pastel_colors, pr_models):
    fig.add_trace(go.Scatter(x=pr_model['recalls'], y=pr_model['precisions'], mode='lines',
                             name=pr_model['curve_name'],
                             line=dict(color=color)))

# Star markers for optimal thresholds, coloured like the curve they belong to
for color, pr_model in zip(pastel_colors, pr_models):
    marker_name = f"{pr_model['short_name']} optimal (t={pr_model['opt_thresh']:.2f})"
    fig.add_trace(go.Scatter(x=[pr_model['opt_rec']], y=[pr_model['opt_prec']], mode='markers',
                             marker=dict(symbol='star', size=12, color=color),
                             name=marker_name))

# Layout matching bar chart
# Axis titles use title=dict(text=..., font=...): the legacy `titlefont` axis
# attribute is rejected by newer Plotly releases, while this form works on both.
fig.update_layout(
    template='plotly_white',
    title="Precision-Recall Curves with Optimal Thresholds",
    title_font=dict(size=18, color='#333'),
    font=dict(family="Arial", size=14, color='#4F4F4F'),
    xaxis=dict(
        title=dict(text="Recall", font=dict(size=16, color='#4F4F4F')),
        tickfont=dict(size=14, color='#666')
    ),
    yaxis=dict(
        title=dict(text="Precision", font=dict(size=16, color='#4F4F4F')),
        tickfont=dict(size=14, color='#666'),
        range=[0, 1],
        showgrid=True
    ),
    width=1000,
    height=500,
    margin=dict(t=60, b=60)
)

fig.show()

### 5. Model Performance Comparison

In [12]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Row labels of the comparison table, in the order used for every column below.
model_names = [
    'My Model: Logistic Regression (Selected Features)',
    'Model 1: Linear Regression (games_played)',
    'Model 2: Logistic Regression (All Features)',
    'Model 3: Random Forest (Selected Features)'
]

# AUC is threshold-free: it is computed straight from each model's raw scores.
auc = roc_auc_score(y_test, y_prob)
auc1 = roc_auc_score(y1_test, y1_prob)
auc2 = roc_auc_score(y2_test, y2_prob)
auc3 = roc_auc_score(y3_test, y3_prob)

# Hard labels obtained by cutting each model's scores at its own optimal threshold.
y_pred_opt = (y_prob >= optimal_thresh).astype(int)
y1_pred_opt = (y1_prob >= optimal_thresh1).astype(int)
y2_pred_opt = (y2_prob >= optimal_thresh2).astype(int)
y3_pred_opt = (y3_prob >= optimal_thresh3).astype(int)

# Accuracy of those labels, expressed in percent.
accuracy = 100 * accuracy_score(y_test, y_pred_opt)
accuracy1 = 100 * accuracy_score(y1_test, y1_pred_opt)
accuracy2 = 100 * accuracy_score(y2_test, y2_pred_opt)
accuracy3 = 100 * accuracy_score(y3_test, y3_pred_opt)

# Assemble the comparison table column by column and print it as plain text.
results_columns = {
    'Model': model_names,
    'Accuracy (%)': [accuracy, accuracy1, accuracy2, accuracy3],
    'AUC': [auc, auc1, auc2, auc3],
    'Optimal Threshold': [optimal_thresh, optimal_thresh1, optimal_thresh2, optimal_thresh3],
}
results_df = pd.DataFrame(results_columns)

print(results_df)

                                               Model  Accuracy (%)       AUC  \
0  My Model: Logistic Regression (Selected Features)     69.158879  0.750007   
1          Model 1: Linear Regression (games_played)     63.925234  0.694778   
2        Model 2: Logistic Regression (All Features)     70.093458  0.758940   
3         Model 3: Random Forest (Selected Features)     64.672897  0.680345   

   Optimal Threshold  
0           0.546902  
1           0.561487  
2           0.549102  
3           0.493864  
