In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score

# 1. LOAD DATA
df = pd.read_csv("data/raw/player_seasons_with_breakouts.csv")

# 2. IDENTICAL FEATURE ENGINEERING
# (Same "Blindfold" logic as NN and RF to prevent leakage)
history_cols = [col for col in df.columns if col.endswith('_PREV') or col.endswith('_2YRS_AGO')]

# Manually Calculate Trends (Momentum)
df['PTS_Trend'] = df['PTS_PREV'] - df['PTS_2YRS_AGO']
df['AST_Trend'] = df['AST_PREV'] - df['AST_2YRS_AGO']
df['MIN_Trend'] = df['MIN_PREV'] - df['MIN_2YRS_AGO']
df['Usage_Trend'] = df['E_USG_PCT_PREV'] - df['E_USG_PCT_2YRS_AGO']

feature_cols = history_cols + ['AGE', 'EXPERIENCE', 'PTS_Trend', 'AST_Trend', 'MIN_Trend', 'Usage_Trend']

# Drop rows where we lack history or target
df_clean = df.dropna(subset=feature_cols + ['BREAKOUT'])

X = df_clean[feature_cols].values
y = df_clean['BREAKOUT'].values

print(f"Cleaned Dataset Size: {len(df_clean)}")

# 3. SPLIT & SCALE (CRITICAL FOR LOG REGRESSION)
# Logistic Regression uses Gradient Descent (or similar solvers).
# Unlike Random Forest, it requires scaling so weights converge properly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. TRAIN MODEL
# class_weight='balanced': This calculates the ratio automatically (n_neg / n_pos)
# solver='liblinear': Good for smaller datasets (thousands of rows)
# max_iter=1000: Ensures the solver has enough time to converge
log_model = LogisticRegression(
    class_weight='balanced', 
    random_state=42, 
    solver='liblinear',
    max_iter=1000
)

print("Training Logistic Regression...")
log_model.fit(X_train_scaled, y_train)

# 5. EVALUATION (WITH THRESHOLD TUNING)
# Get probabilities for Class 1 (Breakout)
y_pred_probs = log_model.predict_proba(X_test_scaled)[:, 1]

thresholds = np.arange(0.3, 0.8, 0.05)
best_f1 = 0
best_thresh = 0.5

for thresh in thresholds:
    preds = (y_pred_probs > thresh).astype(int)
    score = f1_score(y_test, preds, pos_label=1)
    if score > best_f1:
        best_f1 = score
        best_thresh = thresh

print(f"Best Threshold: {best_thresh:.2f} (F1: {best_f1:.2f})")

# Final Predictions
y_pred = (y_pred_probs > best_thresh).astype(int)
y_test_int = y_test.astype(int)

print("\n--- Final Model Evaluation (Logistic Regression) ---")
print(classification_report(
    y_test_int, 
    y_pred, 
    labels=[0, 1], 
    target_names=['No Breakout', 'Breakout'], 
    zero_division=0
))

# 6. COEFFICIENT ANALYSIS (The "Why")
# In LogReg, the coefficients tell you direction and magnitude.
# Positive Coeff = Increases probability of breakout
# Negative Coeff = Decreases probability
coeffs = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': log_model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\n--- Top 5 Positive Signals (Predicts Breakout) ---")
print(coeffs.head(5))

print("\n--- Top 5 Negative Signals (Predicts No Breakout) ---")
print(coeffs.tail(5))

Cleaned Dataset Size: 6609
Training Logistic Regression...
Best Threshold: 0.60 (F1: 0.53)

--- Final Model Evaluation (Logistic Regression) ---
              precision    recall  f1-score   support

 No Breakout       0.91      0.83      0.87      1081
    Breakout       0.45      0.63      0.53       241

    accuracy                           0.79      1322
   macro avg       0.68      0.73      0.70      1322
weighted avg       0.83      0.79      0.80      1322


--- Top 5 Positive Signals (Predicts Breakout) ---
                  Feature  Coefficient
4       E_DEF_RATING_PREV     0.383264
6                PTS_PREV     0.277104
7            PTS_2YRS_AGO     0.220954
3   E_OFF_RATING_2YRS_AGO     0.127716
11           REB_2YRS_AGO     0.120848

--- Top 5 Negative Signals (Predicts No Breakout) ---
               Feature  Coefficient
12            MIN_PREV    -0.152515
19  E_USG_PCT_2YRS_AGO    -0.181000
18      E_USG_PCT_PREV    -0.234307
2    E_OFF_RATING_PREV    -0.535890
0    E_

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score

# 1. LOAD DATA
df = pd.read_csv("data/raw/player_seasons_with_breakouts.csv")

history_cols = [col for col in df.columns if col.endswith('_PREV') or col.endswith('_2YRS_AGO')]

df['PTS_Trend'] = df['PTS_PREV'] - df['PTS_2YRS_AGO']
df['AST_Trend'] = df['AST_PREV'] - df['AST_2YRS_AGO']
df['MIN_Trend'] = df['MIN_PREV'] - df['MIN_2YRS_AGO']
df['Usage_Trend'] = df['E_USG_PCT_PREV'] - df['E_USG_PCT_2YRS_AGO']

feature_cols = history_cols + ['AGE', 'EXPERIENCE', 'PTS_Trend', 'AST_Trend', 'MIN_Trend', 'Usage_Trend']
df_clean = df.dropna(subset=feature_cols + ['BREAKOUT'])

X = df_clean[feature_cols].values
y = df_clean['BREAKOUT'].values

print(f"Cleaned Dataset Size: {len(df_clean)}")

# 2. SPLIT (NO MANUAL SCALING)
# We pass raw X_train/X_test because the Pipeline contains a StandardScaler step.
# If we scaled here, the pipeline would scale it AGAIN (Double Scaling = Bad).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. DEFINE & TRAIN PIPELINE
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('logreg', LogisticRegression(
        class_weight='balanced', 
        solver='liblinear', 
        C=0.1, 
        max_iter=1000
    ))
])

print("Training Poly-LogReg...")
pipe.fit(X_train, y_train)

# 4. EVALUATION
# Pass RAW X_test (The pipeline will handle scaling and poly expansion)
y_pred_probs = pipe.predict_proba(X_test)[:, 1]

thresholds = np.arange(0.3, 0.8, 0.05)
best_f1 = 0
best_thresh = 0.5

for thresh in thresholds:
    preds = (y_pred_probs > thresh).astype(int)
    score = f1_score(y_test, preds, pos_label=1)
    if score > best_f1:
        best_f1 = score
        best_thresh = thresh

print(f"Best Threshold: {best_thresh:.2f} (F1: {best_f1:.2f})")

y_pred = (y_pred_probs > best_thresh).astype(int)
y_test_int = y_test.astype(int)

print("\n--- Final Model Evaluation (Logistic Regression) ---")
print(classification_report(
    y_test_int, 
    y_pred, 
    labels=[0, 1], 
    target_names=['No Breakout', 'Breakout'], 
    zero_division=0
))

# 5. COEFFICIENT ANALYSIS (THE FIX)
# Step A: Get the model step out of the pipeline
model = pipe.named_steps['logreg']

# Step B: Get the NEW feature names generated by PolynomialFeatures
# We must pass the original feature names so it knows how to label interactions
poly = pipe.named_steps['poly']
new_feature_names = poly.get_feature_names_out(input_features=feature_cols)

# Step C: Create DataFrame with aligned lengths
coeffs = pd.DataFrame({
    'Feature': new_feature_names,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\n--- Top 5 Positive Signals (Predicts Breakout) ---")
print(coeffs.head(5))

print("\n--- Top 5 Negative Signals (Predicts No Breakout) ---")
print(coeffs.tail(5))

Cleaned Dataset Size: 6609
Training Poly-LogReg...
Best Threshold: 0.55 (F1: 0.49)

--- Final Model Evaluation (Logistic Regression) ---
              precision    recall  f1-score   support

 No Breakout       0.91      0.77      0.83      1081
    Breakout       0.39      0.65      0.49       241

    accuracy                           0.75      1322
   macro avg       0.65      0.71      0.66      1322
weighted avg       0.81      0.75      0.77      1322


--- Top 5 Positive Signals (Predicts Breakout) ---
                                Feature  Coefficient
4                     E_DEF_RATING_PREV     0.505791
351             GP_PREV FG_PCT_2YRS_AGO     0.310724
230              PTS_2YRS_AGO PTS_Trend     0.306743
121  E_OFF_RATING_2YRS_AGO REB_2YRS_AGO     0.285553
16                          TS_PCT_PREV     0.285380

--- Top 5 Negative Signals (Predicts No Breakout) ---
                    Feature  Coefficient
308  REB_2YRS_AGO PTS_Trend    -0.239176
350     GP_PREV FG_PCT_PREV  