In [13]:
import pandas as pd
import io
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# 1. LOAD & CLEAN DATA
df = pd.read_csv("data/raw/player_seasons_with_breakouts.csv")

feature_cols = [col for col in df.columns if col.endswith('_PREV')]
feature_cols.extend(['AGE', 'EXPERIENCE'])

# CRITICAL FIX: Drop rows where we don't know the answer (BREAKOUT is NaN)
# This prevents errors if your data includes the current 2024-25 season.
df_clean = df.dropna(subset=feature_cols + ['BREAKOUT'])

X = df_clean[feature_cols].values
y = df_clean['BREAKOUT'].values

# 2. SPLIT & SCALE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test)

# 3. SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Convert to Tensors
X_train_tensor = torch.tensor(X_train_resampled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_resampled, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# 4. DEFINE MODEL
class BreakoutClassifier(nn.Module):
    def __init__(self, input_dim):
        super(BreakoutClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_dim, 64) 
        self.layer_2 = nn.Linear(64, 32)
        self.layer_out = nn.Linear(32, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.dropout(x)
        x = self.relu(self.layer_2(x))
        x = self.dropout(x)
        x = self.sigmoid(self.layer_out(x))
        return x

model = BreakoutClassifier(input_dim=X_train.shape[1])

# 5. TRAIN
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Training Neural Network...")
epochs = 1000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 200 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# 6. EVALUATION (UPDATED)
model.eval()
with torch.no_grad():
    y_pred_probs = model(X_test_tensor)
    
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_f1 = 0
    best_thresh = 0.5

    for thresh in thresholds:
        y_pred_t = (y_pred_probs > thresh).float().numpy().flatten().astype(int)
        # Calculate F1 for class 1 (Breakout) manually or use f1_score
        # Precision = TP / (TP + FP)
        # Recall = TP / (TP + FN)
        # F1 = 2 * (P * R) / (P + R)
        from sklearn.metrics import f1_score
        score = f1_score(y_test, y_pred_t, pos_label=1)
    
    if score > best_f1:
        best_f1 = score
        best_thresh = thresh

    print(f"Best Threshold: {best_thresh:.2f} | Best F1: {best_f1:.2f}")

    # Use the best threshold for your final report
    y_pred = (y_pred_probs > best_thresh).float().numpy().flatten().astype(int)
    y_test = y_test.astype(int)

print("\n--- Model Evaluation (Neural Net) ---")
# 3. FORCE LABELS: explicitly tell sklearn to look for 0 and 1.
print(classification_report(
    y_test, 
    y_pred, 
    labels=[0, 1], 
    target_names=['No Breakout', 'Breakout'], 
    zero_division=0
))

Training Neural Network...
Epoch [200/1000], Loss: 0.5038
Epoch [400/1000], Loss: 0.4876
Epoch [600/1000], Loss: 0.4764
Epoch [800/1000], Loss: 0.4653
Epoch [1000/1000], Loss: 0.4570
Best Threshold: 0.85 | Best F1: 0.28

--- Model Evaluation (Neural Net) ---
              precision    recall  f1-score   support

 No Breakout       0.83      0.98      0.90      1293
    Breakout       0.66      0.18      0.28       310

    accuracy                           0.82      1603
   macro avg       0.75      0.58      0.59      1603
weighted avg       0.80      0.82      0.78      1603



In [15]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score

# 1. LOAD DATA
df = pd.read_csv("data/raw/player_seasons_with_breakouts.csv")

# --- FIX: DATA LEAKAGE REMOVAL ---
# We strictly filter for columns ending in _PREV or _2YRS_AGO
# We DO NOT include 'CHANGE' or 'GROWTH' columns from the CSV.
history_cols = [col for col in df.columns if col.endswith('_PREV') or col.endswith('_2YRS_AGO')]

# --- FIX: MANUAL FEATURE ENGINEERING ---
# We manually calculate the *prior* trend. 
# "How much did they improve from 2 years ago to last year?"
# This is safe because it happened in the past.
df['PTS_Trend'] = df['PTS_PREV'] - df['PTS_2YRS_AGO']
df['AST_Trend'] = df['AST_PREV'] - df['AST_2YRS_AGO']
df['MIN_Trend'] = df['MIN_PREV'] - df['MIN_2YRS_AGO']
df['Usage_Trend'] = df['E_USG_PCT_PREV'] - df['E_USG_PCT_2YRS_AGO']

# Combine valid features
feature_cols = history_cols + ['AGE', 'EXPERIENCE', 'PTS_Trend', 'AST_Trend', 'MIN_Trend', 'Usage_Trend']

# Drop rows where we don't have enough history to calculate trends
# (Players need at least 2 prior years to have a "Trend")
df_clean = df.dropna(subset=feature_cols + ['BREAKOUT'])

print(f"Cleaned Dataset Size: {len(df_clean)}")

X = df_clean[feature_cols].values
y = df_clean['BREAKOUT'].values

# 2. SPLIT & SCALE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test)

# Convert to Tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# 3. CLASS WEIGHTS (Recalculated)
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()
pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32)

# 4. MODEL ARCHITECTURE (Reduced complexity to prevent overfitting)
class BreakoutClassifier(nn.Module):
    def __init__(self, input_dim):
        super(BreakoutClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_dim, 32) # Reduced from 64
        self.bn1 = nn.BatchNorm1d(32)
        
        self.layer_2 = nn.Linear(32, 16)        # Reduced from 32
        self.bn2 = nn.BatchNorm1d(16)
        
        self.layer_out = nn.Linear(16, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5) # High dropout for regularization
        
    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.layer_1(x))))
        x = self.dropout(self.relu(self.bn2(self.layer_2(x))))
        return self.layer_out(x)

model = BreakoutClassifier(input_dim=X_train.shape[1])

# 5. TRAINING
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# Increased weight decay (L2 regularization) to fight overfitting
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-3) 

print("Training Corrected Model...")
epochs = 1000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# 6. EVALUATION
model.eval()
with torch.no_grad():
    logits = model(X_test_tensor)
    y_pred_probs = torch.sigmoid(logits).numpy().flatten()

# Find Best Threshold
thresholds = np.arange(0.3, 0.8, 0.05)
best_f1 = 0
best_thresh = 0.5
for thresh in thresholds:
    preds = (y_pred_probs > thresh).astype(int)
    score = f1_score(y_test, preds, pos_label=1)
    if score > best_f1:
        best_f1 = score
        best_thresh = thresh

print(f"Best Threshold: {best_thresh:.2f} (F1: {best_f1:.2f})")
y_pred = (y_pred_probs > best_thresh).astype(int)

print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['No Breakout', 'Breakout'], zero_division=0))

Cleaned Dataset Size: 6609
Training Corrected Model...
Best Threshold: 0.65 (F1: 0.53)
              precision    recall  f1-score   support

 No Breakout       0.91      0.85      0.88      1081
    Breakout       0.48      0.60      0.53       241

    accuracy                           0.81      1322
   macro avg       0.69      0.73      0.71      1322
weighted avg       0.83      0.81      0.82      1322

