In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-game feature table and report its size and column names.
games_csv = 'data/processed/games_with_features.csv'
data = pd.read_csv(games_csv)

n_games = len(data)
column_names = data.columns.tolist()
print(f"Loaded {n_games} games")
print(f"Columns: {column_names}")


Loaded 27401 games
Columns: ['GAME_DATE_EST', 'SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'pts_avg_10_home', 'fg_pct_avg_10_home', 'ft_pct_avg_10_home', 'fg3_pct_avg_10_home', 'ast_avg_10_home', 'reb_avg_10_home', 'pts_avg_10_away', 'fg_pct_avg_10_away', 'ft_pct_avg_10_away', 'fg3_pct_avg_10_away', 'ast_avg_10_away', 'reb_avg_10_away', 'rest_days_home', 'rest_days_away', 'back_to_back_home', 'back_to_back_away', 'target']


In [2]:
# Identifier columns and the label are kept out of the model inputs; every
# other column in the frame is used as a feature, in its original order.
metadata_cols = ['GAME_DATE_EST', 'SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'target']
excluded = set(metadata_cols)
feature_cols = [name for name in data.columns if name not in excluded]

X = data.loc[:, feature_cols]
y = data['target']

print(f"\nFeatures ({len(feature_cols)}):")
for feature_name in feature_cols:
    print(f"  • {feature_name}")


Features (16):
  • pts_avg_10_home
  • fg_pct_avg_10_home
  • ft_pct_avg_10_home
  • fg3_pct_avg_10_home
  • ast_avg_10_home
  • reb_avg_10_home
  • pts_avg_10_away
  • fg_pct_avg_10_away
  • ft_pct_avg_10_away
  • fg3_pct_avg_10_away
  • ast_avg_10_away
  • reb_avg_10_away
  • rest_days_home
  • rest_days_away
  • back_to_back_home
  • back_to_back_away


In [3]:
# Positional hold-out: the first 80% of rows train the model and the
# remaining rows evaluate it. Nothing is shuffled.
# NOTE(review): this presumes the rows are already in chronological order,
# so the test set is strictly "later" than the training set — confirm.
train_fraction = 0.8
split_idx = int(len(X) * train_fraction)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

rule = '=' * 60
print(f"\n{rule}")
print("DATA SPLIT")
print(rule)
print(f"Training: {len(X_train)} games ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test:     {len(X_test)} games ({len(X_test)/len(X)*100:.1f}%)")
# The mean of the label is reported as the home win rate of each split.
print(f"Train home win rate: {y_train.mean():.3f}")
print(f"Test home win rate:  {y_test.mean():.3f}")


DATA SPLIT
Training: 21920 games (80.0%)
Test:     5481 games (20.0%)
Train home win rate: 0.597
Test home win rate:  0.556


In [4]:
# ============================================
# REST FEATURES DIAGNOSTIC
# ============================================

print("\n" + "="*60)
print("REST FEATURES: BASIC CHECK")
print("="*60)

# Summary statistics for both rest-day columns, computed on the training
# split only, to confirm the columns exist and actually vary.
# NOTE(review): the recorded output shows maxima of roughly 250 days,
# presumably gaps that span an off-season — confirm whether these should
# be capped or treated separately.
for rest_col in ('rest_days_home', 'rest_days_away'):
    rest_days = X_train[rest_col]
    print(f"\n{rest_col}:")
    print(f"  Mean: {rest_days.mean():.2f}")
    print(f"  Std:  {rest_days.std():.2f}")
    print(f"  Min:  {rest_days.min()}")
    print(f"  Max:  {rest_days.max()}")

# Class balance of the home back-to-back flag in the training split.
print("\nback_to_back_home:")
b2b_home_counts = X_train['back_to_back_home'].value_counts()
print(b2b_home_counts)


REST FEATURES: BASIC CHECK

rest_days_home:
  Mean: 3.97
  Std:  16.58
  Min:  1.0
  Max:  250.0

rest_days_away:
  Mean: 3.73
  Std:  16.65
  Min:  1.0
  Max:  251.0

back_to_back_home:
back_to_back_home
0    18873
1     3047
Name: count, dtype: int64


In [None]:
# Naive reference point: the accuracy obtained by predicting a home win for
# every test game, which equals the mean of the test labels.
baseline_acc = y_test.mean()

rule = '=' * 60
print(f"\n{rule}")
print(f"BASELINE (always predict home win): {baseline_acc:.3f}")
print(rule)


In [None]:
# Standardise every feature to zero mean / unit variance before fitting.
# LogisticRegression applies an L2 penalty by default, and the raw columns
# live on very different scales (per-game averages, shooting percentages,
# rest days, 0/1 flags). Without scaling the penalty shrinks small-scale
# features far more than large-scale ones, the solver converges slowly, and
# coefficient magnitudes cannot be compared across features.
# The statistics come from the training split only, so nothing about the
# test period leaks into the fit.
train_mean = X_train.mean()
train_std = X_train.std().replace(0, 1)  # constant column: avoid dividing by zero
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# NOTE: `model` is fitted on the scaled features. Any later call to
# model.predict / model.predict_proba must receive inputs transformed with
# the same train_mean / train_std.
model = LogisticRegression(max_iter=10000, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Accuracy
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"\n{'='*60}")
print(f"LOGISTIC REGRESSION RESULTS")
print(f"{'='*60}")
print(f"Training Accuracy:  {train_acc:.3f}")
print(f"Test Accuracy:      {test_acc:.3f}")
print(f"Baseline Accuracy:  {baseline_acc:.3f}")
# Absolute gain in accuracy points, followed by the relative gain over baseline.
print(f"Improvement:        {test_acc - baseline_acc:+.3f} ({(test_acc/baseline_acc - 1)*100:+.1f}%)")
print(f"{'='*60}")



In [None]:
# Per-class precision / recall / F1 on the held-out games.
class_names = ['Away Win', 'Home Win']
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred_test, target_names=class_names)
print(report_text)

# Confusion matrix: rows are the actual outcomes, columns the predictions.
cm = confusion_matrix(y_test, y_pred_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# Rank features by the absolute size of their fitted coefficient.
# Coefficient magnitudes are only comparable across features when the model
# inputs share a common scale; read the ranking with that in mind.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': model.coef_[0]
})
feature_importance['abs_coef'] = feature_importance['coefficient'].abs()
feature_importance = feature_importance.sort_values('abs_coef', ascending=False)

print("\n" + "="*60)
print("FEATURE IMPORTANCE (by coefficient magnitude)")
print("="*60)
print(feature_importance.to_string(index=False))

# Plot
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# previously left the sized figure empty and drew the bars at the default
# size. Create the Axes up front and pass it in so the 10x6 size applies to
# the chart that is labelled and saved.
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.sort_values('coefficient').plot(
    x='feature', y='coefficient', kind='barh', legend=False, ax=ax
)
ax.set_xlabel('Coefficient Value')
ax.set_title('Feature Importance (Logistic Regression Coefficients)')
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Feature importance saved as 'feature_importance.png'")
