# FootyPredict Pro - V4.0 Model Training

**Automated GPU Training Notebook for Kaggle**

This notebook trains the 698-feature ensemble model (XGBoost, LightGBM, CatBoost) on T4/P100 GPU.

**Triggered automatically by GitHub Actions every 1st & 15th of the month.**

In [None]:
# Install required packages.
# -q keeps pip's output quiet. No versions are pinned here, so each run
# installs whatever releases pip currently resolves.
!pip install -q xgboost lightgbm catboost optuna scikit-learn joblib pandas numpy

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
import json
from datetime import datetime
from pathlib import Path

# ML Libraries
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, log_loss
from sklearn.calibration import CalibratedClassifierCV

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna

print(f"Training started: {datetime.now()}")

# xgb.config_context() only returns a context manager and reports nothing
# about hardware, so printing it never showed GPU availability. Use the
# presence of the NVIDIA driver CLI on PATH as a lightweight heuristic.
import shutil

print(f"GPU Available: {shutil.which('nvidia-smi') is not None}")

In [None]:
# Configuration
# 'markets' lists the targets the training cell loops over and 'use_gpu' is
# read by EnsembleTrainer. NOTE(review): 'n_splits', 'n_optuna_trials' and
# 'test_size' are not read by any cell visible in this notebook.
CONFIG = {
    'n_splits': 3,
    'n_optuna_trials': 20,
    'test_size': 100,
    'random_state': 42,
    'markets': ['result', 'over25', 'btts'],
    'use_gpu': True
}

# Paths
DATA_PATH = '/kaggle/input/footypredict-data/merged_training_data.parquet'
OUTPUT_DIR = Path('/kaggle/working/models')
# parents=True so the notebook also runs where /kaggle/working does not
# exist yet (e.g. a local dry run) instead of failing on the missing parent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Config: {CONFIG}")

In [None]:
# Load training data
print("Loading training data...")

if os.path.exists(DATA_PATH):
    data = pd.read_parquet(DATA_PATH)
else:
    # Fallback: try a CSV with the same name next to the expected parquet file
    csv_path = DATA_PATH.replace('.parquet', '.csv')
    if not os.path.exists(csv_path):
        # Name both locations so a failed scheduled run is easy to diagnose.
        raise FileNotFoundError(
            f"Training data not found at {DATA_PATH} or {csv_path}"
        )
    data = pd.read_csv(csv_path)

print(f"Loaded {len(data)} matches")
print(f"Columns: {len(data.columns)}")
# 'date' is treated as optional by the feature generator, so do not assume
# it here either.
if 'date' in data.columns:
    print(f"Date range: {data['date'].min()} to {data['date'].max()}")
else:
    print("Date range: unavailable (no 'date' column)")
# NOTE(review): the training cell splits rows positionally and calls the
# split time-based, which presumes `data` is already in chronological
# order - confirm against the dataset build step.

In [None]:
# Feature Engineering
# NOTE(review): "698 features" is the advertised figure; the feature groups
# implemented in the generator below add at most 30 columns.
print("Generating features...")

class FeatureGenerator:
    """Build a pre-match feature table from a match-results DataFrame.

    Columns derived from a match's own outcome (``FTHG``, ``FTAG``, ``FTR``)
    are shifted down by ``lag`` rows before use. Those same columns are what
    the prediction targets are built from, so feeding the current row's
    values to a model would hand it its own label.

    The groups implemented here add at most 30 columns; the head-to-head and
    venue groups are constant placeholders.

    Rows are assumed to be in chronological order - TODO confirm upstream.
    """

    def __init__(self, data, lag=1):
        """Copy the input and remember the outcome lag.

        Args:
            data: Match DataFrame. Every source column is optional; a group
                whose columns are missing is skipped.
            lag: Number of rows outcome-derived columns are shifted by.
                ``1`` (default) means a row only sees earlier rows' results;
                ``0`` reproduces the un-lagged values.

        Raises:
            ValueError: If ``lag`` is negative (that would expose results of
                later rows).
        """
        if lag < 0:
            raise ValueError("lag must be >= 0")
        self.data = data.copy()
        self.lag = lag

    def _lagged(self, series):
        """Return ``series`` shifted down by ``self.lag`` rows."""
        # The first ``lag`` rows become NaN; callers are expected to fill them.
        return series.shift(self.lag) if self.lag else series

    def generate_all(self):
        """Run every feature group and return the combined DataFrame.

        Returns:
            DataFrame indexed like the input. The first ``lag`` rows of the
            outcome-derived columns are NaN.
        """
        features = pd.DataFrame(index=self.data.index)

        # Order fixes the column order of the resulting matrix.
        steps = (
            self._add_basic_features,    # 1. Basic match features
            self._add_rolling_features,  # 2. Rolling averages (5, 10, 20 games)
            self._add_h2h_features,      # 3. Head-to-head features
            self._add_form_features,     # 4. Form features
            self._add_venue_features,    # 5. Home/Away splits
            self._add_odds_features,     # 6. Odds-derived features
            self._add_time_features,     # 7. Time-based features
        )
        for step in steps:
            features = step(features)

        print(f"Generated {len(features.columns)} features")
        return features

    def _add_basic_features(self, features):
        """Add lagged goal counts, their sum and their difference."""
        # Both columns are read below, so require both (checking only FTHG
        # would raise KeyError when FTAG is absent).
        if 'FTHG' in self.data.columns and 'FTAG' in self.data.columns:
            features['home_goals'] = self._lagged(self.data['FTHG'])
            features['away_goals'] = self._lagged(self.data['FTAG'])
            features['total_goals'] = features['home_goals'] + features['away_goals']
            features['goal_diff'] = features['home_goals'] - features['away_goals']
        return features

    def _add_rolling_features(self, features):
        """Add rolling means of goals over the previous 5, 10 and 20 rows."""
        for window in [5, 10, 20]:
            for col in ['FTHG', 'FTAG']:
                if col in self.data.columns:
                    # Lag first so the window ends before the current row.
                    features[f'{col}_rolling_{window}'] = (
                        self._lagged(self.data[col])
                        .rolling(window, min_periods=1)
                        .mean()
                    )
        return features

    def _add_h2h_features(self, features):
        """Add head-to-head features (currently a constant placeholder)."""
        # Placeholder - would need home/away team columns
        features['h2h_available'] = 0
        return features

    def _add_form_features(self, features):
        """Add lagged one-hot indicators of the full-time result."""
        if 'FTR' in self.data.columns:
            result = self.data['FTR']
            features['home_win'] = self._lagged((result == 'H').astype(int))
            features['away_win'] = self._lagged((result == 'A').astype(int))
            features['draw'] = self._lagged((result == 'D').astype(int))
        return features

    def _add_venue_features(self, features):
        """Add venue-based features (currently a constant placeholder)."""
        features['is_home'] = 1  # Placeholder
        return features

    def _add_odds_features(self, features):
        """Add raw bookmaker odds and their reciprocals.

        Odds columns are used as-is, without lagging. An odds value of 0
        yields ``inf`` in the ``*_prob`` column; this method does not
        clean it.
        """
        odds_cols = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA']
        for col in odds_cols:
            if col in self.data.columns:
                features[f'{col}_prob'] = 1 / self.data[col]
                features[f'{col}_value'] = self.data[col]
        return features

    def _add_time_features(self, features):
        """Add day-of-week, month and weekend flag parsed from ``date``."""
        if 'date' in self.data.columns:
            dates = pd.to_datetime(self.data['date'])
            features['day_of_week'] = dates.dt.dayofweek
            features['month'] = dates.dt.month
            # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
            features['is_weekend'] = dates.dt.dayofweek.isin([5, 6]).astype(int)
        return features

# Build the raw feature matrix from the loaded match data
generator = FeatureGenerator(data)
features = generator.generate_all()

# Clean features: infinities become NaN first, so a single fill pass
# zeroes out both infinities and missing values.
features = features.replace([np.inf, -np.inf], np.nan).fillna(0)

print(f"Final feature matrix: {features.shape}")

In [None]:
# Training Pipeline
class EnsembleTrainer:
    """Train and evaluate an averaged XGBoost / LightGBM / CatBoost ensemble.

    One ensemble is trained per target. Fitted models are kept in
    ``self.models`` and hold-out metrics in ``self.results``, both keyed by
    target name.
    """

    # Console labels for the progress messages, keyed by model name.
    _DISPLAY_NAMES = {
        'xgboost': 'XGBoost',
        'lightgbm': 'LightGBM',
        'catboost': 'CatBoost',
    }

    def __init__(self, config):
        """Store the run configuration.

        Args:
            config: Dict of settings. ``use_gpu`` selects GPU or CPU training
                for all three libraries (default ``False`` when absent) and
                ``random_state`` seeds them (default ``42`` when absent).
        """
        self.config = config
        self.models = {}
        self.results = {}

    def _build_models(self):
        """Return the three unfitted classifiers keyed by library name."""
        use_gpu = self.config.get('use_gpu', False)
        # Take the seed from the config instead of a hard-coded literal so
        # the configured random_state actually takes effect.
        seed = self.config.get('random_state', 42)
        return {
            'xgboost': xgb.XGBClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                tree_method='hist',  # GPU-compatible
                device='cuda' if use_gpu else 'cpu',
                random_state=seed,
            ),
            'lightgbm': lgb.LGBMClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                device='gpu' if use_gpu else 'cpu',
                random_state=seed,
                verbosity=-1,
            ),
            'catboost': CatBoostClassifier(
                iterations=200,
                depth=6,
                learning_rate=0.1,
                task_type='GPU' if use_gpu else 'CPU',
                random_state=seed,
                verbose=False,
            ),
        }

    def train(self, X, y, target_name='result', train_fraction=0.8):
        """Fit the three models on the leading rows and score the remainder.

        Hyperparameters are fixed; no Optuna search is run here.

        Args:
            X: Feature DataFrame. Rows are split by position, so the split is
                only time-based if the rows are already in chronological
                order.
            y: Target labels aligned row-for-row with ``X``.
            target_name: Key under which models and metrics are stored.
            train_fraction: Share of leading rows used for fitting; the rest
                is the hold-out set.

        Returns:
            Tuple ``(models, accuracy)``: the dict of fitted models and the
            ensemble's hold-out accuracy.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length, if
                ``train_fraction`` is not strictly between 0 and 1, or if
                the split would leave the train or test part empty.
        """
        print(f"\n{'='*50}")
        print(f"Training {target_name.upper()} model")
        print(f"{'='*50}")

        n_rows = len(X)
        if len(y) != n_rows:
            raise ValueError(
                f"X and y must have the same length, got {n_rows} and {len(y)}"
            )
        if not 0 < train_fraction < 1:
            raise ValueError("train_fraction must be strictly between 0 and 1")

        # Train-test split (time-based when rows are chronological)
        split_idx = int(n_rows * train_fraction)
        if split_idx == 0 or split_idx == n_rows:
            raise ValueError(
                f"Not enough rows ({n_rows}) to form both a train and a test set"
            )

        # Encode target
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # .iloc makes the positional intent explicit.
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y_encoded[:split_idx], y_encoded[split_idx:]

        print(f"Train: {len(X_train)}, Test: {len(X_test)}")

        # Train individual models
        models = self._build_models()
        fit_kwargs = {'xgboost': {'verbose': False}}
        for name, model in models.items():
            print(f"Training {self._DISPLAY_NAMES[name]}...")
            model.fit(X_train, y_train, **fit_kwargs.get(name, {}))

        # Average ensemble: unweighted mean of the class probabilities
        print("Creating ensemble...")
        ensemble_proba = np.mean(
            [model.predict_proba(X_test) for model in models.values()], axis=0
        )
        ensemble_pred = np.argmax(ensemble_proba, axis=1)

        # Evaluate. Passing labels keeps log_loss working when the hold-out
        # slice happens to miss one of the classes. float() gives plain
        # Python numbers for the JSON summary.
        accuracy = float(accuracy_score(y_test, ensemble_pred))
        logloss = float(
            log_loss(y_test, ensemble_proba, labels=np.arange(len(le.classes_)))
        )

        print(f"\nResults for {target_name}:")
        print(f"  Accuracy: {accuracy:.2%}")
        print(f"  Log Loss: {logloss:.4f}")

        # Store results
        self.models[target_name] = {
            'xgboost': models['xgboost'],
            'lightgbm': models['lightgbm'],
            'catboost': models['catboost'],
            'label_encoder': le,
            'feature_names': list(X.columns)
        }

        self.results[target_name] = {
            'accuracy': accuracy,
            'log_loss': logloss,
            'train_size': len(X_train),
            'test_size': len(X_test)
        }

        return models, accuracy

    def save_models(self, output_dir):
        """Write each target's model bundle and a JSON metrics summary.

        Args:
            output_dir: Directory to write into; created (including missing
                parents) if it does not exist.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for target_name, model_dict in self.models.items():
            model_path = output_dir / f'{target_name}_ensemble.joblib'
            joblib.dump(model_dict, model_path)
            print(f"Saved {model_path}")

        # Save results summary
        results_path = output_dir / 'training_results.json'
        with open(results_path, 'w') as f:
            json.dump(self.results, f, indent=2)
        print(f"Saved {results_path}")

# One shared trainer: its models/results dicts gain an entry per target trained.
trainer = EnsembleTrainer(CONFIG)

In [None]:
# Train for each market
print("\n" + "="*60)
print("STARTING MULTI-MARKET TRAINING")
print("="*60)

# Prepare targets
targets = {}

# 1. Match Result (Home/Draw/Away)
if 'FTR' in data.columns:
    targets['result'] = data['FTR']

if 'FTHG' in data.columns and 'FTAG' in data.columns:
    # Comparisons against NaN evaluate to False, so without this mask a match
    # with missing goals would be labelled 'False' and survive the NaN filter
    # below. Masking restores NaN for those rows.
    goals_known = data['FTHG'].notna() & data['FTAG'].notna()

    # 2. Over 2.5 Goals
    targets['over25'] = (
        (data['FTHG'] + data['FTAG'] > 2.5).astype(str).where(goals_known)
    )

    # 3. Both Teams to Score
    targets['btts'] = (
        ((data['FTHG'] > 0) & (data['FTAG'] > 0)).astype(str).where(goals_known)
    )

# Train each market
for market in CONFIG['markets']:
    if market not in targets:
        print(f"Skipping {market} - target not available")
        continue

    y = targets[market]
    # Align features with target: keep only rows whose label is known
    valid_idx = y.notna()
    X = features.loc[valid_idx]
    y = y.loc[valid_idx]

    trainer.train(X, y, target_name=market)

In [None]:
# Persist every trained ensemble plus the metrics summary
print("\n" + "=" * 60, "SAVING MODELS", "=" * 60, sep="\n")

trainer.save_models(OUTPUT_DIR)

# Show what ended up in the output directory, with sizes in KB
print("\nOutput files:")
for f in OUTPUT_DIR.iterdir():
    print("  {} ({:.1f} KB)".format(f.name, f.stat().st_size / 1024))

In [None]:
# Final report: timestamp, per-market metrics and where to fetch the models
print("\n" + "=" * 60, "TRAINING COMPLETE", "=" * 60, sep="\n")

print("\nTimestamp: {}".format(datetime.now()))
print("\nResults Summary:")
for market, results in trainer.results.items():
    print("\n{}:".format(market.upper()))
    print("  Accuracy: {:.2%}".format(results['accuracy']))
    print("  Log Loss: {:.4f}".format(results['log_loss']))

print("\nModels saved to: {}".format(OUTPUT_DIR))
print("\nDownload models from Kaggle Output tab or via API:")
print("  kaggle kernels output username/footypredict-training")