# 🚀 Advanced Football Prediction - Complete Training Pipeline

This notebook provides a complete training pipeline:
1. Load pre-trained models from HuggingFace
2. Download and prepare training data
3. Fine-tune models on football data
4. Export for production use

**Run on Kaggle with GPU enabled for best performance!**

In [None]:
!pip install -q torch transformers huggingface_hub xgboost lightgbm catboost
!pip install -q onnx onnxruntime kagglehub pandas scikit-learn

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, brier_score_loss
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'Using device: {DEVICE}')

## 1. Load Training Data from Multiple Sources

In [None]:
def load_kaggle_datasets(sources=None):
    """Load match results from CSV files, or synthesize a sample if none are usable.

    Every candidate file is read if possible. A file is kept only when it
    contains the match-level columns the rest of the notebook reads (``date``,
    ``home_team``, ``away_team``, ``home_score``, ``away_score``). Files that
    cannot be read, or that have a different schema, are reported and skipped so
    they cannot add all-NaN match rows to the combined table.

    Args:
        sources: optional iterable of CSV paths. Defaults to the two Kaggle
            input paths this notebook was written against.

    Returns:
        pandas.DataFrame: the usable sources concatenated with a fresh index
        (or the single usable source as read). When nothing is usable, a
        seeded random sample of 10,000 matches between 50 teams.
    """
    required = ['date', 'home_team', 'away_team', 'home_score', 'away_score']
    if sources is None:
        # Try multiple dataset sources
        sources = [
            '/kaggle/input/international-football-results-from-1872-to-2017/results.csv',
            '/kaggle/input/football-events/events.csv',
        ]

    dfs = []
    for src in sources:
        try:
            df = pd.read_csv(src)
        except (OSError, UnicodeDecodeError,
                pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # A missing or unreadable file is expected outside Kaggle: say so and move on.
            print(f'Skipping {src}: {err}')
            continue

        missing = [col for col in required if col not in df.columns]
        if missing:
            # Not a match-results table; concatenating it would only add NaN rows.
            print(f'Skipping {src}: missing columns {missing}')
            continue

        dfs.append(df)
        print(f'Loaded {len(df)} rows from {src}')

    if not dfs:
        # Generate sample data
        print('Creating sample training data...')
        np.random.seed(42)  # fixed seed so the fallback sample is identical on every run
        n = 10000
        df = pd.DataFrame({
            'date': pd.date_range('2015-01-01', periods=n, freq='D'),
            'home_team': np.random.choice(['Team'+str(i) for i in range(50)], n),
            'away_team': np.random.choice(['Team'+str(i) for i in range(50)], n),
            'home_score': np.random.randint(0, 5, n),
            'away_score': np.random.randint(0, 5, n)
        })
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else dfs[0]

# Build the match table once and report how many rows it holds.
df = load_kaggle_datasets()
print(f'Total samples: {df.shape[0]}')

## 2. Feature Engineering

In [None]:
class FeatureEngineer:
    """Derive pre-match features: Elo ratings, integer team ids and calendar fields.

    Ratings are stored in ``self.elo`` and persist between calls to
    ``process``, so processing a second frame continues from the ratings left
    by the first one.

    Args:
        k: Elo K-factor, i.e. the maximum rating change from a single match.
        initial_elo: rating given to a team that has no stored rating yet.
    """

    # Columns that must be populated for a row to count as a played match.
    REQUIRED_COLUMNS = ['date', 'home_team', 'away_team', 'home_score', 'away_score']

    def __init__(self, k=32, initial_elo=1500):
        self.elo = {}
        self.team_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.K = k
        self.initial_elo = initial_elo

    def get_elo(self, team):
        """Return the stored rating of ``team``, or the initial rating if it has none."""
        return self.elo.get(team, self.initial_elo)

    def update_elo(self, home, away, result):
        """Update both teams' ratings after one match.

        Args:
            home: home team name.
            away: away team name.
            result: ``'H'`` for a home win, ``'A'`` for an away win; any other
                value is scored as a draw.
        """
        h_elo, a_elo = self.get_elo(home), self.get_elo(away)
        # Expected score of the home side under the logistic Elo curve.
        exp_h = 1 / (1 + 10**((a_elo - h_elo) / 400))

        if result == 'H':
            s_h, s_a = 1, 0
        elif result == 'A':
            s_h, s_a = 0, 1
        else:
            s_h, s_a = 0.5, 0.5

        self.elo[home] = h_elo + self.K * (s_h - exp_h)
        self.elo[away] = a_elo + self.K * (s_a - (1 - exp_h))

    def process(self, df):
        """Return a date-sorted copy of ``df`` with result, Elo, team-id and date columns.

        Rows lacking a date, team or score are dropped first. Without that, the
        score comparison would label them as draws and they would move the Elo
        ratings. The Elo columns hold each team's rating *before* the match;
        ratings are updated only after they have been recorded.

        Args:
            df: frame with the columns listed in ``REQUIRED_COLUMNS``.

        Returns:
            pandas.DataFrame: a new frame; the input is not modified.
        """
        df = df.copy()
        df['date'] = pd.to_datetime(df['date'])
        df = df.dropna(subset=self.REQUIRED_COLUMNS)
        # Stable sort: fixtures played on the same day keep their source order,
        # so the sequence of Elo updates is reproducible.
        df = df.sort_values('date', kind='stable')

        # Result
        df['result'] = np.where(df['home_score'] > df['away_score'], 'H',
                       np.where(df['home_score'] < df['away_score'], 'A', 'D'))

        # Elo ratings (walk the matches chronologically; plain column iteration
        # avoids building a Series per row as iterrows does)
        elo_h, elo_a, elo_diff = [], [], []
        for home, away, outcome in zip(df['home_team'], df['away_team'], df['result']):
            h, a = self.get_elo(home), self.get_elo(away)
            elo_h.append(h)
            elo_a.append(a)
            elo_diff.append(h - a)
            self.update_elo(home, away, outcome)

        df['home_elo'], df['away_elo'], df['elo_diff'] = elo_h, elo_a, elo_diff

        # Team encoding (one shared id space for home and away sides)
        all_teams = pd.concat([df['home_team'], df['away_team']]).unique()
        self.team_encoder.fit(all_teams)
        df['home_enc'] = self.team_encoder.transform(df['home_team'])
        df['away_enc'] = self.team_encoder.transform(df['away_team'])

        # Date features
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['dow'] = df['date'].dt.dayofweek

        return df

# Run the feature pipeline over the match table and preview the first rows.
fe = FeatureEngineer()
df = fe.process(df)
print(df.head()[['home_team', 'away_team', 'home_elo', 'away_elo', 'result']])

## 3. Prepare Training Data

In [None]:
# Model input columns, and the column holding the match outcome label.
FEATURES = [
    'home_enc', 'away_enc',
    'home_elo', 'away_elo', 'elo_diff',
    'year', 'month', 'dow',
]
TARGET = 'result'

# Map the outcome labels to integer class ids.
le_result = LabelEncoder()
le_result.fit(df['result'])
df['result_enc'] = le_result.transform(df['result'])

X = df.loc[:, FEATURES].values
y = df.loc[:, 'result_enc'].values

# Time-based split: shuffle=False keeps the row order, so the final 20% of
# rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')
print(f'Classes: {le_result.classes_}')

## 4. Train Ensemble Models

In [None]:
def _fit_and_report(label, clf):
    """Fit ``clf`` on the training split, print its test accuracy and return it.

    Args:
        label: display name used in the printed accuracy line.
        clf: unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        The same estimator, fitted.
    """
    clf.fit(X_train, y_train)
    # Flatten in case an estimator returns its predictions as an (n, 1) column
    # instead of a flat vector.
    test_pred = np.ravel(clf.predict(X_test))
    print(f'{label} Accuracy: {accuracy_score(y_test, test_pred):.4f}')
    return clf


# Fitted estimators keyed by the short names used for the ensemble weights.
models = {}

# XGBoost
xgb = models['xgb'] = _fit_and_report(
    'XGBoost',
    XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.05, random_state=42),
)

# LightGBM
lgb = models['lgb'] = _fit_and_report(
    'LightGBM',
    LGBMClassifier(n_estimators=300, max_depth=8, learning_rate=0.05, random_state=42, verbose=-1),
)

# CatBoost
cat = models['cat'] = _fit_and_report(
    'CatBoost',
    CatBoostClassifier(iterations=300, depth=8, learning_rate=0.05, random_state=42, verbose=0),
)

## 5. Neural Network Model

In [None]:
class FootballNet(nn.Module):
    """Feed-forward classifier producing three logits per input row.

    Layout: Linear(input_dim, hidden) -> ReLU -> Dropout(0.3) ->
    Linear(hidden, 64) -> ReLU -> Dropout(0.2) -> Linear(64, 3).
    """

    def __init__(self, input_dim, hidden=128):
        super().__init__()
        # Layer order is kept so the Sequential indices in the state_dict stay the same.
        layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 3),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw (un-normalised) class scores for ``x``."""
        logits = self.net(x)
        return logits

# Normalize: fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Seed PyTorch before the network is built so the weight initialisation and
# dropout masks are repeatable between runs (GPU kernels may still introduce
# small run-to-run differences).
torch.manual_seed(42)

# Train
net = FootballNet(len(FEATURES)).to(DEVICE)
opt = torch.optim.Adam(net.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

X_t = torch.FloatTensor(X_train_s).to(DEVICE)
y_t = torch.LongTensor(y_train).to(DEVICE)

# Training mode enables dropout; nothing in the loop changes the mode, so set it once.
net.train()
for epoch in range(100):
    # One full-batch gradient step per epoch.
    opt.zero_grad()
    loss = loss_fn(net(X_t), y_t)
    loss.backward()
    opt.step()

# Evaluation mode disables dropout for the test-set predictions.
net.eval()
with torch.no_grad():
    preds = net(torch.FloatTensor(X_test_s).to(DEVICE)).argmax(1).cpu().numpy()
print(f'Neural Net Accuracy: {accuracy_score(y_test, preds):.4f}')

## 6. Ensemble Prediction

In [None]:
def ensemble_predict(X, models, net, scaler, weights=None):
    """Blend class probabilities from the three boosted models and the neural net.

    Args:
        X: unscaled feature matrix, one row per match.
        models: mapping with fitted ``'xgb'``, ``'lgb'`` and ``'cat'`` estimators,
            each exposing ``predict_proba``.
        net: torch module returning one logit per class.
        scaler: fitted transformer applied to ``X`` before it is fed to ``net``.
        weights: mapping with the keys ``'xgb'``, ``'lgb'``, ``'cat'`` and ``'nn'``.
            Defaults to 0.3 / 0.3 / 0.25 / 0.15.

    Returns:
        numpy.ndarray: weighted sum of the four probability matrices divided by
        the sum of all weights; shape ``(len(X), n_classes)``.

    Raises:
        ValueError: if the weights sum to zero.
    """
    if weights is None:
        # Built per call so the default is never a shared mutable object.
        weights = {'xgb': 0.3, 'lgb': 0.3, 'cat': 0.25, 'nn': 0.15}

    total_weight = sum(weights.values())
    if total_weight == 0:
        raise ValueError('ensemble weights must not sum to zero')

    # The class count comes from the models' outputs rather than being fixed at 3.
    probs = 0.0
    for name in ('xgb', 'lgb', 'cat'):
        probs = probs + weights[name] * np.asarray(models[name].predict_proba(X))

    # Put the inputs on whatever device the network's parameters live on.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    net.eval()
    with torch.no_grad():
        inputs = torch.as_tensor(scaler.transform(X), dtype=torch.float32, device=device)
        nn_probs = torch.softmax(net(inputs), dim=1).cpu().numpy()
    probs = probs + weights['nn'] * nn_probs

    return probs / total_weight

# Score the blended probabilities on the held-out split: overall accuracy
# first, then the per-class report.
ens_probs = ensemble_predict(X_test, models, net, scaler)
ens_preds = np.argmax(ens_probs, axis=1)
print(f'Ensemble Accuracy: {accuracy_score(y_test, ens_preds):.4f}')
print(classification_report(y_test, ens_preds, target_names=le_result.classes_))

## 7. Export Models

In [None]:
import json
import pickle

# Boosted models, each written with its own library's save routine.
models['xgb'].save_model('xgb_football.json')          # XGBoost
models['lgb'].booster_.save_model('lgb_football.txt')  # LightGBM (underlying booster)
models['cat'].save_model('cat_football.cbm')           # CatBoost

# Neural net: only the weights (state_dict) are stored, not the class definition.
torch.save(net.state_dict(), 'nn_football.pt')

# Encoders and scaler. NOTE: this is a pickle file; unpickling can execute
# arbitrary code, so only load it from a trusted source.
with open('encoders.pkl', 'wb') as f:
    pickle.dump(
        {
            'team_enc': fe.team_encoder,
            'result_enc': le_result,
            'scaler': scaler,
        },
        f,
    )

# Elo ratings as a JSON mapping of team name to rating.
with open('elo_ratings.json', 'w') as f:
    f.write(json.dumps(fe.elo))

# Metadata: feature order, class labels, blend weights and test accuracy.
meta = {
    'features': FEATURES,
    'classes': list(le_result.classes_),
    'ensemble_weights': {'xgb':0.3, 'lgb':0.3, 'cat':0.25, 'nn':0.15},
    'accuracy': float(accuracy_score(y_test, ens_preds))
}
with open('model_meta.json', 'w') as f:
    f.write(json.dumps(meta, indent=2))

print('All models exported!')

## 📥 Download Files

After running, download these files for your Flask app:
```
xgb_football.json   → models/trained/
lgb_football.txt    → models/trained/
cat_football.cbm    → models/trained/
nn_football.pt      → models/trained/
encoders.pkl        → models/config/
elo_ratings.json    → models/config/
model_meta.json     → models/config/
```