# ⚽ Football Form Prediction - LSTM Model

This notebook trains an LSTM model to predict match outcomes based on team form sequences.

**Features:**
- Sequences of last 5-10 matches per team
- Captures momentum and form trends
- Exports to ONNX for fast inference

In [None]:
# Install dependencies.
# The leading `!` is an IPython/Jupyter shell escape, so this line only works
# inside a notebook; `--quiet` reduces pip's console output.
!pip install torch pandas scikit-learn onnx onnxruntime --quiet

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## 1. Load Data

In [None]:
# Load match data.
# The Kaggle dataset is used when it can be read; otherwise a reproducible
# synthetic fixture list is generated so the notebook still runs end to end.
try:
    df = pd.read_csv("/kaggle/input/international-football-results-from-1872-to-2017/results.csv")
except OSError as exc:
    # Only a missing/unreadable file triggers the fallback. Any other error
    # (bad CSV, typo, interrupt) propagates instead of silently training on
    # fake data.
    print(f"Could not read dataset ({exc}); using synthetic sample data")
    np.random.seed(42)
    n_matches = 5000
    teams = np.array(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'])
    home_idx = np.random.randint(0, len(teams), n_matches)
    # Shift the home index by 1..len(teams)-1 so a team never plays itself.
    away_idx = (home_idx + np.random.randint(1, len(teams), n_matches)) % len(teams)
    df = pd.DataFrame({
        'date': pd.date_range('2015-01-01', periods=n_matches, freq='D'),
        'home_team': teams[home_idx],
        'away_team': teams[away_idx],
        'home_score': np.random.randint(0, 5, n_matches),
        'away_score': np.random.randint(0, 5, n_matches),
    })

# Create result: 'H' = home win, 'A' = away win, 'D' = draw
df['result'] = np.where(df['home_score'] > df['away_score'], 'H',
                        np.where(df['home_score'] < df['away_score'], 'A', 'D'))

df['date'] = pd.to_datetime(df['date'])
# A stable sort keeps the file order of matches played on the same day, so the
# chronological ordering (and everything derived from it) is reproducible.
df = df.sort_values('date', kind='mergesort').reset_index(drop=True)
print(f"Loaded {len(df)} matches")

## 2. Create Form Sequences

In [None]:
# Number of most recent results per team that make up one form sequence.
SEQUENCE_LENGTH = 5

def get_team_form(team_matches, result_col='result'):
    """Translate result codes into form points.

    Reads ``team_matches[result_col]`` and scores each entry: 'W' is worth 3,
    'D' is worth 1, and 'L' or any unrecognised code is worth 0. The points
    are returned as a list in the same order as the input.
    """
    points = dict(W=3, D=1, L=0)
    scores = []
    for code in team_matches[result_col]:
        scores.append(points[code] if code in points else 0)
    return scores

def create_form_sequences(df, team_col, sequence_length=5):
    """Build a pre-match form sequence for every row of ``df``.

    For each match, the team named in ``df[team_col]`` gets the points from
    its ``sequence_length`` most recent earlier matches *in that same column*
    (so with ``team_col='home_team'`` only previous home matches count).
    Points are 3 for a win, 1 for a draw and 0 for a loss. Shorter histories
    are left-padded with 1 (a neutral, draw-like value). The current match's
    result is recorded only after its sequence is emitted, so a row never
    sees its own outcome.

    Args:
        df: Matches in chronological order, with ``team_col`` and a
            ``'result'`` column holding 'H' (home win), 'A' (away win) or 'D'.
        team_col: ``'home_team'`` scores results from the home side's view;
            any other column name is scored from the away side's view.
        sequence_length: Number of past results per sequence.

    Returns:
        Integer array of shape ``(len(df), sequence_length)``, including for
        an empty ``df``.
    """
    # Decide once which result code means "this side won".
    win_code = 'H' if team_col == 'home_team' else 'A'
    points_by_result = {win_code: 3, 'D': 1}  # anything else scores 0

    team_history = {}
    sequences = []

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows() would build.
    for team, outcome in zip(df[team_col], df['result']):
        past = team_history.setdefault(team, [])

        # past[-0:] would return the whole list, so guard the zero case.
        recent = past[-sequence_length:] if sequence_length > 0 else []
        padding = [1] * (sequence_length - len(recent))  # neutral padding
        sequences.append(padding + recent)

        past.append(points_by_result.get(outcome, 0))

    if not sequences:
        # Keep the array 2-D so downstream stacking works on empty input.
        return np.empty((0, max(sequence_length, 0)), dtype=int)
    return np.array(sequences)

# Build one form-sequence array per side, home first and then away
home_sequences, away_sequences = (
    create_form_sequences(df, side_col, SEQUENCE_LENGTH)
    for side_col in ('home_team', 'away_team')
)

# Report the resulting array shapes
print(f"Home sequences shape: {home_sequences.shape}")
print(f"Away sequences shape: {away_sequences.shape}")

In [None]:
# Pair the home and away sequences along a new trailing axis, giving
# (samples, seq_len, 2), and divide by the maximum form value (3) so every
# feature lies in the 0-1 range.
X = np.stack((home_sequences, away_sequences), axis=-1) / 3.0

# Encode the string match results as integer class labels
le = LabelEncoder()
le.fit(df['result'])
y = le.transform(df['result'])

print(f"X shape: {X.shape}")
print(f"Classes: {le.classes_}")

## 3. Define LSTM Model

In [None]:
class FootballLSTM(nn.Module):
    """LSTM classifier over per-match form sequences.

    The input is a batch of sequences shaped ``(batch, seq_len, input_size)``.
    The LSTM output at the final time step is passed through a small
    fully-connected head that emits one raw score (logit) per class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size=2, hidden_size=64, num_layers=2, num_classes=3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers; with a
            # single layer a non-zero value has no effect and only triggers a
            # warning, so it is disabled in that case.
            dropout=0.2 if num_layers > 1 else 0.0,
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        # x: (batch, seq_len, input_size)
        lstm_out, _ = self.lstm(x)
        # Only the representation at the last time step feeds the classifier.
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)

# Instantiate the network with its default hyper-parameters and summarise it
model = FootballLSTM()
print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")

## 4. Train Model

In [None]:
# Dataset class
class MatchDataset(Dataset):
    """Pairs form-sequence features with their class labels for a DataLoader.

    Args:
        X: Array-like of features; stored as a float32 tensor in ``self.X``.
        y: Array-like of integer class labels; stored as an int64 tensor in
            ``self.y``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # torch.tensor with an explicit dtype replaces the legacy
        # FloatTensor/LongTensor constructors and always copies the input.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# Hold out the final 20% of rows for testing. shuffle=False keeps the existing
# row order, so the test rows come after the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Wrap each partition in a dataset
train_dataset, test_dataset = (
    MatchDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(dataset=test_dataset, batch_size=64)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

In [None]:
# Training: loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 50
best_acc = 0

for epoch in range(EPOCHS):
    # Optimisation pass over the training batches
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluation pass over the held-out batches, with gradients disabled
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            # Predicted class = index of the largest logit in each row
            predicted = outputs.max(dim=1).indices
            total += len(y_batch)
            correct += int((predicted == y_batch).sum())

    accuracy = correct / total

    # Save the weights whenever accuracy strictly improves
    if best_acc < accuracy:
        best_acc = accuracy
        torch.save(model.state_dict(), 'lstm_football_best.pt')

    # Report progress every tenth epoch
    if not (epoch + 1) % 10:
        print(f"Epoch [{epoch+1}/{EPOCHS}] Loss: {total_loss/len(train_loader):.4f} Acc: {accuracy:.4f}")

print(f"\nBest Test Accuracy: {best_acc:.4f}")

## 5. Export to ONNX

In [None]:
# Restore the best checkpoint and switch to inference mode before exporting
model.load_state_dict(torch.load('lstm_football_best.pt'))
model.eval()

# Trace the model with a single random sequence of the training shape.
# Axis 0 of both the input and the output is declared dynamic, so the
# exported graph accepts any batch size.
dummy_input = torch.randn(1, SEQUENCE_LENGTH, 2)

torch.onnx.export(
    model,
    dummy_input,
    'lstm_football.onnx',
    output_names=['prediction'],
    input_names=['form_sequence'],
    dynamic_axes={
        tensor_name: {0: 'batch_size'}
        for tensor_name in ('form_sequence', 'prediction')
    },
)

print("Exported: lstm_football.onnx")

In [None]:
# Test ONNX model: run one held-out sample through the exported graph
import onnxruntime as ort

session = ort.InferenceSession('lstm_football.onnx')

# Test prediction (the exported graph expects float32 input)
test_input = X_test[:1].astype(np.float32)
outputs = session.run(None, {'form_sequence': test_input})

# Convert logits to probabilities with a numerically stable softmax:
# subtracting each row's maximum leaves the result unchanged mathematically
# but prevents overflow in exp() for large logits.
logits = outputs[0]
exp_shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
print(f"Test prediction probabilities: {probs[0]}")
print(f"Classes: {le.classes_}")

## 📥 Download Files

After training:
- `lstm_football.onnx` → `models/trained/`
- `lstm_football_best.pt` → `models/trained/` (PyTorch backup)