# ⚽ Football Match Prediction - XGBoost Training

This notebook trains an XGBoost model for football match prediction.

**Features:**
- Loads Kaggle datasets directly with `kagglehub` (no manual download needed)
- Trains XGBoost classifier for Home/Draw/Away
- Exports model for Flask integration

**Datasets Used:**
- davidcariboo/player-scores *(listed for reference; not loaded by the code cells below)*
- martj42/international-football-results-from-1872-to-2017

In [None]:
# Install dependencies
!pip install kagglehub xgboost pandas scikit-learn --quiet

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

## 1. Load Data from Kaggle

Using `kagglehub` to load datasets directly - no manual download required!

In [None]:
# Load the match results, trying three sources in order:
#   1. download through kagglehub,
#   2. the dataset mounted under /kaggle/input (Kaggle notebooks),
#   3. a small synthetic table so the rest of the notebook can still run.
# Only ordinary errors (Exception) trigger a fallback; KeyboardInterrupt and
# SystemExit propagate so the cell can still be interrupted.
KAGGLE_DATASET = "martj42/international-football-results-from-1872-to-2017"
KAGGLE_INPUT_CSV = "/kaggle/input/international-football-results-from-1872-to-2017/results.csv"

try:
    # Method 1: Using kagglehub (if available)
    import kagglehub
    path = kagglehub.dataset_download(KAGGLE_DATASET)
    df = pd.read_csv(f"{path}/results.csv")
    print(f"Loaded {len(df)} matches via kagglehub")
except Exception as kagglehub_error:
    print(f"kagglehub load failed ({type(kagglehub_error).__name__}: {kagglehub_error})")
    try:
        # Method 2: Direct Kaggle input (in Kaggle notebooks)
        df = pd.read_csv(KAGGLE_INPUT_CSV)
        print(f"Loaded {len(df)} matches from Kaggle input")
    except Exception as input_error:
        print(f"Kaggle input load failed ({type(input_error).__name__}: {input_error})")
        # Method 3: Sample data for testing
        print("Creating sample data for demonstration...")
        n_sample = 1000
        sample_teams = np.array(['Team A', 'Team B', 'Team C', 'Team D'])
        # Seeded generator so the synthetic table is identical on every run.
        sample_rng = np.random.default_rng(42)
        home_idx = sample_rng.integers(0, len(sample_teams), n_sample)
        # Offset by 1..len-1 (mod len) so a team never plays against itself.
        away_idx = (home_idx + sample_rng.integers(1, len(sample_teams), n_sample)) % len(sample_teams)
        df = pd.DataFrame({
            'date': pd.date_range('2020-01-01', periods=n_sample, freq='D'),
            'home_team': sample_teams[home_idx],
            'away_team': sample_teams[away_idx],
            'home_score': sample_rng.integers(0, 5, n_sample),
            'away_score': sample_rng.integers(0, 5, n_sample)
        })

df.head()

## 2. Feature Engineering

In [None]:
# Create target variable (result)
def get_result(row):
    """Classify one match from its scores.

    Reads `row['home_score']` and `row['away_score']` and returns
    'H' (home win), 'A' (away win) or 'D' (anything else, i.e. a draw).
    """
    home_goals, away_goals = row['home_score'], row['away_score']
    if home_goals > away_goals:
        return 'H'
    if away_goals > home_goals:
        return 'A'
    return 'D'

# Label every match row with its outcome code
df['result'] = df.apply(get_result, axis=1)

# Share of each outcome across all matches
print("Result distribution:")
result_share = df['result'].value_counts(normalize=True)
print(result_share)

In [None]:
# Integer-encode team names and match outcomes.
# Both team encoders are fitted on the union of home and away names, so a
# given team maps to the same code whichever side it plays on.
all_teams = pd.concat([df['home_team'], df['away_team']]).unique()

le_home = LabelEncoder()
le_home.fit(all_teams)
df['home_team_encoded'] = le_home.transform(df['home_team'])

le_away = LabelEncoder()
le_away.fit(all_teams)
df['away_team_encoded'] = le_away.transform(df['away_team'])

# The outcome encoder is fitted directly on the labels it transforms
le_result = LabelEncoder()
df['result_encoded'] = le_result.fit_transform(df['result'])

print(f"Number of unique teams: {len(all_teams)}")

In [None]:
# Calculate Elo ratings
def calculate_elo_ratings(df, k=32, initial_elo=1500, scale=400):
    """Compute pre-match Elo ratings for every row of `df`.

    Rows are processed in the order they appear in `df`, so the caller is
    responsible for passing matches in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'home_team', 'away_team' and 'result' columns, where
        'result' is 'H' (home win) or 'A' (away win); any other value is
        scored as a draw.
    k : float, default 32
        Update step size (K-factor).
    initial_elo : float, default 1500
        Rating given to a team the first time it appears.
    scale : float, default 400
        Rating difference divisor used in the expected-score formula.

    Returns
    -------
    pandas.DataFrame
        One row per input row (default RangeIndex) with columns 'home_elo',
        'away_elo' and 'elo_diff' (home minus away), all measured *before*
        the match is played. The columns are present even for empty input.
    """
    columns = ['home_elo', 'away_elo', 'elo_diff']
    outcome_points = {'H': (1, 0), 'A': (0, 1)}
    elo = {}
    elo_history = []

    # Iterating the three columns directly avoids building a Series per row,
    # which is what makes DataFrame.iterrows slow on large tables.
    for home, away, result in zip(df['home_team'], df['away_team'], df['result']):
        # Initialize if new team
        home_elo = elo.setdefault(home, initial_elo)
        away_elo = elo.setdefault(away, initial_elo)

        # Store pre-match Elo
        elo_history.append((home_elo, away_elo, home_elo - away_elo))

        # Expected scores from the rating gap
        exp_home = 1 / (1 + 10 ** ((away_elo - home_elo) / scale))
        exp_away = 1 - exp_home

        # Actual scores; anything that is not 'H' or 'A' counts as a draw
        score_home, score_away = outcome_points.get(result, (0.5, 0.5))

        # Update Elo
        elo[home] += k * (score_home - exp_home)
        elo[away] += k * (score_away - exp_away)

    return pd.DataFrame(elo_history, columns=columns)

# Attach the pre-match Elo columns to the match table.
# Elo columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not append duplicate columns.
elo_df = calculate_elo_ratings(df)
df = pd.concat(
    [df.drop(columns=elo_df.columns, errors='ignore').reset_index(drop=True), elo_df],
    axis=1
)
df.head()

In [None]:
# Calendar features derived from the match date
match_dates = pd.to_datetime(df['date'])
df['date'] = match_dates
df['year'] = match_dates.dt.year
df['month'] = match_dates.dt.month
df['day_of_week'] = match_dates.dt.dayofweek

# Rating gap rescaled by 1/100, used as a rough expected goal difference
df['expected_gd'] = df['elo_diff'] / 100

print(f"Features created. Dataset shape: {df.shape}")

## 3. Train XGBoost Model

In [None]:
# Columns fed to the model, in the order the model will expect them
feature_cols = [
    'home_team_encoded',
    'away_team_encoded',
    'home_elo',
    'away_elo',
    'elo_diff',
    'year',
    'month',
    'day_of_week',
    'expected_gd',
]

X, y = df[feature_cols], df['result_encoded']

# shuffle=False keeps the existing row order, so the final 20% of rows become
# the test set. This is a time-based split only if df is in date order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

for split_label, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set: {len(split_frame)} matches")

In [None]:
# Train XGBoost
# `use_label_encoder` is intentionally not passed: the option is deprecated
# and no longer exists in current XGBoost releases, and the target is already
# integer-encoded by le_result.
model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)

# eval_set records mlogloss on the held-out rows per boosting round; no early
# stopping is configured here, so all 200 rounds are trained.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

print("Training complete!")

## 4. Evaluate Model

In [None]:
# Hard class predictions and per-class probabilities for the held-out matches
y_pred, y_pred_proba = model.predict(X_test), model.predict_proba(X_test)

# Overall accuracy, then per-class precision/recall labelled with the
# original outcome codes stored in le_result
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"\nClassification Report:")
class_names = le_result.classes_
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Feature importance
import matplotlib.pyplot as plt

# Pair each feature name with its importance score, highest first
importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top bar is the most
# important feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance['feature'], importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. Export Model

In [None]:
import json
import pickle

# 1) Trained booster, in XGBoost's own JSON format
model.save_model('xgboost_football.json')
print("Saved: xgboost_football.json")

# 2) Fitted label encoders plus the feature column order, pickled together
encoders = dict(
    le_home=le_home,
    le_away=le_away,
    le_result=le_result,
    feature_cols=feature_cols,
)
with open('encoders.pkl', 'wb') as encoders_file:
    pickle.dump(encoders, encoders_file)
print("Saved: encoders.pkl")

# 3) Human-readable summary of what was trained
metadata = dict(
    model_type='XGBClassifier',
    version='1.0.0',
    accuracy=float(accuracy),
    features=feature_cols,
    classes=list(le_result.classes_),
    training_samples=len(X_train),
)
with open('model_metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)
print("Saved: model_metadata.json")

## 6. Test Prediction

In [None]:
# Test prediction function
def _encode_team(encoder, team):
    """Return the integer code of `team`, or 0 if the encoder rejects it."""
    try:
        return encoder.transform([team])[0]
    except ValueError:
        # scikit-learn's LabelEncoder raises ValueError for unseen labels.
        # Code 0 is a best-effort placeholder; it is also the code of the
        # first fitted team, so predictions for unknown teams are unreliable.
        return 0


def predict_match(home_team, away_team, model, encoders,
                  home_elo=1600, away_elo=1500, match_date='2026-01-03'):
    """Predict the outcome of a single match.

    Parameters
    ----------
    home_team, away_team : str
        Team names. A name the encoder does not know is encoded as 0.
    model : fitted classifier
        Must provide `predict` and `predict_proba`.
    encoders : dict
        Needs 'le_home' and 'le_result'. 'le_away' is used for the away team
        when present (falling back to 'le_home'), and 'feature_cols', when
        present, fixes the column order passed to the model.
    home_elo, away_elo : float, default 1600 / 1500
        Pre-match ratings; 'elo_diff' and 'expected_gd' are derived from them.
    match_date : date-like, default '2026-01-03'
        Anything `pandas.Timestamp` accepts; supplies year, month and
        day of week. The default is a Saturday in January 2026.

    Returns
    -------
    dict
        'home_team', 'away_team', 'probabilities' (outcome label -> model
        probability) and 'prediction' (the predicted outcome label).
    """
    le_home = encoders['le_home']
    le_away = encoders.get('le_away', le_home)
    le_result = encoders['le_result']

    when = pd.Timestamp(match_date)
    elo_diff = home_elo - away_elo

    # Create features (simplified: ratings come from the arguments, not from
    # the team's rating history)
    features = pd.DataFrame([{
        'home_team_encoded': _encode_team(le_home, home_team),
        'away_team_encoded': _encode_team(le_away, away_team),
        'home_elo': home_elo,
        'away_elo': away_elo,
        'elo_diff': elo_diff,
        'year': when.year,
        'month': when.month,
        'day_of_week': when.dayofweek,
        'expected_gd': elo_diff / 100
    }])

    # Present the columns in the same order the model was trained on.
    feature_order = encoders.get('feature_cols')
    if feature_order is not None:
        features = features[list(feature_order)]

    # Predict
    proba = model.predict_proba(features)[0]
    pred = model.predict(features)[0]

    return {
        'home_team': home_team,
        'away_team': away_team,
        'probabilities': dict(zip(le_result.classes_, proba)),
        'prediction': le_result.inverse_transform([pred])[0]
    }

# Smoke-test the helper on a single fixture and show what it returns
result = predict_match(home_team='Germany', away_team='Brazil', model=model, encoders=encoders)
print(f"\n{result['home_team']} vs {result['away_team']}")
print(f"Probabilities: {result['probabilities']}")
print(f"Prediction: {result['prediction']}")

## 📥 Download Files

After training, download these files and place in your Flask app:
- `xgboost_football.json` → `models/trained/`
- `encoders.pkl` → `models/trained/`
- `model_metadata.json` → `models/config/`