# FIFA World Cup 2026 Predictions
## Exploratory Data Analysis & Machine Learning Models

**Project Goal:** Predict outcomes for the 2026 FIFA World Cup (48-team format) using historical match data, team performance metrics, and machine learning models.

**Datasets:**
- `results.csv`: Historical international match results (1872-present)
- `goalscorers.csv`: Individual goal-scoring records
- `shootouts.csv`: Penalty shootout outcomes
- `former_names.csv`: Country name changes over time

**Current Date:** January 1, 2026

## 1. Import Required Libraries
Import data manipulation, visualization, and machine learning libraries.

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')

print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"Analysis date: {datetime.now().strftime('%Y-%m-%d')}")

Libraries imported successfully!
Pandas version: 2.3.3
Analysis date: 2026-01-01


## 2. Load and Prepare Data
Load FIFA datasets and perform initial data quality checks.

In [2]:
# Load datasets
results_df = pd.read_csv('../data/raw/results.csv')
goalscorers_df = pd.read_csv('../data/raw/goalscorers.csv')
shootouts_df = pd.read_csv('../data/raw/shootouts.csv')
former_names_df = pd.read_csv('../data/raw/former_names.csv')

print("Dataset Shapes:")
print(f"Results: {results_df.shape}")
print(f"Goalscorers: {goalscorers_df.shape}")
print(f"Shootouts: {shootouts_df.shape}")
print(f"Former Names: {former_names_df.shape}")

# Display sample data
print("\n=== Results Sample ===")
display(results_df.head(3))

print("\n=== Goalscorers Sample ===")
display(goalscorers_df.head(3))

print("\n=== Shootouts Sample ===")
display(shootouts_df.head(3))

Dataset Shapes:
Results: (48891, 9)
Goalscorers: (44447, 8)
Shootouts: (662, 5)
Former Names: (34, 4)

=== Results Sample ===


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False



=== Goalscorers Sample ===


Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False



=== Shootouts Sample ===


Unnamed: 0,date,home_team,away_team,winner,first_shooter
0,1967-08-22,India,Taiwan,Taiwan,
1,1971-11-14,South Korea,Vietnam Republic,South Korea,
2,1972-05-07,South Korea,Iraq,Iraq,


In [3]:
# Data quality checks
print("=== Data Quality Assessment ===\n")

print("Results Dataset:")
print(f"Missing values:\n{results_df.isnull().sum()}")
print(f"\nDate range: {results_df['date'].min()} to {results_df['date'].max()}")
print(f"Unique teams: {len(set(results_df['home_team'].unique()) | set(results_df['away_team'].unique()))}")
print(f"Tournament types: {results_df['tournament'].nunique()}")

# Convert date columns
results_df['date'] = pd.to_datetime(results_df['date'])
goalscorers_df['date'] = pd.to_datetime(goalscorers_df['date'])
shootouts_df['date'] = pd.to_datetime(shootouts_df['date'])

# Add derived columns
results_df['year'] = results_df['date'].dt.year
results_df['home_win'] = (results_df['home_score'] > results_df['away_score']).astype(int)
results_df['draw'] = (results_df['home_score'] == results_df['away_score']).astype(int)
results_df['away_win'] = (results_df['home_score'] < results_df['away_score']).astype(int)
results_df['total_goals'] = results_df['home_score'] + results_df['away_score']

print("\n✓ Data loaded and preprocessed successfully!")

=== Data Quality Assessment ===

Results Dataset:
Missing values:
date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

Date range: 1872-11-30 to 2025-12-18
Unique teams: 333
Tournament types: 190

✓ Data loaded and preprocessed successfully!


## 3. Feature Engineering for Predictions
Create team performance metrics and features for machine learning models.

In [4]:
def calculate_team_stats(df, team_col, lookback_years=4):
    """
    Calculate team performance statistics over a rolling window.
    
    Args:
        df: Results dataframe
        team_col: Column name ('home_team' or 'away_team')
        lookback_years: Number of years to calculate stats over
    
    Returns:
        Dictionary of team statistics
    """
    team_stats = {}
    
    for team in df[team_col].unique():
        # Filter matches for this team
        team_matches = df[(df['home_team'] == team) | (df['away_team'] == team)].copy()
        
        # Calculate win rate
        home_wins = ((team_matches['home_team'] == team) & (team_matches['home_win'] == 1)).sum()
        away_wins = ((team_matches['away_team'] == team) & (team_matches['away_win'] == 1)).sum()
        total_matches = len(team_matches)
        
        if total_matches > 0:
            win_rate = (home_wins + away_wins) / total_matches
            
            # Goals scored and conceded
            goals_scored = (
                team_matches[team_matches['home_team'] == team]['home_score'].sum() +
                team_matches[team_matches['away_team'] == team]['away_score'].sum()
            )
            goals_conceded = (
                team_matches[team_matches['home_team'] == team]['away_score'].sum() +
                team_matches[team_matches['away_team'] == team]['home_score'].sum()
            )
            
            team_stats[team] = {
                'matches_played': total_matches,
                'win_rate': win_rate,
                'goals_per_match': goals_scored / total_matches,
                'goals_conceded_per_match': goals_conceded / total_matches,
                'goal_difference_per_match': (goals_scored - goals_conceded) / total_matches
            }
    
    return team_stats

# Calculate recent performance (last 4 years before 2026)
recent_matches = results_df[results_df['year'] >= 2022].copy()
team_performance = calculate_team_stats(recent_matches, 'home_team')

# Convert to DataFrame
team_stats_df = pd.DataFrame(team_performance).T
team_stats_df = team_stats_df.sort_values('win_rate', ascending=False)

print("=== Top 20 Teams by Win Rate (2022-2025) ===")
display(team_stats_df.head(20))

=== Top 20 Teams by Win Rate (2022-2025) ===


Unnamed: 0,matches_played,win_rate,goals_per_match,goals_conceded_per_match,goal_difference_per_match
Székely Land,2.0,1.0,5.0,1.0,4.0
Elba Island,1.0,1.0,5.0,0.0,5.0
Tamil Eelam,3.0,1.0,3.0,0.666667,2.333333
Kernow,1.0,1.0,2.0,1.0,1.0
Ynys Môn,6.0,0.833333,2.166667,1.166667,1.0
Jersey,11.0,0.818182,3.0,1.0,2.0
Argentina,51.0,0.784314,2.235294,0.45098,1.784314
Isle of Wight,4.0,0.75,1.75,1.0,0.75
Zanzibar,4.0,0.75,1.0,0.5,0.5
Morocco,57.0,0.701754,1.77193,0.526316,1.245614


## 4. Build Match Outcome Prediction Model
Train machine learning models to predict match outcomes (Win/Draw/Loss).

In [None]:
# Prepare training data
def prepare_match_features(df, team_stats):
    """
    Create feature vectors for each match based on team statistics.
    """
    features = []
    labels = []
    
    for idx, row in df.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        
        # Skip if team stats not available
        if home_team not in team_stats or away_team not in team_stats:
            continue
        
        # Feature vector
        feature_vec = [
            team_stats[home_team]['win_rate'],
            team_stats[home_team]['goals_per_match'],
            team_stats[home_team]['goals_conceded_per_match'],
            team_stats[home_team]['goal_difference_per_match'],
            team_stats[away_team]['win_rate'],
            team_stats[away_team]['goals_per_match'],
            team_stats[away_team]['goals_conceded_per_match'],
            team_stats[away_team]['goal_difference_per_match'],
            1 if row['neutral'] else 0,  # Neutral venue
        ]
        
        features.append(feature_vec)
        
        # Label: 0=away_win, 1=draw, 2=home_win
        if row['home_win'] == 1:
            labels.append(2)
        elif row['draw'] == 1:
            labels.append(1)
        else:
            labels.append(0)
    
    return np.array(features), np.array(labels)

# Use World Cup and major tournament matches for training
tournament_matches = results_df[
    results_df['tournament'].isin(['FIFA World Cup', 'UEFA Euro', 'Copa América', 'African Cup of Nations'])
].copy()

# Filter to matches after 2010 for relevance
modern_matches = tournament_matches[tournament_matches['year'] >= 2010].copy()

print(f"Training on {len(modern_matches)} matches from major tournaments (2010-2025)")

# Calculate team stats using all available data up to each match
team_stats_all = calculate_team_stats(results_df[results_df['year'] >= 2010], 'home_team')

# Prepare features
X, y = prepare_match_features(modern_matches, team_stats_all)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"\nOutcome distribution:")
print(f"  Away wins: {(y == 0).sum()} ({(y == 0).sum()/len(y)*100:.1f}%)")
print(f"  Draws: {(y == 1).sum()} ({(y == 1).sum()/len(y)*100:.1f}%)")
print(f"  Home wins: {(y == 2).sum()} ({(y == 2).sum()/len(y)*100:.1f}%)")

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train multiple models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5)
}

results = {}

print("=== Model Training Results ===\n")

for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }
    
    print(f"{name}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Away Win', 'Draw', 'Home Win']))
    print("-" * 60)

# Select best model
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = results[best_model_name]['model']

print(f"\n✓ Best model: {best_model_name} with {results[best_model_name]['accuracy']:.3f} accuracy")

## 5. Evaluate Model Performance
Visualize model performance with confusion matrices and feature importance.

In [None]:
# Confusion Matrix
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (name, data) in enumerate(results.items()):
    cm = confusion_matrix(y_test, data['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=['Away Win', 'Draw', 'Home Win'],
                yticklabels=['Away Win', 'Draw', 'Home Win'])
    axes[idx].set_title(f'{name}\nAccuracy: {data["accuracy"]:.3f}')
    axes[idx].set_ylabel('True Label')
    axes[idx].set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

# Feature Importance (for tree-based models)
feature_names = [
    'Home Win Rate', 'Home Goals/Match', 'Home Conceded/Match', 'Home Goal Diff/Match',
    'Away Win Rate', 'Away Goals/Match', 'Away Conceded/Match', 'Away Goal Diff/Match',
    'Neutral Venue'
]

if hasattr(best_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()
    
    print("=== Feature Importance ===")
    display(importance_df)

## 6. Save Trained Model
Save the best model and scaler for later use in predictions.

In [None]:
import joblib
import os

# Create models directory
os.makedirs('../models', exist_ok=True)

# Save model and scaler
model_path = f'../models/world_cup_predictor_{best_model_name.lower().replace(" ", "_")}.pkl'
scaler_path = '../models/feature_scaler.pkl'
stats_path = '../models/team_stats.pkl'

joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(team_stats_all, stats_path)

print(f"✓ Model saved to: {model_path}")
print(f"✓ Scaler saved to: {scaler_path}")
print(f"✓ Team stats saved to: {stats_path}")

# Test loading
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)
loaded_stats = joblib.load(stats_path)

print("\n✓ Successfully loaded model, scaler, and team stats for verification!")

## 7. Visualize Historical Trends
Analyze World Cup performance trends and patterns.

In [None]:
# World Cup matches only
wc_matches = results_df[results_df['tournament'] == 'FIFA World Cup'].copy()

# Goals over time
goals_by_year = wc_matches.groupby('year').agg({
    'total_goals': 'mean',
    'home_score': 'count'
}).reset_index()
goals_by_year.columns = ['year', 'avg_goals_per_match', 'total_matches']

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=goals_by_year['year'],
    y=goals_by_year['avg_goals_per_match'],
    mode='lines+markers',
    name='Avg Goals per Match',
    line=dict(color='#FF6B6B', width=3),
    marker=dict(size=8)
))

fig.add_trace(go.Bar(
    x=goals_by_year['year'],
    y=goals_by_year['total_matches'],
    name='Total Matches',
    yaxis='y2',
    marker=dict(color='#4ECDC4', opacity=0.6)
))

fig.update_layout(
    title='FIFA World Cup: Goals and Matches Over Time',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Average Goals per Match', side='left'),
    yaxis2=dict(title='Total Matches', overlaying='y', side='right'),
    hovermode='x unified',
    height=500
)

fig.show()

# Home advantage in World Cup
home_advantage = wc_matches.groupby('year').agg({
    'home_win': 'mean',
    'draw': 'mean',
    'away_win': 'mean'
}).reset_index()

fig2 = go.Figure()

fig2.add_trace(go.Bar(x=home_advantage['year'], y=home_advantage['home_win']*100, 
                      name='Home Win %', marker_color='#2ECC71'))
fig2.add_trace(go.Bar(x=home_advantage['year'], y=home_advantage['draw']*100, 
                      name='Draw %', marker_color='#F39C12'))
fig2.add_trace(go.Bar(x=home_advantage['year'], y=home_advantage['away_win']*100, 
                      name='Away Win %', marker_color='#E74C3C'))

fig2.update_layout(
    title='Match Outcome Distribution by World Cup Year',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Percentage (%)'),
    barmode='stack',
    height=500
)

fig2.show()

print("✓ Visualizations generated successfully!")

## 8. Generate 2026 World Cup Predictions
Use the trained model to predict potential matchups for the 2026 tournament.

In [None]:
def predict_match(team1, team2, model, scaler, team_stats, neutral=True):
    """
    Predict outcome of a match between two teams.
    
    Returns:
        Dictionary with prediction probabilities
    """
    if team1 not in team_stats or team2 not in team_stats:
        return None
    
    # Create feature vector
    features = np.array([[
        team_stats[team1]['win_rate'],
        team_stats[team1]['goals_per_match'],
        team_stats[team1]['goals_conceded_per_match'],
        team_stats[team1]['goal_difference_per_match'],
        team_stats[team2]['win_rate'],
        team_stats[team2]['goals_per_match'],
        team_stats[team2]['goals_conceded_per_match'],
        team_stats[team2]['goal_difference_per_match'],
        1 if neutral else 0
    ]])
    
    # Scale and predict
    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)[0]
    probabilities = model.predict_proba(features_scaled)[0]
    
    outcome_map = {0: f'{team2} Win', 1: 'Draw', 2: f'{team1} Win'}
    
    return {
        'prediction': outcome_map[prediction],
        'team1_win_prob': probabilities[2] * 100,
        'draw_prob': probabilities[1] * 100,
        'team2_win_prob': probabilities[0] * 100
    }

# Example predictions for potential 2026 matchups
potential_matchups = [
    ('Brazil', 'Argentina'),
    ('France', 'Germany'),
    ('England', 'Spain'),
    ('Portugal', 'Netherlands'),
    ('Italy', 'Belgium'),
    ('Uruguay', 'Colombia'),
    ('Mexico', 'United States'),
    ('Japan', 'South Korea')
]

print("=== 2026 World Cup Match Predictions ===\n")

predictions_list = []

for team1, team2 in potential_matchups:
    result = predict_match(team1, team2, loaded_model, loaded_scaler, loaded_stats)
    
    if result:
        predictions_list.append({
            'Matchup': f'{team1} vs {team2}',
            'Prediction': result['prediction'],
            f'{team1} Win %': f"{result['team1_win_prob']:.1f}%",
            'Draw %': f"{result['draw_prob']:.1f}%",
            f'{team2} Win %': f"{result['team2_win_prob']:.1f}%"
        })
        
        print(f"{team1} vs {team2}")
        print(f"  Prediction: {result['prediction']}")
        print(f"  Probabilities: {team1} {result['team1_win_prob']:.1f}% | Draw {result['draw_prob']:.1f}% | {team2} {result['team2_win_prob']:.1f}%")
        print()

# Display as DataFrame
predictions_df = pd.DataFrame(predictions_list)
display(predictions_df)