In [4]:
# Football Goals Prediction Model
# Predicting probability of goals in second half based on first half performance

import pandas as pd
import numpy as np
import tensorflow as tf
# import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load the match database.
# The input file is 'final_db.csv', resolved relative to the working directory.
df = pd.read_csv('final_db.csv')

# Structural overview: overall size, then the column names.
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist(), sep="\n")

# Content overview: a sample of rows, the inferred dtypes, and the number of
# missing values per column. Each heading and its payload are emitted by a
# single print call, separated by a newline.
print("\nFirst few rows:", df.head(), sep="\n")

print("\nData types:", df.dtypes, sep="\n")

print("\nMissing values:", df.isnull().sum(), sep="\n")

# Keep only matches where both the half-time and the full-time goal totals
# are present; every target below is derived from those two columns.
required_totals = ['ft_goals', 'ht_goals']
df_clean = df.dropna(subset=required_totals)

print(f"\nData after cleaning: {df_clean.shape[0]} rows")

# Two scenarios, each with its own binary target:
#   HT total == 0  ->  target is 1 when the match finishes with >= 1 goal
#   HT total == 1  ->  target is 1 when the match finishes with >= 2 goals
ht_totals = df_clean['ht_goals']
df_ht0 = df_clean.loc[ht_totals.eq(0)].copy()
df_ht1 = df_clean.loc[ht_totals.eq(1)].copy()

# .copy() above makes these independent frames, so adding a column is safe.
df_ht0['target'] = df_ht0['ft_goals'].ge(1).astype(int)
df_ht1['target'] = df_ht1['ft_goals'].ge(2).astype(int)

# Report the size and positive rate of each scenario.
print(f"\nMatches with 0 HT goals: {len(df_ht0)}")
print(f"Of these, {df_ht0['target'].sum()} ({df_ht0['target'].mean():.2%}) had ≥1 FT goals")

print(f"\nMatches with 1 HT goal: {len(df_ht1)}")
print(f"Of these, {df_ht1['target'].sum()} ({df_ht1['target'].mean():.2%}) had ≥2 FT goals")

# Feature engineering function
def create_features(df):
    """Build model input features from raw match rows.

    The input frame is copied, so ``df`` itself is never modified.

    Args:
        df: DataFrame containing the columns ``tournament``, ``home-team``,
            ``away-team``, ``pre-match_odds_home``, ``pre-match_odds_draw``,
            ``pre-match_odds_away``, ``home_ht_goals`` and ``away_ht_goals``.

    Returns:
        A copy of ``df`` in which the odds and half-time goal columns are
        coerced to numeric (unparseable values become NaN), ``tournament``
        has missing values replaced by ``'Unknown'``, and these columns are
        added: ``tournament_encoded``, ``home_team_encoded``,
        ``away_team_encoded``, ``home_prob``, ``draw_prob``, ``away_prob``,
        ``home_prob_norm``, ``draw_prob_norm``, ``away_prob_norm``,
        ``ht_goal_diff`` and ``team_strength_diff``.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    features_df = df.copy()

    # Handle missing tournaments before encoding so they share one category.
    features_df['tournament'] = features_df['tournament'].fillna('Unknown')

    # Integer-code the categorical columns. Codes follow the sorted order of
    # the distinct string values present in THIS frame, so codes produced by
    # separate calls are not comparable with each other.
    categorical_map = (
        ('tournament', 'tournament_encoded'),
        ('home-team', 'home_team_encoded'),
        ('away-team', 'away_team_encoded'),
    )
    for source_col, encoded_col in categorical_map:
        codes, _ = pd.factorize(features_df[source_col].astype(str), sort=True)
        features_df[encoded_col] = codes

    # Implied probabilities from the pre-match odds (when available).
    # NOTE(review): the 1/odds conversion assumes decimal odds - confirm
    # against the data source.
    odds_map = (
        ('pre-match_odds_home', 'home_prob'),
        ('pre-match_odds_draw', 'draw_prob'),
        ('pre-match_odds_away', 'away_prob'),
    )
    for odds_col, prob_col in odds_map:
        odds = pd.to_numeric(features_df[odds_col], errors='coerce')
        features_df[odds_col] = odds
        # Zero or negative odds would give inf / negative "probabilities",
        # which later break mean-imputation and scaling. Treat them as
        # missing instead.
        features_df[prob_col] = 1 / odds.where(odds > 0)

    # Normalize so the three outcome probabilities sum to 1 (removes the
    # bookmaker margin). Any missing component makes the whole row NaN.
    total_prob = features_df['home_prob'] + features_df['draw_prob'] + features_df['away_prob']
    features_df['home_prob_norm'] = features_df['home_prob'] / total_prob
    features_df['draw_prob_norm'] = features_df['draw_prob'] / total_prob
    features_df['away_prob_norm'] = features_df['away_prob'] / total_prob

    # Half-time score features.
    features_df['home_ht_goals'] = pd.to_numeric(features_df['home_ht_goals'], errors='coerce')
    features_df['away_ht_goals'] = pd.to_numeric(features_df['away_ht_goals'], errors='coerce')
    features_df['ht_goal_diff'] = features_df['home_ht_goals'] - features_df['away_ht_goals']

    # Team strength proxy: away minus home normalized win probability.
    features_df['team_strength_diff'] = features_df['away_prob_norm'] - features_df['home_prob_norm']

    return features_df

# Run feature engineering independently for each half-time scenario.
df_ht0_features, df_ht1_features = (
    create_features(scenario_df) for scenario_df in (df_ht0, df_ht1)
)

# Model input columns, grouped by origin. The concatenation order below is
# the column order used for the feature matrix.
categorical_feature_cols = ['tournament_encoded', 'home_team_encoded', 'away_team_encoded']
odds_feature_cols = ['home_prob_norm', 'draw_prob_norm', 'away_prob_norm', 'team_strength_diff']
halftime_feature_cols = ['home_ht_goals', 'away_ht_goals', 'ht_goal_diff']

feature_cols = categorical_feature_cols + odds_feature_cols + halftime_feature_cols

# Function to prepare data for modeling
def prepare_model_data(df_features, target_col='target', feature_columns=None):
    """Extract the feature matrix and target vector used for modelling.

    Args:
        df_features: DataFrame holding engineered features and the target.
        target_col: Name of the target column in ``df_features``.
        feature_columns: Candidate feature names. When ``None`` (the
            default) the module-level ``feature_cols`` list is used.

    Returns:
        A tuple ``(X, y, available_features)`` where ``X`` contains the
        usable feature columns with missing values replaced by the column
        mean, ``y`` is the matching target Series, and
        ``available_features`` lists the column names of ``X`` in order.
        Rows whose target is missing are dropped from both ``X`` and ``y``.

    Raises:
        KeyError: if ``target_col`` is not a column of ``df_features``.
    """
    candidates = feature_cols if feature_columns is None else feature_columns

    # Select features that exist AND have data. A column that is entirely
    # NaN has a NaN mean, so the mean-fill below could not repair it and the
    # row mask would then discard every sample.
    available_features = [
        col for col in candidates
        if col in df_features.columns and df_features[col].notna().any()
    ]

    X = df_features[available_features].copy()
    y = df_features[target_col].copy()

    # Fill missing feature values with the per-column mean.
    X = X.fillna(X.mean())

    # Drop rows that still contain a NaN feature or have no target.
    mask = ~(X.isna().any(axis=1) | y.isna())

    return X[mask], y[mask], available_features

# Build the feature matrix, target vector and usable-feature list for each
# half-time scenario.
X_ht0, y_ht0, features_ht0 = prepare_model_data(df_ht0_features, target_col='target')
X_ht1, y_ht1, features_ht1 = prepare_model_data(df_ht1_features, target_col='target')

# Report sample and feature counts for both models in a single call.
print(
    f"\nHT=0 model: {X_ht0.shape[0]} samples, {X_ht0.shape[1]} features",
    f"HT=1 model: {X_ht1.shape[0]} samples, {X_ht1.shape[1]} features",
    sep="\n",
)

# Function to build and compile the model (training happens at the call sites)
def create_model(input_dim):
    """Build and compile a feed-forward binary classifier.

    The network is three ReLU hidden layers (128, 64, 32 units), each
    followed by dropout (0.3, 0.2, 0.1), and a single sigmoid output unit.
    It is compiled with the Adam optimizer, binary cross-entropy loss and
    an accuracy metric. The model is returned untrained.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Imported locally because the file-level imports do not include Input.
    from keras.layers import Input

    model = Sequential([
        # Declare the input shape explicitly instead of passing input_dim to
        # the first Dense layer, which Keras 3 deprecates with a warning.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# Train model for HT=0 scenario
if len(X_ht0) > 100:  # Ensure we have enough data
    # Stratified 80/20 hold-out split; random_state fixes the partition.
    X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
        X_ht0, y_ht0, test_size=0.2, random_state=42, stratify=y_ht0
    )

    # Fit the scaler on the training rows only, then apply the same
    # transform to the test rows so no test statistics leak into training.
    scaler_0 = StandardScaler()
    X_train_0_scaled = scaler_0.fit_transform(X_train_0)
    X_test_0_scaled = scaler_0.transform(X_test_0)

    model_ht0 = create_model(X_train_0_scaled.shape[1])

    # A further 20% of the training rows is held back by Keras for
    # per-epoch validation metrics.
    history_0 = model_ht0.fit(
        X_train_0_scaled, y_train_0,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # Evaluate HT=0 model.
    # Run inference once and derive the hard labels from the probabilities,
    # rather than calling predict() twice on the same data.
    y_proba_0 = model_ht0.predict(X_test_0_scaled)
    y_pred_0 = (y_proba_0 > 0.5).astype(int)

    print("\n=== HT=0 Model Results ===")
    print(f"Accuracy: {(y_pred_0.flatten() == y_test_0).mean():.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_0, y_pred_0))

# Train model for HT=1 scenario
if len(X_ht1) > 100:  # Ensure we have enough data
    # Stratified 80/20 hold-out split; random_state fixes the partition.
    X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
        X_ht1, y_ht1, test_size=0.2, random_state=42, stratify=y_ht1
    )

    # Fit the scaler on the training rows only, then apply the same
    # transform to the test rows so no test statistics leak into training.
    scaler_1 = StandardScaler()
    X_train_1_scaled = scaler_1.fit_transform(X_train_1)
    X_test_1_scaled = scaler_1.transform(X_test_1)

    model_ht1 = create_model(X_train_1_scaled.shape[1])

    # A further 20% of the training rows is held back by Keras for
    # per-epoch validation metrics.
    history_1 = model_ht1.fit(
        X_train_1_scaled, y_train_1,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # Evaluate HT=1 model.
    # Run inference once and derive the hard labels from the probabilities,
    # rather than calling predict() twice on the same data.
    y_proba_1 = model_ht1.predict(X_test_1_scaled)
    y_pred_1 = (y_proba_1 > 0.5).astype(int)

    print("\n=== HT=1 Model Results ===")
    print(f"Accuracy: {(y_pred_1.flatten() == y_test_1).mean():.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_1, y_pred_1))

# High-confidence prediction analysis
def analyze_high_confidence_predictions(model, X_test_scaled, y_test, y_proba, threshold=0.9):
    """Report accuracy on the subset of predictions the model is sure about.

    A prediction counts as high-confidence when its probability is above
    ``threshold`` or below ``1 - threshold``. Within that subset the
    predicted label is 1 when the probability exceeds 0.5.

    Args:
        model: Unused; kept so existing call sites keep working.
        X_test_scaled: Unused; kept so existing call sites keep working.
        y_test: True binary labels (list, ndarray or Series), one per sample.
        y_proba: Predicted probabilities, either 1-D or shaped ``(n, 1)``.
        threshold: Confidence cut-off, 0.9 by default.

    Returns:
        ``(high_conf_mask, accuracy)`` where ``high_conf_mask`` is a boolean
        ndarray over the samples, or ``(None, 0)`` when no prediction meets
        the threshold.
    """
    # Normalise both inputs to flat ndarrays so lists, Series with arbitrary
    # indexes and (n, 1) model outputs are all indexed positionally.
    proba = np.asarray(y_proba, dtype=float).ravel()
    actual = np.asarray(y_test).ravel()

    high_conf_mask = (proba > threshold) | (proba < (1-threshold))

    if high_conf_mask.sum() > 0:
        high_conf_pred = (proba[high_conf_mask] > 0.5).astype(int)
        high_conf_actual = actual[high_conf_mask]

        accuracy = (high_conf_pred == high_conf_actual).mean()

        print(f"\nHigh Confidence Predictions (>{threshold:.0%} or <{1-threshold:.0%}):")
        print(f"Number of high-confidence predictions: {high_conf_mask.sum()}")
        print(f"Percentage of total: {high_conf_mask.mean():.2%}")
        print(f"Accuracy on high-confidence predictions: {accuracy:.4f}")

        return high_conf_mask, accuracy
    else:
        print(f"\nNo predictions with >{threshold:.0%} confidence found")
        return None, 0

# Run the high-confidence breakdown for each scenario. The size guards
# mirror the ones used for training, so the models and test arrays
# referenced here exist whenever a branch executes.
if len(X_ht0) > 100:
    print("\n" + "="*50, "HIGH-CONFIDENCE ANALYSIS - HT=0 MODEL", sep="\n")
    high_conf_0, acc_0 = analyze_high_confidence_predictions(
        model_ht0,
        X_test_0_scaled,
        y_test_0,
        y_proba_0,
    )

if len(X_ht1) > 100:
    print("\n" + "="*50, "HIGH-CONFIDENCE ANALYSIS - HT=1 MODEL", sep="\n")
    high_conf_1, acc_1 = analyze_high_confidence_predictions(
        model_ht1,
        X_test_1_scaled,
        y_test_1,
        y_proba_1,
    )

# Feature importance analysis (using permutation importance approximation)
def analyze_feature_importance(model, X_test_scaled, y_test, feature_names):
    """Rank features by how much the score drops when each one is shuffled.

    For every feature, its column is randomly permuted (all other columns
    stay intact) and the model is re-evaluated; the importance is the
    baseline score minus the permuted score. The score is the second entry
    returned by ``model.evaluate`` (accuracy for models built by
    ``create_model`` in this file). The top ten entries are printed.

    The permutation uses NumPy's global random state, so results vary
    between runs unless the caller seeds it.

    Args:
        model: Object exposing ``evaluate(X, y, verbose=0)`` that returns a
            sequence whose second element is the score.
        X_test_scaled: 2-D feature matrix (ndarray or DataFrame). It is not
            modified.
        y_test: Labels matching the rows of ``X_test_scaled``.
        feature_names: Column names, in the column order of
            ``X_test_scaled``.

    Returns:
        List of ``(feature_name, importance)`` tuples sorted by absolute
        importance, largest first.
    """
    # Positional column indexing below needs an ndarray; a DataFrame would
    # raise on X[:, i].
    X_base = np.asarray(X_test_scaled)
    base_score = model.evaluate(X_base, y_test, verbose=0)[1]

    importance_scores = []
    for i, _ in enumerate(feature_names):
        X_permuted = X_base.copy()
        X_permuted[:, i] = np.random.permutation(X_permuted[:, i])
        permuted_score = model.evaluate(X_permuted, y_test, verbose=0)[1]
        importance_scores.append(base_score - permuted_score)

    # Sort by magnitude: a large negative value is as noteworthy as a large
    # positive one.
    feature_importance = sorted(
        zip(feature_names, importance_scores),
        key=lambda item: abs(item[1]),
        reverse=True,
    )

    print("\nFeature Importance (by accuracy drop when shuffled):")
    for feature, importance in feature_importance[:10]:
        print(f"{feature}: {importance:.4f}")

    return feature_importance

# Permutation-style feature ranking for each scenario. The size guards
# mirror the ones used for training, so the models and test arrays
# referenced here exist whenever a branch executes.
if len(X_ht0) > 100:
    print("\n" + "="*50, "FEATURE IMPORTANCE - HT=0 MODEL", sep="\n")
    analyze_feature_importance(
        model_ht0,
        X_test_0_scaled,
        y_test_0,
        features_ht0,
    )

if len(X_ht1) > 100:
    print("\n" + "="*50, "FEATURE IMPORTANCE - HT=1 MODEL", sep="\n")
    analyze_feature_importance(
        model_ht1,
        X_test_1_scaled,
        y_test_1,
        features_ht1,
    )

# Closing report: a banner-framed heading followed by a fixed block of
# recommendations. The text is static; nothing in it is computed from the
# models trained above.
summary_banner = "="*80
print("\n" + summary_banner, "SUMMARY AND RECOMMENDATIONS", summary_banner, sep="\n")

summary_text = """
ANALYSIS SUMMARY:
1. The model attempts to predict goal probabilities based on available features
2. High-confidence predictions (>90% certainty) are analyzed separately
3. Feature importance shows which variables matter most

ACHIEVING 99% ACCURACY - REALISTIC ASSESSMENT:
- 99% accuracy on ALL predictions is extremely difficult with this data
- However, 99% accuracy on a SUBSET of high-confidence predictions is more feasible
- The key is identifying which games to predict vs. which to skip

STRATEGY FOR HIGH-CONFIDENCE PREDICTIONS:
1. Focus on matches where the model is >95% confident
2. This might be 10-20% of total matches, but with very high accuracy
3. Use ensemble methods and additional features for better confidence estimation

ADDITIONAL FEATURES THAT COULD IMPROVE ACCURACY:
(All easily accessible)

1. TEAM FORM FEATURES:
   - Last 5 games: goals scored/conceded per team
   - Recent head-to-head record
   - Home/away form separately

2. LEAGUE-SPECIFIC FEATURES:
   - Average goals per game in the league
   - League defensive/offensive strength ratings
   - Season stage (early/mid/late season affects motivation)

3. TEMPORAL FEATURES:
   - Day of week (weekend vs. weekday affects performance)
   - Month/season (weather effects, player fitness)
   - Days since last match (rest effects)

4. BETTING MARKET FEATURES:
   - Over/Under 2.5 goals odds
   - Both teams to score odds
   - Asian handicap lines

5. TEAM STATISTICS:
   - League position/points
   - Goals for/against ratios
   - Average possession percentage

6. MATCH CONTEXT:
   - Derby matches (local rivals)
   - Cup vs. league matches
   - Importance of match (relegation/promotion battles)

IMPLEMENTATION STRATEGY:
1. Start with current model to identify high-confidence predictions
2. Add team form features (most impactful)
3. Include league context and betting odds
4. Use ensemble of multiple models
5. Implement strict confidence thresholds (predict only top 10-20% most certain)

This approach could realistically achieve 90-99% accuracy on the subset of matches 
where the model is most confident, while avoiding predictions on uncertain matches.
"""
print(summary_text)

Dataset shape: (349, 15)

Column names:
['date', 'log_time', 'tournament', 'title', 'home-team', 'away-team', 'pre-match_odds_home', 'pre-match_odds_draw', 'pre-match_odds_away', 'home_ht_goals', 'away_ht_goals', 'ht_goals', 'home_ft_goals', 'away_ft_goals', 'ft_goals']

First few rows:
       date log_time tournament                                title  \
0  07-09-25    09:25        NaN     Tuen Mun SA vs Wong Tai Sin DRSC   
1  07-09-25    09:25        NaN  St. Joseph's Football Club vs Qi Yi   
2  07-09-25    09:25        NaN                 Tsun Tat vs Ornament   
3  07-09-25    09:30        NaN              Yuen Long FC vs Sha Tin   
4  07-09-25    09:52        NaN            Poland SRL vs Finland SRL   

                    home-team          away-team  pre-match_odds_home  \
0                 Tuen Mun SA  Wong Tai Sin DRSC                  NaN   
1  St. Joseph's Football Club              Qi Yi                  NaN   
2                    Tsun Tat           Ornament            

  bias_constraint=None,


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.3642 - loss: 0.7393 - val_accuracy: 0.5600 - val_loss: 0.6867
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5987 - loss: 0.6765 - val_accuracy: 0.7600 - val_loss: 0.6420
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6935 - loss: 0.6345 - val_accuracy: 0.7200 - val_loss: 0.6100
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7614 - loss: 0.6224 - val_accuracy: 0.7600 - val_loss: 0.5891
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7468 - loss: 0.5924 - val_accuracy: 0.7600 - val_loss: 0.5757
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7717 - loss: 0.5675 - val_accuracy: 0.7600 - val_loss: 0.5653
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0

  bias_constraint=None,


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.6523 - loss: 0.6525 - val_accuracy: 0.9333 - val_loss: 0.5193
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7990 - loss: 0.5580 - val_accuracy: 0.9333 - val_loss: 0.4300
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7927 - loss: 0.5415 - val_accuracy: 0.9333 - val_loss: 0.3690
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7750 - loss: 0.5406 - val_accuracy: 0.9333 - val_loss: 0.3373
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8010 - loss: 0.5112 - val_accuracy: 0.9333 - val_loss: 0.3164
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7625 - loss: 0.5432 - val_accuracy: 0.9333 - val_loss: 0.3073
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0