In [6]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
import matplotlib.pyplot as plt

# Function to fetch football data from football-data.org
def fetch_football_data():
    url = "https://api.football-data.org/v4/competitions/PL/matches"  # Premier League
    headers = {"X-Auth-Token": "038a29ae77b040edb4aa9c03a7787346"}  # Get API key from football-data.org
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return pd.DataFrame(response.json()['matches'])
    else:
        print("Error fetching football data")
        return None

# Function to fetch sports betting odds
def fetch_betting_odds():
    url = "https://api.the-odds-api.com/v4/sports/soccer_epl/odds"
    params = {"apiKey": "YOUR_ODDS_API_KEY", "regions": "eu", "markets": "h2h"}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return pd.DataFrame(response.json())
    else:
        print("Error fetching betting odds")
        return None

# Fetch data
football_data = fetch_football_data()
betting_odds = fetch_betting_odds()

# Merge datasets on match ID
if football_data is not None and betting_odds is not None:
    data = pd.merge(football_data, betting_odds, on="id", how="inner")
else:
    raise ValueError("Data fetching failed.")

# Preprocessing
data = data[['homeTeam', 'awayTeam', 'score.fullTime.home', 'score.fullTime.away', 'odds']].dropna()
data['match_result'] = np.where(data['score.fullTime.home'] > data['score.fullTime.away'], 'H',
                                np.where(data['score.fullTime.home'] < data['score.fullTime.away'], 'A', 'D'))

# Encode categorical labels
label_encoder = LabelEncoder()
data['match_result_encoded'] = label_encoder.fit_transform(data['match_result'])

# Standardize numerical features
scaler = StandardScaler()
data[['score.fullTime.home', 'score.fullTime.away', 'odds']] = scaler.fit_transform(data[['score.fullTime.home', 'score.fullTime.away', 'odds']])

# Create training data
X = data[['score.fullTime.home', 'score.fullTime.away', 'odds']].values.reshape(-1, 3, 1)  # Reshape for CNN input
y = data['match_result_encoded'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN Model
model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(3, 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')  # 3 classes: Home Win, Away Win, Draw
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=50, batch_size=8, validation_data=(X_test, y_test))

# Plot accuracy
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.legend()
plt.show()

# Model evaluation
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import requests
from io import StringIO
import datetime

# Data Retrieval Functions
def fetch_football_data(league="EPL", seasons=5):
    """
    Fetch historical football match data from Football-Data.co.uk
    Args:
        league (str): League code (EPL, LaLiga, Bundesliga, etc.)
        seasons (int): Number of past seasons to fetch
    Returns:
        pd.DataFrame: Combined match data
    """
    base_url = "https://www.football-data.co.uk/mmz4281/"
    current_year = datetime.datetime.now().year
    dfs = []
    
    for i in range(seasons):
        season = f"{current_year - i - 1}{str(current_year - i)[-2:]}"
        url = f"{base_url}{season}/{league}.csv"
        try:
            response = requests.get(url)
            response.raise_for_status()
            df = pd.read_csv(StringIO(response.text))
            df['Season'] = season
            dfs.append(df)
        except Exception as e:
            print(f"Failed to fetch {league} data for season {season}: {e}")
    
    if not dfs:
        raise ValueError("No data could be fetched. Please check league code or try later.")
    
    return pd.concat(dfs, ignore_index=True)

def preprocess_data(df):
    """
    Preprocess football match data for modeling
    Args:
        df (pd.DataFrame): Raw match data
    Returns:
        pd.DataFrame: Processed data with features and targets
    """
    # Basic cleaning
    df = df.dropna(subset=['FTHG', 'FTAG', 'FTR'])
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
    df = df.dropna(subset=['Date'])
    
    # Create target variable (1=HomeWin, 0=Draw, -1=AwayWin)
    df['Result'] = df['FTR'].map({'H': 1, 'D': 0, 'A': -1})
    
    # Feature engineering
    df['AvgH'] = (df['BbAvH'] + df['BbAvD'] + df['BbAvA']) / 3
    df['AvgD'] = df['BbAvD']
    df['AvgA'] = df['BbAvA']
    
    # Create moving averages of team performance
    teams = pd.unique(np.concatenate([df['HomeTeam'], df['AwayTeam']]))
    
    for team in teams:
        # Home performance features
        home_games = df[df['HomeTeam'] == team].sort_values('Date')
        home_games['Home_GoalsScored_MA'] = home_games['FTHG'].rolling(5, min_periods=1).mean()
        home_games['Home_GoalsConceded_MA'] = home_games['FTAG'].rolling(5, min_periods=1).mean()
        
        # Away performance features
        away_games = df[df['AwayTeam'] == team].sort_values('Date')
        away_games['Away_GoalsScored_MA'] = away_games['FTAG'].rolling(5, min_periods=1).mean()
        away_games['Away_GoalsConceded_MA'] = away_games['FTHG'].rolling(5, min_periods=1).mean()
        
        # Merge features back
        df.loc[df['HomeTeam'] == team, ['Home_GoalsScored_MA', 'Home_GoalsConceded_MA']] = \
            home_games[['Home_GoalsScored_MA', 'Home_GoalsConceded_MA']].values
        df.loc[df['AwayTeam'] == team, ['Away_GoalsScored_MA', 'Away_GoalsConceded_MA']] = \
            away_games[['Away_GoalsScored_MA', 'Away_GoalsConceded_MA']].values
    
    # Additional features
    df['DaysSinceLastGame'] = df.groupby('HomeTeam')['Date'].diff().dt.days.fillna(14)
    df['EloDiff'] = df['HomeTeam'].map(calculate_elo) - df['AwayTeam'].map(calculate_elo)
    
    # Drop remaining NA values
    df = df.dropna()
    
    return df

def calculate_elo(team, df, initial_elo=1500, k=30):
    """
    Calculate Elo ratings for teams
    """
    elo_dict = {team: initial_elo for team in pd.unique(np.concatenate([df['HomeTeam'], df['AwayTeam']]))}
    elo_history = []
    
    for idx, row in df.sort_values('Date').iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']
        
        # Current Elo
        home_elo = elo_dict[home_team]
        away_elo = elo_dict[away_team]
        
        # Expected outcome
        expected_home = 1 / (1 + 10**((away_elo - home_elo) / 400))
        expected_away = 1 - expected_home
        
        # Actual outcome
        actual_home = 1 if row['FTR'] == 'H' else (0.5 if row['FTR'] == 'D' else 0)
        actual_away = 1 - actual_home
        
        # Update Elo
        elo_dict[home_team] = home_elo + k * (actual_home - expected_home)
        elo_dict[away_team] = away_elo + k * (actual_away - expected_away)
        
        elo_history.append({'Date': row['Date'], 'HomeTeam': home_team, 'AwayTeam': away_team,
                           'HomeElo': home_elo, 'AwayElo': away_elo})
    
    elo_df = pd.DataFrame(elo_history)
    return elo_df[elo_df['Team'] == team].iloc[-1]['Elo'] if team in elo_dict else initial_elo

# Model Architecture
def build_cnn_model(input_shape):
    """
    Build CNN model for match outcome prediction
    Args:
        input_shape (tuple): Shape of input data
    Returns:
        tf.keras.Model: Compiled CNN model
    """
    model = Sequential([
        Conv1D(64, 3, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(2),
        Dropout(0.3),
        
        Conv1D(128, 3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(2),
        Dropout(0.3),
        
        Conv1D(256, 3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(2),
        Dropout(0.3),
        
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')  # 3 outputs: Home Win, Draw, Away Win
    ])
    
    # Custom loss function to minimize correlation with bookmaker odds
    def correlation_loss(y_true, y_pred):
        # Standard MSE loss
        mse_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        
        # Correlation penalty (we want our predictions to differ from bookmaker odds)
        bookie_probs = y_true[:, 3:]  # Assuming bookie odds are passed as additional features
        correlation = tfp.stats.correlation(y_pred, bookie_probs, sample_axis=0)
        
        # Weighted combination
        return mse_loss - 0.3 * correlation  # Negative sign to minimize correlation
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

# Profitability Optimization
def calculate_profitability(model, X_test, y_test, odds_data):
    """
    Calculate potential profitability of model predictions
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test labels
        odds_data: Bookmaker odds data
    Returns:
        dict: Profitability metrics
    """
    predictions = model.predict(X_test)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test, axis=1)
    
    # Get odds for predicted outcomes
    home_odds = odds_data['AvgH']
    draw_odds = odds_data['AvgD']
    away_odds = odds_data['AvgA']
    
    # Calculate returns for different strategies
    flat_betting = np.where(predicted_classes == true_classes, 
                           [home_odds, draw_odds, away_odds][predicted_classes] - 1,
                           -1)
    
    value_betting = np.where(predictions.max(axis=1) > 0.6,  # Only bet when confident
                           flat_betting,
                           0)
    
    return {
        'flat_betting_roi': np.mean(flat_betting),
        'value_betting_roi': np.mean(value_betting),
        'accuracy': accuracy_score(true_classes, predicted_classes)
    }

# Main Execution
def main():
    # Fetch and preprocess data
    print("Fetching data...")
    raw_data = fetch_football_data(league="E0", seasons=5)  # E0 is Premier League
    
    print("Preprocessing data...")
    processed_data = preprocess_data(raw_data)
    
    # Prepare features and targets
    features = processed_data[['Home_GoalsScored_MA', 'Home_GoalsConceded_MA',
                              'Away_GoalsScored_MA', 'Away_GoalsConceded_MA',
                              'DaysSinceLastGame', 'EloDiff', 'AvgH', 'AvgD', 'AvgA']]
    
    # One-hot encode targets
    targets = pd.get_dummies(processed_data['Result'].map({1: 'Home', 0: 'Draw', -1: 'Away'}))
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42, shuffle=False)
    
    # Scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Reshape for CNN (samples, timesteps, features)
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    
    # Build and train model
    print("Building model...")
    model = build_cnn_model((X_train.shape[1], 1))
    
    print("Training model...")
    history = model.fit(X_train, y_train,
                       epochs=50,
                       batch_size=32,
                       validation_split=0.2,
                       verbose=1)
    
    # Evaluate model
    print("Evaluating model...")
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {test_acc:.4f}")
    
    # Calculate profitability
    odds_data = processed_data.iloc[X_test.index][['AvgH', 'AvgD', 'AvgA']]
    profit_metrics = calculate_profitability(model, X_test, y_test, odds_data)
    
    print("\nProfitability Metrics:")
    print(f"Flat Betting ROI: {profit_metrics['flat_betting_roi']:.2%}")
    print(f"Value Betting ROI: {profit_metrics['value_betting_roi']:.2%}")
    print(f"Accuracy: {profit_metrics['accuracy']:.2%}")
    
    # Plot training history
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    main()

Fetching data...
Failed to fetch E0 data for season 202425: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Failed to fetch E0 data for season 202324: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Failed to fetch E0 data for season 202223: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Failed to fetch E0 data for season 202122: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Failed to fetch E0 data for season 202021: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


ValueError: No data could be fetched. Please check league code or try later.