In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# List of file paths
file_paths = [
    '/content/drive/My Drive/Sports Modeling/football/pl-23.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-22.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-21.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-20.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-19.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-18.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-17.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-16.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-15.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-14.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-13.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-12.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-11.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-10.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-09.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-08.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-07.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-06.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-05.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-04.csv',
    '/content/drive/My Drive/Sports Modeling/football/pl-03.csv'
]

# Initialize an empty list to hold the DataFrames
dfs = []

# Function to parse dates with two possible formats
def parse_date(date_str):
    try:
        # Try parsing with four-digit year first
        return pd.to_datetime(date_str, format='%d/%m/%Y')
    except ValueError:
        try:
            # If that fails, try parsing with two-digit year
            return pd.to_datetime(date_str, format='%d/%m/%y')
        except ValueError:
            return pd.NaT

# Loop through the file paths, read each CSV into a DataFrame, standardize the 'Date' column format, and append it to the list
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, on_bad_lines='skip')
        # Standardize the 'Date' column format
        df['Date'] = df['Date'].apply(parse_date)
        dfs.append(df)
    except pd.errors.ParserError as e:
        print(f"Error parsing {file_path}: {e}")

# Concatenate all DataFrames in the list into one DataFrame, filling missing columns with NaN
merged_df = pd.concat(dfs, ignore_index=True, sort=False)

# Drop rows where 'Date' is NaT
merged_df = merged_df.dropna(subset=['Date'])


In [None]:
class_counts = merged_df['FTR'].value_counts()
print(class_counts)

FTR
H    3617
A    2341
D    1929
Name: count, dtype: int64


In [None]:
# Get the count of null values in each column
null_counts = merged_df.isnull().sum()

# Filter the columns with null values
null_counts = null_counts[null_counts > 0]

# Display the columns with their respective null value counts
print(null_counts)

# Optionally, save the null counts to a CSV file for further analysis
null_counts.to_csv('null_value_counts.csv', header=['null_counts'])

# If you want to download the file, use the following code
from google.colab import files
files.download('null_value_counts.csv')

Time      5987
BWH        670
BWD        670
BWA        670
IWH        184
          ... 
GBAH      7238
LBAHH     7264
LBAHA     7264
LBAH      7264
B365AH    7252
Length: 125, dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Calculate the threshold for dropping columns
threshold = len(merged_df) / 2  # 50% threshold

# Drop columns with more than 50% null values
merged_df = merged_df.dropna(thresh=threshold, axis=1)

In [None]:
# Get the count of null values in each column
null_counts = merged_df.isnull().sum()

# Filter the columns with null values
null_counts = null_counts[null_counts > 0]

# Get the list of columns with null values
columns_with_nulls = null_counts.index.tolist()

print(columns_with_nulls)

['BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'PSCH', 'PSCD', 'PSCA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'LBH', 'LBD', 'LBA']


### Handling Null values!

In [None]:
# List out columns with null values which can be worked on
columns_with_nulls = ['BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
                      'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'PSCH', 'PSCD', 'PSCA',
                      'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU',
                      'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh',
                      'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'LBH', 'LBD', 'LBA']

# Step 1: Group-Based Mean Imputation (based on HomeTeam and AwayTeam)
for column in columns_with_nulls:
    merged_df[column] = merged_df.groupby(['HomeTeam', 'AwayTeam'])[column].transform(lambda x: x.fillna(x.mean()))

# Step 2: League-Based Mean Imputation (based on Div)
for column in columns_with_nulls:
    merged_df[column] = merged_df.groupby('Div')[column].transform(lambda x: x.fillna(x.mean()))

# Step 3: Overall Median Imputation
for column in columns_with_nulls:
    merged_df[column].fillna(merged_df[column].median(), inplace=True)

In [None]:
# Identifying and removing columns with much more null values
columns_to_drop = merged_df.columns[merged_df.isnull().sum() > 1]

# Drop the identified columns from the DataFrame
merged_df = merged_df.drop(columns=columns_to_drop)

In [None]:
# Extract useful features from the 'Date' column
merged_df['DayOfWeek'] = merged_df['Date'].dt.dayofweek.astype(int)
merged_df['Month'] = merged_df['Date'].dt.month.astype(int)
merged_df['Year'] = merged_df['Date'].dt.year.astype(int)

# Drop the 'Date' column after extracting useful features
merged_df = merged_df.drop(columns=['Date'])

# Drop rows with any NaN values
merged_df = merged_df.dropna()

In [None]:
print(merged_df.columns.tolist())

['Div', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'PSCH', 'PSCD', 'PSCA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'LBH', 'LBD', 'LBA', 'DayOfWeek', 'Month', 'Year']


In [None]:
# Remove 'Referee' and 'Div' columns from the DataFrame
merged_df = merged_df.drop(columns=['Referee', 'Div'])

In [None]:
# Display columns that are not numerical
non_numerical_data = merged_df.select_dtypes(exclude=['int64', 'float64'])

# Display the first few rows of these non-numerical columns
print(non_numerical_data.head())

      HomeTeam       AwayTeam FTR HTR
0      Burnley       Man City   A   A
1      Arsenal  Nott'm Forest   H   H
2  Bournemouth       West Ham   D   D
3     Brighton          Luton   H   H
4      Everton         Fulham   A   D


In [None]:
merged_df

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,...,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,DayOfWeek,Month,Year
0,Burnley,Man City,0.0,3.0,A,0.0,2.0,A,6.0,17.0,...,2.520000,2.352000,1.754000,1.702000,8.375000,4.812500,1.412500,4,8,2023
1,Arsenal,Nott'm Forest,2.0,1.0,H,2.0,0.0,H,15.0,6.0,...,2.008001,1.936057,2.164899,2.065840,2.621779,3.770915,4.544944,5,8,2023
2,Bournemouth,West Ham,1.0,1.0,D,0.0,0.0,D,14.0,16.0,...,1.952500,1.902500,2.077500,2.012500,2.383333,3.266667,3.066667,5,8,2023
3,Brighton,Luton,4.0,1.0,H,1.0,0.0,H,27.0,9.0,...,2.008001,1.936057,2.164899,2.065840,2.621779,3.770915,4.544944,5,8,2023
4,Everton,Fulham,0.0,1.0,A,0.0,0.0,D,19.0,9.0,...,1.944000,1.887000,2.566000,2.358000,1.644000,3.590909,5.439091,5,8,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7883,Leicester,Portsmouth,3.0,1.0,H,2.0,0.0,H,13.0,14.0,...,2.008001,1.936057,2.164899,2.065840,2.380000,3.200000,2.600000,5,5,2004
7884,Man United,Chelsea,1.0,1.0,D,0.0,1.0,A,12.0,11.0,...,1.915000,1.847857,2.118571,2.027857,2.200000,3.200000,2.800000,5,5,2004
7885,Middlesbrough,Man City,2.0,1.0,H,2.0,1.0,H,12.0,14.0,...,1.986000,1.912000,2.050000,1.962000,1.800000,3.250000,3.750000,5,5,2004
7886,Southampton,Aston Villa,1.0,1.0,D,1.0,1.0,D,18.0,14.0,...,1.990000,1.937500,1.975000,1.932500,2.800000,3.200000,2.200000,5,5,2004


### Feature Engineering for Winner prediction model

In [None]:
def feature_engineering(df):
    # Ensure day, month, and year columns are available
    if 'DayOfWeek' not in df.columns or 'Month' not in df.columns or 'Year' not in df.columns:
        raise ValueError("DayOfWeek, Month, and Year columns must be present in the DataFrame.")

    # Sort by team and then by DayOfWeek, Month, and Year
    df = df.sort_values(by=['HomeTeam', 'Year', 'Month', 'DayOfWeek'])

    # Win Probabilities
    df['HomeWinProb'] = df['B365H'] / (df['B365H'] + df['B365D'] + df['B365A'])
    df['AwayWinProb'] = df['B365A'] / (df['B365H'] + df['B365D'] + df['B365A'])
    df['DrawProb'] = df['B365D'] / (df['B365H'] + df['B365D'] + df['B365A'])

    # Function to compute rolling average with proper index alignment
    def compute_rolling_average(group, column, window):
        return group[column].shift().rolling(window=window, min_periods=1).mean()

    # Apply rolling average calculations
    df['AvgHomeGoalsScored'] = df.groupby('HomeTeam').apply(compute_rolling_average, 'FTHG', 5).reset_index(level=0, drop=True)
    df['AvgHomeGoalsConceded'] = df.groupby('HomeTeam').apply(compute_rolling_average, 'FTAG', 5).reset_index(level=0, drop=True)
    df['AvgAwayGoalsScored'] = df.groupby('AwayTeam').apply(compute_rolling_average, 'FTAG', 5).reset_index(level=0, drop=True)
    df['AvgAwayGoalsConceded'] = df.groupby('AwayTeam').apply(compute_rolling_average, 'FTHG', 5).reset_index(level=0, drop=True)

    # Home/Away Performance
    df['HomeTeamPerformance'] = df.groupby('HomeTeam')['FTR'].apply(lambda x: x.shift().map({'H': 2, 'D': 1, 'A': 0}).rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)
    df['AwayTeamPerformance'] = df.groupby('AwayTeam')['FTR'].apply(lambda x: x.shift().map({'H': 2, 'D': 1, 'A': 0}).rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)

    # Form Streaks
    df['HomeWinStreak'] = df.groupby('HomeTeam')['FTR'].apply(lambda x: (x.shift() == 'H').rolling(window=5, min_periods=1).sum()).reset_index(level=0, drop=True)
    df['AwayWinStreak'] = df.groupby('AwayTeam')['FTR'].apply(lambda x: (x.shift() == 'A').rolling(window=5, min_periods=1).sum()).reset_index(level=0, drop=True)

    # Goal Differences
    df['HomeGoalDiff'] = df['AvgHomeGoalsScored'] - df['AvgHomeGoalsConceded']
    df['AwayGoalDiff'] = df['AvgAwayGoalsScored'] - df['AvgAwayGoalsConceded']

    # Shot Efficiency
    df['HomeShotEff'] = df.groupby('HomeTeam').apply(lambda x: (x['HST'].shift() / x['HS'].shift()).rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)
    df['AwayShotEff'] = df.groupby('AwayTeam').apply(lambda x: (x['AST'].shift() / x['AS'].shift()).rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)

    # Fouls and Cards
    df['AvgHomeFouls'] = df.groupby('HomeTeam')['HF'].apply(lambda x: x.shift().rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)
    df['AvgAwayFouls'] = df.groupby('AwayTeam')['AF'].apply(lambda x: x.shift().rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)
    df['AvgHomeCards'] = df.groupby('HomeTeam')['HY'].apply(lambda x: x.shift().rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)
    df['AvgAwayCards'] = df.groupby('AwayTeam')['AY'].apply(lambda x: x.shift().rolling(window=5, min_periods=1).mean()).reset_index(level=0, drop=True)

    # Drop rows where 'HomeTeam' or 'AwayTeam' is NaN
    df = df.dropna(subset=['HomeTeam', 'AwayTeam'])

    # Historical Head-to-Head
    df['Matchup'] = df.apply(lambda x: '_'.join(sorted([x['HomeTeam'], x['AwayTeam']])), axis=1)
    df['HomeWinsInMatchup'] = df.groupby('Matchup')['FTR'].apply(lambda x: (x.shift() == 'H').rolling(window=5, min_periods=1).sum()).reset_index(level=0, drop=True)
    df['AwayWinsInMatchup'] = df.groupby('Matchup')['FTR'].apply(lambda x: (x.shift() == 'A').rolling(window=5, min_periods=1).sum()).reset_index(level=0, drop=True)

    # Betting Odds Features: Average and max betting odds for home, draw, and away outcomes
    df['AvgHomeOdds'] = df[['B365H', 'BWH', 'IWH', 'WHH', 'VCH', 'LBH']].mean(axis=1)
    df['AvgDrawOdds'] = df[['B365D', 'BWD', 'IWD', 'WHD', 'VCD', 'LBD']].mean(axis=1)
    df['AvgAwayOdds'] = df[['B365A', 'BWA', 'IWA', 'WHA', 'VCA', 'LBA']].mean(axis=1)
    df['MaxHomeOdds'] = df[['B365H', 'BWH', 'IWH', 'WHH', 'VCH', 'LBH']].max(axis=1)
    df['MaxDrawOdds'] = df[['B365D', 'BWD', 'IWD', 'WHD', 'VCD', 'LBD']].max(axis=1)
    df['MaxAwayOdds'] = df[['B365A', 'BWA', 'IWA', 'WHA', 'VCA', 'LBA']].max(axis=1)

    # Odds changes before the match
    df['HomeOddsChange'] = df['B365H'] - df['WHH']
    df['DrawOddsChange'] = df['B365D'] - df['WHD']
    df['AwayOddsChange'] = df['B365A'] - df['WHA']

    # Match Context Features: Whether the match is a derby or rivalry match
    # Assuming you have a list of derbies/rivalries
    derbies = [
        ('Liverpool', 'Everton'),
        ('Arsenal', 'Tottenham'),
        ('Manchester United', 'Manchester City'),
        ('Chelsea', 'Fulham'),
        ('Newcastle United', 'Sunderland'),
        ('Aston Villa', 'Birmingham City'),
        ('Southampton', 'Portsmouth'),
        ('Wolverhampton Wanderers', 'West Bromwich Albion'),
        ('Brighton', 'Crystal Palace')]

    df['IsDerby'] = df.apply(lambda x: 1 if (x['HomeTeam'], x['AwayTeam']) in derbies or (x['AwayTeam'], x['HomeTeam']) in derbies else 0, axis=1)

    return df


In [None]:
featured_merged_df=feature_engineering(merged_df)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Handle categorical features
label_encoders = {}
categorical_features = ['HomeTeam', 'AwayTeam', 'FTR', 'HTR']

for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    featured_merged_df[col] = label_encoders[col].fit_transform(featured_merged_df[col])

In [None]:
class_counts = merged_df['FTR'].value_counts()
print(class_counts)

FTR
H    3617
A    2341
D    1929
Name: count, dtype: int64


In [None]:
print(featured_merged_df.columns.tolist())

['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'PSCH', 'PSCD', 'PSCA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'LBH', 'LBD', 'LBA', 'DayOfWeek', 'Month', 'Year', 'TotalGoals', 'HomeGoalDiff', 'AwayGoalDiff', 'AvgHomeGoals', 'AvgAwayGoals', 'TotalShots', 'TotalShotsOnTarget', 'HomeShotEff', 'AwayShotEff', 'TotalFouls', 'TotalYellowCards', 'TotalRedCards', 'AvgHomeWinOdds', 'AvgDrawOdds', 'AvgAwayWinOdds', 'HomeWinProb', 'AwayWinProb', 'DrawProb', 'AvgHomeGoalsScored', 'AvgHomeGoalsConceded', 'AvgAwayGoalsScored', 'AvgAwayGoalsConceded', 'HomeTeamPerformance', 'AwayTeamPerformance', 'HomeWinStreak', 'AwayWinStreak', 'AvgHome

In [None]:
# lisitng out features that will be available before match and which are newly created

all_features = [
    'HomeTeam', 'AwayTeam', 'FTR', 'Year', 'Month', 'DayOfWeek',
    'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
    'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'LBH', 'LBD', 'LBA',
    'HomeWinProb', 'AwayWinProb', 'DrawProb',
    'AvgHomeGoalsScored', 'AvgHomeGoalsConceded', 'AvgAwayGoalsScored', 'AvgAwayGoalsConceded',
    'HomeTeamPerformance', 'AwayTeamPerformance',
    'HomeWinStreak', 'AwayWinStreak',
    'HomeGoalDiff', 'AwayGoalDiff',
    'HomeShotEff', 'AwayShotEff',
    'AvgHomeFouls', 'AvgAwayFouls', 'AvgHomeCards', 'AvgAwayCards',
    'HomeWinsInMatchup', 'AwayWinsInMatchup',
    'AvgHomeOdds', 'AvgDrawOdds', 'AvgAwayOdds', 'MaxHomeOdds', 'MaxDrawOdds', 'MaxAwayOdds',
    'HomeOddsChange', 'DrawOddsChange', 'AwayOddsChange',
    'IsDerby'
]

# final dataset with selected columns
final_df = featured_merged_df[all_features]

# Display the final dataset
print(final_df.head())


      HomeTeam  AwayTeam  FTR  Year  Month  DayOfWeek  B365H  B365D  B365A  \
7246         0         1    2  2003      8          2  1.286    5.0    8.5   
7578         0         1    2  2003      8          2  1.286    5.0    8.5   
7221         0        15    2  2003      8          5  1.400    3.8    8.0   
7553         0        15    2  2003      8          5  1.400    3.8    8.0   
7279         0        26    2  2003      9          4  1.530    3.5    5.5   

           BWH  ...  AvgHomeOdds  AvgDrawOdds  AvgAwayOdds  MaxHomeOdds  \
7246  1.412500  ...     1.326521     4.650521     8.626563     1.430625   
7578  1.412500  ...     1.326521     4.650521     8.626563     1.430625   
7221  1.496842  ...     1.394009     4.220175     7.679825     1.504211   
7553  1.496842  ...     1.394009     4.220175     7.679825     1.504211   
7279  1.387647  ...     1.459490     3.995196     6.651471     1.530000   

      MaxDrawOdds  MaxAwayOdds  HomeOddsChange  DrawOddsChange  \
7246     5.000

In [None]:
print(final_df.columns.tolist())

['HomeTeam', 'AwayTeam', 'FTR', 'Year', 'Month', 'DayOfWeek', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'LBH', 'LBD', 'LBA', 'HomeWinProb', 'AwayWinProb', 'DrawProb', 'AvgHomeGoalsScored', 'AvgHomeGoalsConceded', 'AvgAwayGoalsScored', 'AvgAwayGoalsConceded', 'HomeTeamPerformance', 'AwayTeamPerformance', 'HomeWinStreak', 'AwayWinStreak', 'HomeGoalDiff', 'AwayGoalDiff', 'HomeShotEff', 'AwayShotEff', 'AvgHomeFouls', 'AvgAwayFouls', 'AvgHomeCards', 'AvgAwayCards', 'HomeWinsInMatchup', 'AwayWinsInMatchup', 'AvgHomeOdds', 'AvgDrawOdds', 'AvgAwayOdds', 'MaxHomeOdds', 'MaxDrawOdds', 'MaxAwayOdds', 'HomeOddsChange', 'DrawOddsChange', 'AwayOddsChange', 'IsDerby']


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization, LeakyReLU

In [None]:
# Get the count of null values in each column
null_counts = final_df.isnull().sum()

# Filter the columns with null values
null_counts = null_counts[null_counts > 0]

# Display the columns with their respective null value counts
print(null_counts)

AvgHomeGoalsScored      43
AvgHomeGoalsConceded    43
AvgAwayGoalsScored      43
AvgAwayGoalsConceded    43
HomeTeamPerformance     43
AwayTeamPerformance     43
HomeGoalDiff            43
AwayGoalDiff            43
HomeShotEff             43
AwayShotEff             43
AvgHomeFouls            43
AvgAwayFouls            43
AvgHomeCards            43
AvgAwayCards            43
dtype: int64


In [None]:
final_df = final_df.dropna()

In [None]:
final_df

Unnamed: 0,HomeTeam,AwayTeam,FTR,Year,Month,DayOfWeek,B365H,B365D,B365A,BWH,...,AvgHomeOdds,AvgDrawOdds,AvgAwayOdds,MaxHomeOdds,MaxDrawOdds,MaxAwayOdds,HomeOddsChange,DrawOddsChange,AwayOddsChange,IsDerby
7578,0,1,2,2003,8,2,1.286,5.0,8.50,1.412500,...,1.326521,4.650521,8.626563,1.430625,5.000000,10.000000,0.006,0.5,0.50,0
7553,0,15,2,2003,8,5,1.400,3.8,8.00,1.496842,...,1.394009,4.220175,7.679825,1.504211,4.557895,8.000000,0.120,-0.7,0.00,0
7611,0,26,2,2003,9,4,1.530,3.5,5.50,1.387647,...,1.459490,3.995196,6.651471,1.530000,4.850000,9.258824,0.030,0.1,-0.50,0
7591,0,29,1,2003,9,5,1.280,4.5,10.00,1.310000,...,1.281000,4.711667,9.435000,1.354000,5.000000,10.000000,0.060,-0.5,0.50,0
7632,0,12,2,2003,10,5,2.200,3.2,3.00,2.588947,...,2.285877,3.302193,3.043246,2.626316,3.544737,3.243158,0.000,0.0,0.20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,42,6,0,2024,4,2,2.700,3.6,2.50,2.650000,...,2.519741,3.645152,3.237491,2.700000,3.770915,5.100000,0.000,-0.1,0.12,0
308,42,40,0,2024,4,5,2.500,3.6,2.70,2.450000,...,2.427976,3.442262,2.799405,2.500000,3.600000,3.071429,0.100,0.0,0.00,0
325,42,0,0,2024,4,5,8.500,4.5,1.40,7.500000,...,7.267500,4.472500,1.473667,8.500000,5.000000,1.910000,0.500,-0.5,0.05,0
340,42,22,2,2024,4,5,1.950,3.8,3.60,2.000000,...,2.185177,3.744340,3.802942,2.639280,3.800000,4.544944,0.000,0.0,0.10,0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Define feature sets
all_features = [
    'AvgHomeGoalsScored', 'AvgHomeGoalsConceded', 'AvgAwayGoalsScored', 'AvgAwayGoalsConceded',
    'HomeTeamPerformance', 'AwayTeamPerformance', 'HomeWinStreak', 'AwayWinStreak',
    'HomeGoalDiff', 'AwayGoalDiff', 'HomeShotEff', 'AwayShotEff', 'AvgHomeFouls', 'AvgAwayFouls',
    'AvgHomeCards', 'AvgAwayCards', 'HomeWinsInMatchup', 'AwayWinsInMatchup', 'HomeWinProb',
    'AwayWinProb', 'DrawProb', 'HomeTeam', 'AwayTeam','FTR',
    'HomeOddsChange', 'DrawOddsChange', 'AwayOddsChange', 'IsDerby', 'DayOfWeek', 'Month', 'Year']

# creatiing new dataframe with all new features
df_new = final_df[all_features]

# Sort the dataset by 'season_year', 'month', and 'day'
df_new_sorted = df_new.sort_values(by=['DayOfWeek', 'Month', 'Year'], ascending=[True, True, True])

# Split the data into training and validation sets (80% for training, 20% for validation)
train_val_data, test_data = train_test_split(df_new_sorted, test_size=0.05, shuffle=False)

# Further split the training and validation set into 80% training and 20% validation
train_data, val_data = train_test_split(train_val_data, test_size=0.2, shuffle=False)

# Step 4: Create the test dataset with only the before_game_features
Before_game_features = ['HomeTeam', 'AwayTeam','DayOfWeek', 'Month', 'Year']

test_data = test_data[Before_game_features]

# Displaying the dataset shapes to the user for verification
print("Training Data Shape:", train_data.shape)
print("Validation Data Shape:", val_data.shape)
print("Testing Data Shape:", test_data.shape)

Training Data Shape: (5928, 31)
Validation Data Shape: (1483, 31)
Testing Data Shape: (391, 5)


In [None]:
class_counts = final_df['FTR'].value_counts()
print(class_counts)

FTR
2    3564
0    2329
1    1909
Name: count, dtype: int64


### Detailed Steps

*   Train the comprehensive model with all features.
*   Extract and transfer weights for a new model with betting odds features.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.utils import class_weight

# Features and target for training and validation
X_train = train_data[all_features].drop(columns=['FTR'])  # Drop the target column from features
y_train = to_categorical(train_data['FTR'], num_classes=3)
X_val = val_data[all_features].drop(columns=['FTR'])  # Drop the target column from features
y_val = to_categorical(val_data['FTR'], num_classes=3)

# Normalize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Create the model
model = Sequential()
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))  # Add dropout to reduce overfitting
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Compute class weights for imbalanced datasets
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(np.argmax(y_train, axis=1)), y=np.argmax(y_train, axis=1))
class_weights = dict(enumerate(class_weights))

# Train the model using the training and validation sets
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], class_weight=class_weights)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy:.2f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Accuracy: 0.50


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.utils import class_weight
from keras.layers import MultiHeadAttention, LayerNormalization
import tensorflow as tf

# Define focal loss function
def focal_loss(gamma=2., alpha=.25):
    def focal_loss_fixed(y_true, y_pred):
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        y_true = tf.cast(y_true, tf.float32)
        alpha_t = y_true * alpha + (tf.keras.backend.ones_like(y_true) - y_true) * (1 - alpha)
        p_t = y_true * y_pred + (tf.keras.backend.ones_like(y_true) - y_true) * (tf.keras.backend.ones_like(y_true) - y_pred)
        fl = - alpha_t * tf.keras.backend.pow((tf.keras.backend.ones_like(y_true) - p_t), gamma) * tf.keras.backend.log(p_t)
        return tf.keras.backend.mean(fl)
    return focal_loss_fixed

# Features and target for training and validation
X_train = train_data[all_features].drop(columns=['FTR'])  # Drop the target column from features
y_train = to_categorical(train_data['FTR'], num_classes=3)
X_val = val_data[all_features].drop(columns=['FTR'])  # Drop the target column from features
y_val = to_categorical(val_data['FTR'], num_classes=3)

# Normalize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape input to be compatible with attention (if required)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

# Create the model
input_layer = Input(shape=(X_train.shape[1], 1))
attention = MultiHeadAttention(num_heads=8, key_dim=2)(input_layer, input_layer)
attention = Dropout(0.5)(attention)
attention = LayerNormalization(epsilon=1e-6)(attention)

# Flatten the attention output to match the shape required by Dense layers
flatten = Flatten()(attention)

# Adding Dense layers
dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(flatten)
dense = Dropout(0.5)(dense)
dense = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(dense)
dense = Dropout(0.5)(dense)
output = Dense(3, activation='softmax')(dense)

model = Model(inputs=input_layer, outputs=output)

# Compile the model with focal loss
model.compile(optimizer='adam', loss=focal_loss(gamma=2., alpha=.25), metrics=['accuracy'])

# Compute class weights for imbalanced datasets
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(np.argmax(y_train, axis=1)), y=np.argmax(y_train, axis=1))
class_weights = dict(enumerate(class_weights))

# Train the model using the training and validation sets
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], class_weight=class_weights)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy:.2f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Accuracy: 0.47


### Feature Engineering for Over/Under Model

In [None]:
def create_over_under_features(df):
    # Total Goals
    df['TotalGoals'] = df['FTHG'] + df['FTAG']

    # Goal Differences
    df['HomeGoalDiff'] = df['FTHG'] - df['FTAG']
    df['AwayGoalDiff'] = df['FTAG'] - df['FTHG']

    # Rolling Average Goals
    df['AvgHomeGoals'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
    df['AvgAwayGoals'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

    # Shots and Shots on Target
    df['TotalShots'] = df['HS'] + df['AS']
    df['TotalShotsOnTarget'] = df['HST'] + df['AST']

    # Efficiency
    df['HomeShotEff'] = df.apply(lambda row: row['HST'] / row['HS'] if row['HS'] != 0 else 0, axis=1)
    df['AwayShotEff'] = df.apply(lambda row: row['AST'] / row['AS'] if row['AS'] != 0 else 0, axis=1)

    # Fouls and Cards
    df['TotalFouls'] = df['HF'] + df['AF']
    df['TotalYellowCards'] = df['HY'] + df['AY']
    df['TotalRedCards'] = df['HR'] + df['AR']

    # Average Betting Odds
    df['AvgHomeWinOdds'] = df[['B365H', 'BWH', 'IWH', 'WHH', 'VCH', 'LBH']].mean(axis=1)
    df['AvgDrawOdds'] = df[['B365D', 'BWD', 'IWD', 'WHD', 'VCD', 'LBD']].mean(axis=1)
    df['AvgAwayWinOdds'] = df[['B365A', 'BWA', 'IWA', 'WHA', 'VCA', 'LBA']].mean(axis=1)

    # Drop rows with NaN values created by rolling window function
    df = df.dropna()

    return df


In [None]:
over_under_featured=create_over_under_features(merged_df)

In [None]:
print(over_under_featured.columns.tolist())

['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'PSCH', 'PSCD', 'PSCA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'LBH', 'LBD', 'LBA', 'DayOfWeek', 'Month', 'Year', 'TotalGoals', 'HomeGoalDiff', 'AwayGoalDiff', 'AvgHomeGoals', 'AvgAwayGoals', 'TotalShots', 'TotalShotsOnTarget', 'HomeShotEff', 'AwayShotEff', 'TotalFouls', 'TotalYellowCards', 'TotalRedCards', 'AvgHomeWinOdds', 'AvgDrawOdds', 'AvgAwayWinOdds']


In [None]:
# Selecting features that will be available before match and newly created features

relevant_features = [
    'HomeTeam', 'AwayTeam','AvgHomeGoals','FTR', 'AvgAwayGoals', 'HomeGoalDiff', 'AwayGoalDiff', 'TotalShots',
    'TotalShotsOnTarget', 'HomeShotEff', 'AwayShotEff', 'TotalFouls', 'TotalYellowCards',
    'TotalRedCards', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
    'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'LBH', 'LBD', 'LBA', 'AvgHomeWinOdds',
    'AvgDrawOdds', 'AvgAwayWinOdds'
]

features_before_match = [
    'AvgHomeGoals', 'AvgAwayGoals', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD',
    'IWA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'LBH', 'LBD', 'LBA', 'AvgHomeWinOdds',
    'AvgDrawOdds', 'AvgAwayWinOdds'
]

In [None]:
# Drop rows with any NaN values
over_under_featured = over_under_featured.dropna()

# Check for remaining null values to ensure no columns with more than one null value remain
null_counts = final_df.isnull().sum()
null_counts = null_counts[null_counts > 0]
print(null_counts)

Series([], dtype: int64)


In [None]:
over_under_featured[['HomeTeam', 'AwayTeam', 'FTR']]

Unnamed: 0,HomeTeam,AwayTeam,FTR
0,Burnley,Man City,A
1,Arsenal,Nott'm Forest,H
2,Bournemouth,West Ham,D
3,Brighton,Luton,H
4,Everton,Fulham,A
...,...,...,...
7883,Leicester,Portsmouth,H
7884,Man United,Chelsea,D
7885,Middlesbrough,Man City,H
7886,Southampton,Aston Villa,D


In [None]:
# Handle categorical features
label_encoders = {}
categorical_features = ['HomeTeam', 'AwayTeam', 'FTR']

for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    over_under_featured[col] = label_encoders[col].fit_transform(over_under_featured[col])

# Creating TotalGoals for Over/Under prediction
over_under_featured['TotalGoals'] = featured_merged_df['FTHG'] + featured_merged_df['FTAG']


In [None]:
class_counts = over_under_featured['TotalGoals'].value_counts()
print(class_counts)

TotalGoals
2.0     1850
3.0     1744
1.0     1346
4.0     1240
5.0      649
0.0      604
6.0      269
7.0      118
8.0       45
9.0       16
10.0       5
11.0       1
Name: count, dtype: int64


In [None]:
# Split data into training and testing sets
train_data, test_data = train_test_split(over_under_featured, test_size=0.2, random_state=42)

# Normalize numerical features
scaler = StandardScaler()
train_data[relevant_features] = scaler.fit_transform(train_data[relevant_features])
test_data[relevant_features] = scaler.transform(test_data[relevant_features])

In [None]:
# Create the binary target variable for over/under 2.5 goals
train_data['OverUnder'] = (train_data['TotalGoals'] > 2.5).astype(int)

# Check the class distribution
class_counts = train_data['OverUnder'].value_counts()

# Print the class distribution
print("Class distribution for Over/Under 2.5 goals:")
print(class_counts)

# Optionally, you can calculate the percentage distribution as well
class_distribution_percentage = class_counts / len(train_data) * 100
print("\nClass distribution percentage:")
print(class_distribution_percentage)

Class distribution for Over/Under 2.5 goals:
OverUnder
1    3267
0    3042
Name: count, dtype: int64

Class distribution percentage:
OverUnder
1    51.783167
0    48.216833
Name: count, dtype: float64


### Comprehensive Model for Over/Under Prediction


In [None]:
# Define the comprehensive model for over/under prediction
input_all_ou = Input(shape=(len(relevant_features),))
x_ou = Dense(128, activation='relu')(input_all_ou)
x_ou = Dropout(0.5)(x_ou)
x_ou = Dense(64, activation='relu')(x_ou)
x_ou = Dropout(0.5)(x_ou)
x_ou = Dense(32, activation='relu')(x_ou)
output_all_ou = Dense(1, activation='sigmoid')(x_ou)
comprehensive_model_ou = Model(inputs=input_all_ou, outputs=output_all_ou)

# Compile the comprehensive model
comprehensive_model_ou.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the comprehensive model
X_train_ou = train_data[relevant_features].values
y_train_ou = (train_data['TotalGoals'] > 2.5).astype(int)  # Assuming 'TotalGoals' is the total goals in the match
comprehensive_model_ou.fit(X_train_ou, y_train_ou, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the comprehensive model
X_test_ou = test_data[relevant_features].values
y_test_ou = (test_data['TotalGoals'] > 2.5).astype(int)
predictions_ou = comprehensive_model_ou.predict(X_test_ou)
predicted_classes_ou = (predictions_ou > 0.5).astype(int)

accuracy_ou = accuracy_score(y_test_ou, predicted_classes_ou)
precision_ou = precision_score(y_test_ou, predicted_classes_ou)
recall_ou = recall_score(y_test_ou, predicted_classes_ou)
f1_ou = f1_score(y_test_ou, predicted_classes_ou)
roc_auc_ou = roc_auc_score(y_test_ou, predictions_ou)

print(f"Comprehensive Model (Over/Under) - Accuracy: {accuracy_ou:.4f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Comprehensive Model (Over/Under) - Accuracy: 0.7408


### Prediction Model with Betting Odds Features


In [None]:
# Define the prediction model with betting odds features for over/under prediction
input_pred_ou = Input(shape=(len(features_before_match),))
x_pred_ou = Dense(128, activation='relu')(input_pred_ou)
x_pred_ou = Dropout(0.5)(x_pred_ou)
x_pred_ou = Dense(64, activation='relu')(x_pred_ou)
x_pred_ou = Dropout(0.5)(x_pred_ou)
x_pred_ou = Dense(32, activation='relu')(x_pred_ou)
output_pred_ou = Dense(1, activation='sigmoid')(x_pred_ou)
prediction_model_ou = Model(inputs=input_pred_ou, outputs=output_pred_ou)

# Transfer weights from the comprehensive model to the prediction model for the hidden layers
comprehensive_model_layers_ou = [layer for layer in comprehensive_model_ou.layers if 'dense' in layer.name or 'dropout' in layer.name]
prediction_model_layers_ou = [layer for layer in prediction_model_ou.layers if 'dense' in layer.name or 'dropout' in layer.name]

for comp_layer_ou, pred_layer_ou in zip(comprehensive_model_layers_ou[1:], prediction_model_layers_ou[1:]):
    pred_layer_ou.set_weights(comp_layer_ou.get_weights())

# Compile the prediction model
prediction_model_ou.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fine-tune the prediction model with betting odds features
X_train_pred_ou = train_data[features_before_match].values
prediction_model_ou.fit(X_train_pred_ou, y_train_ou, epochs=10, batch_size=32, validation_split=0.2)

# Prediction using betting odds features
X_test_pred_ou = test_data[features_before_match].values
predictions_pred_ou = prediction_model_ou.predict(X_test_pred_ou)
predicted_classes_pred_ou = (predictions_pred_ou > 0.5).astype(int)

# Evaluate the prediction model
accuracy_pred_ou = accuracy_score(y_test_ou, predicted_classes_pred_ou)
precision_pred_ou = precision_score(y_test_ou, predicted_classes_pred_ou)
recall_pred_ou = recall_score(y_test_ou, predicted_classes_pred_ou)
f1_pred_ou = f1_score(y_test_ou, predicted_classes_pred_ou)
roc_auc_pred_ou = roc_auc_score(y_test_ou, predictions_pred_ou)

print(f"Prediction Model (Over/Under) - Accuracy: {accuracy_pred_ou:.4f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Prediction Model (Over/Under) - Accuracy: 0.6629
