<a href="https://colab.research.google.com/github/venti-sei/Bet26/blob/main/data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Description

In [None]:
"""
Date = Match Date (dd/mm/yyyy)
Time = Time of match kick off

HomeTeam = Home Team
AwayTeam = Away Team

FTHG and HG = Full Time Home Team Goals
FTAG and AG = Full Time Away Team Goals
FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)

HTHG = Half Time Home Team Goals
HTAG = Half Time Away Team Goals
HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)

Match Statistics (where available)

# Attendance = Crowd Attendance

Referee = Match Referee

HS = Home Team Shots
AS = Away Team Shots

HST = Home Team Shots on Target
AST = Away Team Shots on Target

HC = Home Team Corners
AC = Away Team Corners

HF = Home Team Fouls Committed
AF = Away Team Fouls Committed

HFKC = Home Team Free Kicks Conceded
AFKC = Away Team Free Kicks Conceded

# HO = Home Team Offsides
# AO = Away Team Offsides
# XG

HY = Home Team Yellow Cards
AY = Away Team Yellow Cards

HR = Home Team Red Cards
AR = Away Team Red Cards

MatchOrder = Order of match in the dataset

HP = Home Points (win=3, draw=1, loss=0)
AP = Away Points (win=3, draw=1, loss=0)

H_HRP_10 = History of Home Team Points in its last 10 matches (home or away)
H_ARP_10 = History of Away Team Points in its last 10 matches (home or away)

H_H_P_5 = History of Home Points in last 5 Home matches
H_A_P_5 = History of Away Points in last 5 Away matches

H_H2H_H_2 = History of Home Points in last 2 head to head matches
H_H2H_A_2 = History of Away Points in last 2 head to head matches

H_H_GS_10 = History of Home Goals Scored in last 10 matches
H_A_GS_10 = History of Away Goals Scored in last 10 matches
-------------------------------------
H_H_GC_10 = History of Home Goals Conceded in last 10 matches
H_A_GC_10 = History of Away Goals Conceded in last 10 matches

H_H_HGS_10 = History of Home Half Time Goals Scored in last 10 matches
H_A_HGS_10 = History of Away Half Time Goals Scored in last 10 matches
-------------------------------------
H_H_HGC_10 = History of Home Half Time Goals Conceded in last 10 matches
H_A_HGC_10 = History of Away Half Time Goals Conceded in last 10 matches

H_H_S_10 = History of Home Shots in last 10 matches
H_A_S_10 = History of Away Shots in last 10 matches
-------------------------------------
H_H_S_A_10 = History of Home Shots in last 10 matches against
H_A_S_A_10 = History of Away Shots in last 10 matches against

H_H_ST_10 = History of Home Shots on Target in last 10 matches
H_A_ST_10 = History of Away Shots on Target in last 10 matches
-------------------------------------
H_H_ST_A_10 = History of Home Shots on Target in last 10 matches against
H_A_ST_A_10 = History of Away Shots on Target in last 10 matches against

H_H_C_10 = History of Home Corners in last 10 matches
H_A_C_10 = History of Away Corners in last 10 matches
-------------------------------------
H_H_C_A_10 = History of Home Corners in last 10 matches against
H_A_C_A_10 = History of Away Corners in last 10 matches against

H_H_F_10 = History of Home Fouls in last 10 matches
H_A_F_10 = History of Away Fouls in last 10 matches
-------------------------------------
H_H_F_A_10 = History of Home Fouls in last 10 matches against
H_A_F_A_10 = History of Away Fouls in last 10 matches against

H_H_KC_10 = History of Home Free Kicks Conceded in last 10 matches
H_A_KC_10 = History of Away Free Kicks Conceded in last 10 matches
-------------------------------------
H_H_KC_A_10 = History of Home Free Kicks Conceded in last 10 matches against
H_A_KC_A_10 = History of Away Free Kicks Conceded in last 10 matches against

H_H_YC_10 = History of Home Yellow Cards in last 10 matches
H_A_YC_10 = History of Away Yellow Cards in last 10 matches
-------------------------------------
H_H_YC_A_10 = History of Home Yellow Cards in last 10 matches against
H_A_YC_A_10 = History of Away Yellow Cards in last 10 matches against

H_H_RC_10 = History of Home Red Cards in last 10 matches
H_A_RC_10 = History of Away Red Cards in last 10 matches
-------------------------------------
H_H_RC_A_10 = History of Home Red Cards in last 10 matches against
H_A_RC_A_10 = History of Away Red Cards in last 10 matches against
"""

'\nDate = Match Date (dd/mm/yy)\nTime = Time of match kick off\n\nHomeTeam = Home Team\nAwayTeam = Away Team\n\nFTHG and HG = Full Time Home Team Goals\nFTAG and AG = Full Time Away Team Goals\nFTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)\n\nHTHG = Half Time Home Team Goals\nHTAG = Half Time Away Team Goals\nHTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)\n\nMatch Statistics (where available)\n\n# Attendance = Crowd Attendance\n\nReferee = Match Referee\n\nHS = Home Team Shots\nAS = Away Team Shots\n\nHST = Home Team Shots on Target\nAST = Away Team Shots on Target\n\nHC = Home Team Corners\nAC = Away Team Corners\n\nHF = Home Team Fouls Committed\nAF = Away Team Fouls Committed\n\nHFKC = Home Team Free Kicks Conceded\nAFKC = Away Team Free Kicks Conceded\n\n#\xa0HO = Home Team Offsides\n# AO = Away Team Offsides\n# XG\n\nHY = Home Team Yellow Cards\nAY = Away Team Yellow Cards\n\nHR = Home Team Red Cards\nAR = Away Team Red Cards\n\nMatchOrder = Order of mat

# Import Libraries

In [None]:
import os, io, requests
import pandas as pd
import numpy as np

# Import Dataset

In [None]:
# Season label -> two-season code that is substituted into the download URL.
seasons = {
    "2020-21": "2021",
    "2021-22": "2122",
    "2022-23": "2223",
    "2023-24": "2324",
    "2024-25": "2425",
}

base = "https://www.football-data.co.uk/mmz4281/{}/E0.csv"
out_dir = "premier_league_2020_2025"
os.makedirs(out_dir, exist_ok=True)

# One DataFrame per successfully downloaded season.
frames = []
for label, code in seasons.items():
    url = base.format(code)
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError and also covers
        # timeouts and connection failures, so one unreachable season no
        # longer aborts the remaining downloads.
        print(f"✗ {label} missing ({e}). Skipping.")
        continue

    df = pd.read_csv(io.BytesIO(r.content))
    df["Season"] = label  # tag every row with the season it came from
    frames.append(df)
    df.to_csv(os.path.join(out_dir, f"E0_{label}.csv"), index=False)
    print(f"✓ downloaded {label}")

# combined file (only for seasons that downloaded)
if frames:
    all_df = pd.concat(frames, ignore_index=True)
    all_df.to_csv(os.path.join(out_dir, "E0_2020_2025_combined.csv"), index=False)
    print(f"Combined shape: {all_df.shape}")
else:
    # Make the failure visible here instead of as a NameError on all_df later.
    print("No season could be downloaded; all_df was not created.")

✓ downloaded 2020-21
✓ downloaded 2021-22
✓ downloaded 2022-23
✓ downloaded 2023-24
✓ downloaded 2024-25
Combined shape: (1900, 133)


In [None]:
# Print a summary of the combined frame (index range, column count, dtypes, memory use).
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Columns: 133 entries, Div to BFECAHA
dtypes: float64(108), int64(16), object(9)
memory usage: 1.9+ MB


In [None]:
# Render the first rows of the combined frame for a quick visual check.
display(all_df.head())

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,1XBCH,1XBCD,1XBCA,BFECH,BFECD,BFECA,BFEC>2.5,BFEC<2.5,BFECAHH,BFECAHA
0,E0,12/09/2020,12:30,Fulham,Arsenal,0,3,A,0,1,...,,,,,,,,,,
1,E0,12/09/2020,15:00,Crystal Palace,Southampton,1,0,H,1,0,...,,,,,,,,,,
2,E0,12/09/2020,17:30,Liverpool,Leeds,4,3,H,3,2,...,,,,,,,,,,
3,E0,12/09/2020,20:00,West Ham,Newcastle,0,2,A,0,0,...,,,,,,,,,,
4,E0,13/09/2020,14:00,West Brom,Leicester,0,3,A,0,0,...,,,,,,,,,,


# Feature Collection

## Create Dataframe

In [None]:
# Work on a copy so the feature engineering below leaves all_df untouched.
features_df = all_df.copy()

## Create MatchOrder

In [None]:
# Combine the Date and Time text columns into a single timestamp.
features_df['MatchDateTime'] = pd.to_datetime(
    features_df['Date'] + ' ' + features_df['Time'],
    format='%d/%m/%Y %H:%M',
)

# Order matches chronologically. A stable sort keeps matches that share the
# same kick-off timestamp in their original relative order, so MatchOrder is
# reproducible instead of depending on the sort algorithm's tie handling.
features_df = (
    features_df
    .sort_values(by='MatchDateTime', kind='mergesort')
    .reset_index(drop=True)
)

# 1-based position of each match in chronological order.
features_df['MatchOrder'] = features_df.index + 1

In [None]:
# The raw Date/Time text columns are no longer needed once MatchDateTime exists.
features_df = features_df.drop(['Date', 'Time'], axis=1)

## Add HP and AP

In [None]:
# Boolean masks over the full-time result, in the order they are paired with
# the points list: home win ('H') first, then draw ('D').
conditions = [features_df['FTR'].eq(result_code) for result_code in ('H', 'D')]

In [None]:
# Points paired position-by-position with `conditions`; rows matching neither
# condition fall back to the default of 0.
choices = [3, 1]

features_df['HP'] = np.select(condlist=conditions, choicelist=choices, default=0)

In [None]:
# Away points need their own conditions: the away side earns 3 points when the
# full-time result is 'A' (away win) and 1 point for a draw. Reusing the
# home-win conditions would make AP an exact copy of HP.
away_conditions = [
    features_df['FTR'] == 'A',
    features_df['FTR'] == 'D'
]
choices_ap = [3, 1]

features_df['AP'] = np.select(away_conditions, choices_ap, default=0)

# Historical Features

## Add H_HRP_10 and H_ARP_10

In [None]:
# Home-side view of every match: the home team is "Team" and HP its points.
home_matches = (
    features_df[['HomeTeam', 'AwayTeam', 'HP', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'AwayTeam': 'Opponent', 'HP': 'Points'})
)

# Away-side view of every match: the away team is "Team" and AP its points.
away_matches = (
    features_df[['AwayTeam', 'HomeTeam', 'AP', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HomeTeam': 'Opponent', 'AP': 'Points'})
)

# Stack both views (two rows per match) and order them chronologically.
combined_match_data = (
    pd.concat([home_matches, away_matches], ignore_index=True)
    .sort_values(by='MatchOrder')
    .reset_index(drop=True)
)

In [None]:
# Per team: sum of points over a window of up to 10 matches, then shifted by
# one row so the value attached to a match excludes that match itself.
combined_match_data['RollingPoints'] = (
    combined_match_data
    .groupby('Team')['Points']
    .transform(lambda pts: pts.rolling(window=10, min_periods=1).sum().shift(1))
)

In [None]:
# NOTE(review): home_rolling_points is not used by any code visible here, and
# its filter keeps only rows where a team is its own opponent — confirm whether
# it is still needed.
self_opponent_mask = combined_match_data['Team'].eq(combined_match_data['Opponent'])
home_rolling_points = combined_match_data.loc[self_opponent_mask].drop(columns=['Opponent'])

# Attach each side's rolling points to the match row: look the team up by
# (team, MatchOrder), name the value after the side, drop the helper key.
for side_col, feature_name in (('HomeTeam', 'H_HRP_10'), ('AwayTeam', 'H_ARP_10')):
    features_df = (
        pd.merge(
            features_df,
            combined_match_data[['Team', 'MatchOrder', 'RollingPoints']],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={'RollingPoints': feature_name})
        .drop(columns=['Team'])
    )

## Last 5 Home and Away Points
H_H_P_5
H_A_P_5

In [None]:
# Home points grouped per home team, so only that team's home fixtures count.
home_matches_temp = features_df[['HomeTeam', 'MatchOrder', 'HP']].copy()

# Sum over a window of up to 5 home matches, shifted so the current match is excluded.
home_matches_temp['H_H_P_5'] = (
    home_matches_temp
    .groupby('HomeTeam')['HP']
    .transform(lambda pts: pts.rolling(window=5, min_periods=1).sum().shift(1))
)

features_df = features_df.merge(
    home_matches_temp[['HomeTeam', 'MatchOrder', 'H_H_P_5']],
    on=['HomeTeam', 'MatchOrder'],
    how='left',
)

In [None]:
# Away points grouped per away team, so only that team's away fixtures count.
away_matches_temp = features_df[['AwayTeam', 'MatchOrder', 'AP']].copy()

# Sum over a window of up to 5 away matches, shifted so the current match is excluded.
away_matches_temp['H_A_P_5'] = (
    away_matches_temp
    .groupby('AwayTeam')['AP']
    .transform(lambda pts: pts.rolling(window=5, min_periods=1).sum().shift(1))
)

features_df = features_df.merge(
    away_matches_temp[['AwayTeam', 'MatchOrder', 'H_A_P_5']],
    on=['AwayTeam', 'MatchOrder'],
    how='left',
)

## Last two head-to-head matches
### H_H2H_H_2 - H_H2H_A_2

In [None]:
def get_h2h_home_team_points(row, df):
    """Return the current home team's points from its last two meetings with the away team.

    Args:
        row: A match row providing 'HomeTeam', 'AwayTeam' and 'MatchOrder'.
        df: Frame of all matches with 'HomeTeam', 'AwayTeam', 'MatchOrder',
            'HP' and 'AP' columns.

    Returns:
        A Series indexed 'H2H_Home_Pts_1' (most recent meeting) and
        'H2H_Home_Pts_2' (the one before); slots without a meeting hold 0.
    """
    home = row['HomeTeam']
    away = row['AwayTeam']

    # Earlier meetings of the same two clubs, regardless of who hosted.
    played_before = df['MatchOrder'] < row['MatchOrder']
    same_venue = (df['HomeTeam'] == home) & (df['AwayTeam'] == away)
    reversed_venue = (df['HomeTeam'] == away) & (df['AwayTeam'] == home)

    latest_two = (
        df[played_before & (same_venue | reversed_venue)]
        .sort_values(by='MatchOrder', ascending=False)
        .head(2)
    )

    # Take HP when the current home team also hosted that meeting, AP otherwise.
    points = [
        home_pts if host == home else away_pts
        for host, home_pts, away_pts in zip(
            latest_two['HomeTeam'], latest_two['HP'], latest_two['AP']
        )
    ]

    # Fill missing meetings with 0 so the result always has two entries.
    points.extend([0] * (2 - len(points)))

    return pd.Series(points, index=['H2H_Home_Pts_1', 'H2H_Home_Pts_2'])

# Evaluate the head-to-head helper once per match row; the result has one
# column per returned Series entry.
h2h_home_data = features_df.apply(get_h2h_home_team_points, axis=1, args=(features_df,))

# Carry over the merge keys from the source frame.
h2h_home_data['MatchOrder'] = features_df['MatchOrder']
h2h_home_data['HomeTeam'] = features_df['HomeTeam']

# Total home-team points across the two most recent head-to-head meetings.
h2h_home_data['H_H2H_H_2'] = h2h_home_data['H2H_Home_Pts_1'].add(h2h_home_data['H2H_Home_Pts_2'])

# Only the total is merged back; the per-meeting columns stay in the temp frame.
features_df = features_df.merge(
    h2h_home_data[['MatchOrder', 'HomeTeam', 'H_H2H_H_2']],
    on=['MatchOrder', 'HomeTeam'],
    how='left',
)

In [None]:
def get_h2h_away_team_points(row, df):
    """Return the current away team's points from its last two meetings with the home team.

    Args:
        row: A match row providing 'HomeTeam', 'AwayTeam' and 'MatchOrder'.
        df: Frame of all matches with 'HomeTeam', 'AwayTeam', 'MatchOrder',
            'HP' and 'AP' columns.

    Returns:
        A Series indexed 'H2H_Away_Pts_1' (most recent meeting) and
        'H2H_Away_Pts_2' (the one before); slots without a meeting hold 0.
    """
    home = row['HomeTeam']
    away = row['AwayTeam']

    # Earlier meetings of the same two clubs, regardless of who hosted.
    played_before = df['MatchOrder'] < row['MatchOrder']
    same_venue = (df['HomeTeam'] == home) & (df['AwayTeam'] == away)
    reversed_venue = (df['HomeTeam'] == away) & (df['AwayTeam'] == home)

    latest_two = (
        df[played_before & (same_venue | reversed_venue)]
        .sort_values(by='MatchOrder', ascending=False)
        .head(2)
    )

    # Take AP when the current away team was also the visitor then, HP otherwise.
    points = [
        away_pts if visitor == away else home_pts
        for visitor, home_pts, away_pts in zip(
            latest_two['AwayTeam'], latest_two['HP'], latest_two['AP']
        )
    ]

    # Fill missing meetings with 0 so the result always has two entries.
    points.extend([0] * (2 - len(points)))

    return pd.Series(points, index=['H2H_Away_Pts_1', 'H2H_Away_Pts_2'])

# Evaluate the head-to-head helper once per match row; the result has one
# column per returned Series entry.
h2h_away_data = features_df.apply(get_h2h_away_team_points, axis=1, args=(features_df,))

# Carry over the merge keys from the source frame.
h2h_away_data['MatchOrder'] = features_df['MatchOrder']
h2h_away_data['AwayTeam'] = features_df['AwayTeam']

# Total away-team points across the two most recent head-to-head meetings.
h2h_away_data['H_H2H_A_2'] = h2h_away_data['H2H_Away_Pts_1'].add(h2h_away_data['H2H_Away_Pts_2'])

# Only the total is merged back; the per-meeting columns stay in the temp frame.
features_df = features_df.merge(
    h2h_away_data[['MatchOrder', 'AwayTeam', 'H_H2H_A_2']],
    on=['MatchOrder', 'AwayTeam'],
    how='left',
)

## Make History for FTHG and FTAG
### H_H_GS_10 - H_A_GS_10
### H_H_GC_10 - H_A_GC_10

In [None]:
# --- Goals scored: one row per (team, match), from both the home and away side ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'FTHG', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'FTHG': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'FTAG', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'FTAG': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Goals conceded: each side concedes what the other side scored ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'FTAG', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'FTAG': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'FTHG', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'FTHG': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

print("Rolling goals scored and conceded calculated successfully!")

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_GS_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_GS_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_GC_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_GC_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

Rolling goals scored and conceded calculated successfully!


## Make History for HTHG and HTAG
H_H_HGS_10,
H_A_HGS_10,
H_H_HGC_10,
H_A_HGC_10

In [None]:
# --- Half-time goals scored: one row per (team, match) from both sides ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'HTHG', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HTHG': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'HTAG', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HTAG': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Half-time goals conceded: each side concedes what the other side scored ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'HTAG', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HTAG': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'HTHG', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HTHG': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_HGS_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_HGS_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_HGC_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_HGC_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

## Make history for HS and AS
H_H_S_10,
H_A_S_10,
H_H_S_A_10,
H_A_S_A_10

In [None]:
# NOTE: the variable and column names below say "goals", but this cell feeds
# them with shots (HS / AS).

# --- Shots taken: one row per (team, match) from both sides ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'HS', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HS': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'AS', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'AS': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Shots against: each side faces the shots the other side took ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'AS', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'AS': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'HS', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HS': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

print("Rolling goals scored and conceded calculated successfully!")

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_S_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_S_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_S_A_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_S_A_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

Rolling goals scored and conceded calculated successfully!


## Make history for HST and AST
H_H_ST_10,
H_A_ST_10,
H_H_ST_A_10,
H_A_ST_A_10

In [None]:
# NOTE: the variable and column names below say "goals", but this cell feeds
# them with shots on target (HST / AST).

# --- Shots on target: one row per (team, match) from both sides ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'HST', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HST': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'AST', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'AST': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Shots on target against: each side faces what the other side produced ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'AST', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'AST': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'HST', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HST': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_ST_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_ST_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_ST_A_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_ST_A_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

## Make history for HC and AC
H_H_C_10,
H_A_C_10,
H_H_C_A_10,
H_A_C_A_10

In [None]:
# NOTE: the variable and column names below say "goals", but this cell feeds
# them with corners (HC / AC).

# --- Corners won: one row per (team, match) from both sides ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'HC', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HC': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'AC', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'AC': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Corners against: each side faces the corners the other side won ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'AC', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'AC': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'HC', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HC': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

print("Rolling goals scored and conceded calculated successfully!")

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_C_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_C_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_C_A_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_C_A_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

Rolling goals scored and conceded calculated successfully!


## Make history for HF and AF
H_H_F_10,
H_A_F_10,
H_H_F_A_10,
H_A_F_A_10

In [None]:
# NOTE: the variable and column names below say "goals", but this cell feeds
# them with fouls (HF / AF).

# --- Fouls committed: one row per (team, match) from both sides ---
home_goals_scored_df = (
    features_df[['HomeTeam', 'HF', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'HF': 'GoalsScored'})
)
away_goals_scored_df = (
    features_df[['AwayTeam', 'AF', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'AF': 'GoalsScored'})
)
combined_goals_scored = (
    pd.concat([home_goals_scored_df, away_goals_scored_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
# Per-team sum over up to 10 matches, shifted so the current match is excluded.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

# --- Fouls against: each side receives the fouls the other side committed ---
home_goals_conceded_df = (
    features_df[['HomeTeam', 'AF', 'MatchOrder']]
    .rename(columns={'HomeTeam': 'Team', 'AF': 'GoalsConceded'})
)
away_goals_conceded_df = (
    features_df[['AwayTeam', 'HF', 'MatchOrder']]
    .rename(columns={'AwayTeam': 'Team', 'HF': 'GoalsConceded'})
)
combined_goals_conceded = (
    pd.concat([home_goals_conceded_df, away_goals_conceded_df], ignore_index=True)
    .sort_values(by=['MatchOrder', 'Team'])
    .reset_index(drop=True)
)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda s: s.rolling(window=10, min_periods=1).sum().shift(1))
)

print("Rolling goals scored and conceded calculated successfully!")

# Attach the rolling values to each match for both sides, in output-column order.
merge_plan = [
    (combined_goals_scored, 'RollingGoalsScored', 'HomeTeam', 'H_H_F_10'),
    (combined_goals_scored, 'RollingGoalsScored', 'AwayTeam', 'H_A_F_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'HomeTeam', 'H_H_F_A_10'),
    (combined_goals_conceded, 'RollingGoalsConceded', 'AwayTeam', 'H_A_F_A_10'),
]
for rolling_table, rolling_col, side_col, feature_name in merge_plan:
    features_df = (
        pd.merge(
            features_df,
            rolling_table[['Team', 'MatchOrder', rolling_col]],
            left_on=[side_col, 'MatchOrder'],
            right_on=['Team', 'MatchOrder'],
            how='left',
        )
        .rename(columns={rolling_col: feature_name})
        .drop(columns=['Team'])
    )

Rolling goals scored and conceded calculated successfully!


## Make history for HY and AY
H_H_YC_10,
H_A_YC_10,
H_H_YC_A_10,
H_A_YC_A_10

In [None]:
# Rolling yellow-card history per team. The frame and column names still say
# "goals" (same names as the previous history cell), but in this cell
# 'GoalsScored' holds the yellow cards a team received and 'GoalsConceded'
# the yellow cards shown to its opponent.

# One row per team per match with the team's own yellow cards
# (HY when it played at home, AY when it played away).
home_goals_scored_df = features_df[['HomeTeam', 'HY', 'MatchOrder']].rename(
    columns={'HomeTeam': 'Team', 'HY': 'GoalsScored'}
)
away_goals_scored_df = features_df[['AwayTeam', 'AY', 'MatchOrder']].rename(
    columns={'AwayTeam': 'Team', 'AY': 'GoalsScored'}
)

combined_goals_scored = pd.concat(
    [home_goals_scored_df, away_goals_scored_df], ignore_index=True
).sort_values(by=['MatchOrder', 'Team'], ignore_index=True)
# Sum over a window of up to 10 matches, then shift by one row so the value
# attached to a match only covers the team's earlier matches.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda cards: cards.rolling(window=10, min_periods=1).sum().shift(1))
)

# Same layout for the opponent's yellow cards
# (AY when the team was at home, HY when it was away).
home_goals_conceded_df = features_df[['HomeTeam', 'AY', 'MatchOrder']].rename(
    columns={'HomeTeam': 'Team', 'AY': 'GoalsConceded'}
)
away_goals_conceded_df = features_df[['AwayTeam', 'HY', 'MatchOrder']].rename(
    columns={'AwayTeam': 'Team', 'HY': 'GoalsConceded'}
)

combined_goals_conceded = pd.concat(
    [home_goals_conceded_df, away_goals_conceded_df], ignore_index=True
).sort_values(by=['MatchOrder', 'Team'], ignore_index=True)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda cards: cards.rolling(window=10, min_periods=1).sum().shift(1))
)

print("Rolling goals scored and conceded calculated successfully!")

# Attach the team's own rolling yellow cards: home side, then away side.
features_df = (
    features_df
    .merge(combined_goals_scored[['Team', 'MatchOrder', 'RollingGoalsScored']],
           left_on=['HomeTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsScored': 'H_H_YC_10'})
    .drop(columns=['Team'])
)
features_df = (
    features_df
    .merge(combined_goals_scored[['Team', 'MatchOrder', 'RollingGoalsScored']],
           left_on=['AwayTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsScored': 'H_A_YC_10'})
    .drop(columns=['Team'])
)

# Attach the rolling yellow cards shown to each side's opponents.
features_df = (
    features_df
    .merge(combined_goals_conceded[['Team', 'MatchOrder', 'RollingGoalsConceded']],
           left_on=['HomeTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsConceded': 'H_H_YC_A_10'})
    .drop(columns=['Team'])
)
features_df = (
    features_df
    .merge(combined_goals_conceded[['Team', 'MatchOrder', 'RollingGoalsConceded']],
           left_on=['AwayTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsConceded': 'H_A_YC_A_10'})
    .drop(columns=['Team'])
)

Rolling goals scored and conceded calculated successfully!


## Make history for HR and AR
H_H_RC_10,
H_A_RC_10,
H_H_RC_A_10,
H_A_RC_A_10

In [None]:
# Rolling red-card history per team. The frame and column names still say
# "goals" (same names as the previous history cells), but in this cell
# 'GoalsScored' holds the red cards a team received and 'GoalsConceded'
# the red cards shown to its opponent.

# One row per team per match with the team's own red cards
# (HR when it played at home, AR when it played away).
home_goals_scored_df = features_df[['HomeTeam', 'HR', 'MatchOrder']].rename(
    columns={'HomeTeam': 'Team', 'HR': 'GoalsScored'}
)
away_goals_scored_df = features_df[['AwayTeam', 'AR', 'MatchOrder']].rename(
    columns={'AwayTeam': 'Team', 'AR': 'GoalsScored'}
)

combined_goals_scored = pd.concat(
    [home_goals_scored_df, away_goals_scored_df], ignore_index=True
).sort_values(by=['MatchOrder', 'Team'], ignore_index=True)
# Sum over a window of up to 10 matches, then shift by one row so the value
# attached to a match only covers the team's earlier matches.
combined_goals_scored['RollingGoalsScored'] = (
    combined_goals_scored
    .groupby('Team')['GoalsScored']
    .transform(lambda cards: cards.rolling(window=10, min_periods=1).sum().shift(1))
)

# Same layout for the opponent's red cards
# (AR when the team was at home, HR when it was away).
home_goals_conceded_df = features_df[['HomeTeam', 'AR', 'MatchOrder']].rename(
    columns={'HomeTeam': 'Team', 'AR': 'GoalsConceded'}
)
away_goals_conceded_df = features_df[['AwayTeam', 'HR', 'MatchOrder']].rename(
    columns={'AwayTeam': 'Team', 'HR': 'GoalsConceded'}
)

combined_goals_conceded = pd.concat(
    [home_goals_conceded_df, away_goals_conceded_df], ignore_index=True
).sort_values(by=['MatchOrder', 'Team'], ignore_index=True)
combined_goals_conceded['RollingGoalsConceded'] = (
    combined_goals_conceded
    .groupby('Team')['GoalsConceded']
    .transform(lambda cards: cards.rolling(window=10, min_periods=1).sum().shift(1))
)

# Attach the team's own rolling red cards: home side, then away side.
features_df = (
    features_df
    .merge(combined_goals_scored[['Team', 'MatchOrder', 'RollingGoalsScored']],
           left_on=['HomeTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsScored': 'H_H_RC_10'})
    .drop(columns=['Team'])
)
features_df = (
    features_df
    .merge(combined_goals_scored[['Team', 'MatchOrder', 'RollingGoalsScored']],
           left_on=['AwayTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsScored': 'H_A_RC_10'})
    .drop(columns=['Team'])
)

# Attach the rolling red cards shown to each side's opponents.
features_df = (
    features_df
    .merge(combined_goals_conceded[['Team', 'MatchOrder', 'RollingGoalsConceded']],
           left_on=['HomeTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsConceded': 'H_H_RC_A_10'})
    .drop(columns=['Team'])
)
features_df = (
    features_df
    .merge(combined_goals_conceded[['Team', 'MatchOrder', 'RollingGoalsConceded']],
           left_on=['AwayTeam', 'MatchOrder'], right_on=['Team', 'MatchOrder'], how='left')
    .rename(columns={'RollingGoalsConceded': 'H_A_RC_A_10'})
    .drop(columns=['Team'])
)

# Win/Lose/Draw

In [None]:
# The lagged-result lookups below compare index values to find earlier
# matches, so put the rows in kick-off order and renumber the index 0..n-1.
features_df = features_df.sort_values(by='MatchDateTime', ignore_index=True)
print("DataFrame 'features_df' sorted by 'MatchDateTime'.")

DataFrame 'features_df' sorted by 'MatchDateTime'.


In [None]:
import pandas as pd
import numpy as np

def get_lagged_team_result(df, current_index, lag_steps, team_column_name):
    """
    Return the points a team took from its ``lag_steps``-th most recent match.

    The team is read from ``team_column_name`` in the row labelled
    ``current_index``. Its history is every row whose index label is smaller
    than ``current_index`` and in which the team appears as either HomeTeam or
    AwayTeam, so ``df`` must be sorted chronologically with an increasing
    index for "earlier" to mean "played before".

    Args:
        df (pd.DataFrame): Full match history with 'HomeTeam', 'AwayTeam' and
            'FTR' columns (MUST BE SORTED BY DATE/TIME!).
        current_index: Index label of the current match.
        lag_steps (int): How many of the team's matches to look back
            (1 = most recent earlier match).
        team_column_name (str): 'HomeTeam' or 'AwayTeam' - the column holding
            the target team in the current match.

    Returns:
        int or None: 3 for a win, 1 for a draw, 0 for a loss, from the target
        team's point of view. None when ``lag_steps`` is not positive, when
        ``current_index`` / ``team_column_name`` cannot be looked up, when the
        team has fewer than ``lag_steps`` earlier matches, or when the lagged
        match has no recognisable FTR value.
    """
    if lag_steps <= 0:
        return None

    # Identify the team we are building history for.
    try:
        target_team = df.loc[current_index, team_column_name]
    except KeyError:
        return None

    # Earlier matches involving the team, most recent first.
    involves_team = (df['HomeTeam'] == target_team) | (df['AwayTeam'] == target_team)
    team_history = df.loc[(df.index < current_index) & involves_team].sort_index(ascending=False)

    if len(team_history) < lag_steps:
        return None

    lagged_match = team_history.iloc[lag_steps - 1]
    prev_ftr = lagged_match['FTR']

    # A missing or unexpected result code is "unknown", not a draw; without
    # this guard a NaN FTR would silently be scored as 1 point.
    if not isinstance(prev_ftr, str) or prev_ftr not in ('H', 'D', 'A'):
        return None
    if prev_ftr == 'D':
        return 1

    # The team won if the result code matches the side it played on.
    side_played = 'H' if target_team == lagged_match['HomeTeam'] else 'A'
    return 3 if prev_ftr == side_played else 0


In [None]:
# Points (3/1/0) each side took from its most recent earlier match; rows where
# the team has no earlier match get None from get_lagged_team_result.
for team_col, lag_col in (('HomeTeam', 'HomeTeam_Lag1_Result'),
                          ('AwayTeam', 'AwayTeam_Lag1_Result')):
    # team_col is bound as a default so the lambda does not see a later value.
    features_df[lag_col] = features_df.index.to_series().apply(
        lambda idx, team_col=team_col: get_lagged_team_result(
            features_df, idx, lag_steps=1, team_column_name=team_col
        )
    )
print("Lagged results for HomeTeam and AwayTeam added to features_df.")

Lagged results for HomeTeam and AwayTeam added to features_df.


In [None]:
# Points (3/1/0) each side took from its 2nd most recent earlier match; rows
# where the team has fewer than 2 earlier matches get None.
for team_col, lag_col in (('HomeTeam', 'HomeTeam_Lag2_Result'),
                          ('AwayTeam', 'AwayTeam_Lag2_Result')):
    # team_col is bound as a default so the lambda does not see a later value.
    features_df[lag_col] = features_df.index.to_series().apply(
        lambda idx, team_col=team_col: get_lagged_team_result(
            features_df, idx, lag_steps=2, team_column_name=team_col
        )
    )
print("Lagged results for HomeTeam and AwayTeam added to features_df.")

Lagged results for HomeTeam and AwayTeam added to features_df.


In [None]:
# Points (3/1/0) each side took from its 3rd most recent earlier match; rows
# where the team has fewer than 3 earlier matches get None.
for team_col, lag_col in (('HomeTeam', 'HomeTeam_Lag3_Result'),
                          ('AwayTeam', 'AwayTeam_Lag3_Result')):
    # team_col is bound as a default so the lambda does not see a later value.
    features_df[lag_col] = features_df.index.to_series().apply(
        lambda idx, team_col=team_col: get_lagged_team_result(
            features_df, idx, lag_steps=3, team_column_name=team_col
        )
    )
print("Lagged results for HomeTeam and AwayTeam added to features_df.")

Lagged results for HomeTeam and AwayTeam added to features_df.


In [None]:
# Points (3/1/0) each side took from its 4th most recent earlier match; rows
# where the team has fewer than 4 earlier matches get None.
for team_col, lag_col in (('HomeTeam', 'HomeTeam_Lag4_Result'),
                          ('AwayTeam', 'AwayTeam_Lag4_Result')):
    # team_col is bound as a default so the lambda does not see a later value.
    features_df[lag_col] = features_df.index.to_series().apply(
        lambda idx, team_col=team_col: get_lagged_team_result(
            features_df, idx, lag_steps=4, team_column_name=team_col
        )
    )
print("Lagged results for HomeTeam and AwayTeam added to features_df.")

Lagged results for HomeTeam and AwayTeam added to features_df.


In [None]:
# Points (3/1/0) each side took from its 5th most recent earlier match; rows
# where the team has fewer than 5 earlier matches get None.
for team_col, lag_col in (('HomeTeam', 'HomeTeam_Lag5_Result'),
                          ('AwayTeam', 'AwayTeam_Lag5_Result')):
    # team_col is bound as a default so the lambda does not see a later value.
    features_df[lag_col] = features_df.index.to_series().apply(
        lambda idx, team_col=team_col: get_lagged_team_result(
            features_df, idx, lag_steps=5, team_column_name=team_col
        )
    )
print("Lagged results for HomeTeam and AwayTeam added to features_df.")

Lagged results for HomeTeam and AwayTeam added to features_df.


# Difference

In [None]:
# Home-minus-away gap in the points history over the last 10 matches.
features_df['H_HRP_ARP_diff_10'] = features_df['H_HRP_10'].sub(features_df['H_ARP_10'])

print("features_df with new difference column:")
display(features_df.head(5))

features_df with new difference column:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Lag1_Result,HomeTeam_Lag2_Result,AwayTeam_Lag2_Result,HomeTeam_Lag3_Result,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home side's points in its last 5 home matches minus the away side's points
# in its last 5 away matches.
features_df['H_H_P_A_P_diff_5'] = features_df['H_H_P_5'].sub(features_df['H_A_P_5'])

print("features_df with new difference column H_H_P_A_P_diff_5:")
display(features_df.head(5))

features_df with new difference column H_H_P_A_P_diff_5:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HomeTeam_Lag2_Result,AwayTeam_Lag2_Result,HomeTeam_Lag3_Result,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in goals scored over the last 10 matches.
features_df['H_H_GS_A_GS_diff_10'] = features_df['H_H_GS_10'].sub(features_df['H_A_GS_10'])

print("features_df with new difference column H_H_GS_A_GS_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_GS_A_GS_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Lag2_Result,HomeTeam_Lag3_Result,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in goals conceded over the last 10 matches.
features_df['H_H_GC_A_GC_diff_10'] = features_df['H_H_GC_10'].sub(features_df['H_A_GC_10'])

print("features_df with new difference column H_H_GC_A_GC_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_GC_A_GC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HomeTeam_Lag3_Result,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "S" history
# (presumably shots taken - confirm against the cell that builds H_H_S_10).
features_df['H_H_S_A_S_diff_10'] = features_df['H_H_S_10'].sub(features_df['H_A_S_10'])

print("features_df with new difference column H_H_S_A_S_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_S_A_S_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "S_A" history
# (presumably shots faced - confirm against the cell that builds H_H_S_A_10).
features_df['H_H_S_A_S_A_diff_10'] = features_df['H_H_S_A_10'].sub(features_df['H_A_S_A_10'])

print("features_df with new difference column H_H_S_A_S_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_S_A_S_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "ST" history
# (presumably shots on target - confirm against the cell that builds H_H_ST_10).
features_df['H_H_ST_A_ST_diff_10'] = features_df['H_H_ST_10'].sub(features_df['H_A_ST_10'])

print("features_df with new difference column H_H_ST_A_ST_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_ST_A_ST_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "ST_A" history
# (presumably shots on target faced - confirm against the cell that builds H_H_ST_A_10).
features_df['H_H_ST_A_ST_A_diff_10'] = features_df['H_H_ST_A_10'].sub(features_df['H_A_ST_A_10'])

print("features_df with new difference column H_H_ST_A_ST_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_ST_A_ST_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "C" history
# (presumably corners won - confirm against the cell that builds H_H_C_10).
features_df['H_H_C_A_C_diff_10'] = features_df['H_H_C_10'].sub(features_df['H_A_C_10'])

print("features_df with new difference column H_H_C_A_C_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_C_A_C_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "C_A" history
# (presumably corners conceded - confirm against the cell that builds H_H_C_A_10).
features_df['H_H_C_A_C_A_diff_10'] = features_df['H_H_C_A_10'].sub(features_df['H_A_C_A_10'])

print("features_df with new difference column H_H_C_A_C_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_C_A_C_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "F" history
# (presumably fouls committed - confirm against the cell that builds H_H_F_10).
features_df['H_H_F_A_F_diff_10'] = features_df['H_H_F_10'].sub(features_df['H_A_F_10'])

print("features_df with new difference column H_H_F_A_F_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_F_A_F_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "F_A" history
# (presumably fouls committed by opponents - confirm against the cell that builds H_H_F_A_10).
features_df['H_H_F_A_F_A_diff_10'] = features_df['H_H_F_A_10'].sub(features_df['H_A_F_A_10'])

print("features_df with new difference column H_H_F_A_F_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_F_A_F_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in yellow cards received over the previous 10 matches.
features_df['H_H_YC_A_YC_diff_10'] = features_df['H_H_YC_10'].sub(features_df['H_A_YC_10'])

print("features_df with new difference column H_H_YC_A_YC_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_YC_A_YC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in yellow cards shown to opponents over the previous
# 10 matches.
features_df['H_H_YC_A_YC_A_diff_10'] = features_df['H_H_YC_A_10'].sub(features_df['H_A_YC_A_10'])

print("features_df with new difference column H_H_YC_A_YC_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_YC_A_YC_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in red cards received over the previous 10 matches.
features_df['H_H_RC_A_RC_diff_10'] = features_df['H_H_RC_10'].sub(features_df['H_A_RC_10'])

print("features_df with new difference column H_H_RC_A_RC_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_RC_A_RC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in red cards shown to opponents over the previous
# 10 matches.
features_df['H_H_RC_A_RC_A_diff_10'] = features_df['H_H_RC_A_10'].sub(features_df['H_A_RC_A_10'])

print("features_df with new difference column H_H_RC_A_RC_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_RC_A_RC_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home side's points in its last 5 home matches minus the away side's points
# in its last 5 away matches.
# NOTE(review): an earlier cell already assigns H_H_P_A_P_diff_5 from the same
# two columns, so this appears to be a redundant recomputation.
features_df['H_H_P_A_P_diff_5'] = features_df['H_H_P_5'].sub(features_df['H_A_P_5'])

print("features_df with new difference column H_H_P_A_P_diff_5:")
display(features_df.head(5))

features_df with new difference column H_H_P_A_P_diff_5:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in goals scored over the last 10 matches.
# NOTE(review): an earlier cell already assigns H_H_GS_A_GS_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_GS_A_GS_diff_10'] = features_df['H_H_GS_10'].sub(features_df['H_A_GS_10'])

print("features_df with new difference column H_H_GS_A_GS_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_GS_A_GS_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in goals conceded over the last 10 matches.
# NOTE(review): an earlier cell already assigns H_H_GC_A_GC_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_GC_A_GC_diff_10'] = features_df['H_H_GC_10'].sub(features_df['H_A_GC_10'])

print("features_df with new difference column H_H_GC_A_GC_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_GC_A_GC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "S" history (presumably shots taken).
# NOTE(review): an earlier cell already assigns H_H_S_A_S_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_S_A_S_diff_10'] = features_df['H_H_S_10'].sub(features_df['H_A_S_10'])

print("features_df with new difference column H_H_S_A_S_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_S_A_S_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "S_A" history (presumably shots faced).
# NOTE(review): an earlier cell already assigns H_H_S_A_S_A_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_S_A_S_A_diff_10'] = features_df['H_H_S_A_10'].sub(features_df['H_A_S_A_10'])

print("features_df with new difference column H_H_S_A_S_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_S_A_S_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "ST" history (presumably shots on target).
# NOTE(review): an earlier cell already assigns H_H_ST_A_ST_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_ST_A_ST_diff_10'] = features_df['H_H_ST_10'].sub(features_df['H_A_ST_10'])

print("features_df with new difference column H_H_ST_A_ST_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_ST_A_ST_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "ST_A" history
# (presumably shots on target faced).
# NOTE(review): an earlier cell already assigns H_H_ST_A_ST_A_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_ST_A_ST_A_diff_10'] = features_df['H_H_ST_A_10'].sub(features_df['H_A_ST_A_10'])

print("features_df with new difference column H_H_ST_A_ST_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_ST_A_ST_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "C" history (presumably corners won).
# NOTE(review): an earlier cell already assigns H_H_C_A_C_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_C_A_C_diff_10'] = features_df['H_H_C_10'].sub(features_df['H_A_C_10'])

print("features_df with new difference column H_H_C_A_C_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_C_A_C_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Home-minus-away gap in the 10-match "C_A" history (presumably corners conceded).
# NOTE(review): an earlier cell already assigns H_H_C_A_C_A_diff_10 from the
# same two columns, so this appears to be a redundant recomputation.
features_df['H_H_C_A_C_A_diff_10'] = features_df['H_H_C_A_10'].sub(features_df['H_A_C_A_10'])

print("features_df with new difference column H_H_C_A_C_A_diff_10:")
display(features_df.head(5))

features_df with new difference column H_H_C_A_C_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_F_10 minus H_A_F_10.
features_df['H_H_F_A_F_diff_10'] = features_df['H_H_F_10'].sub(features_df['H_A_F_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_F_A_F_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_F_A_F_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_F_A_10 minus H_A_F_A_10.
features_df['H_H_F_A_F_A_diff_10'] = features_df['H_H_F_A_10'].sub(features_df['H_A_F_A_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_F_A_F_A_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_F_A_F_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_YC_10 minus H_A_YC_10.
features_df['H_H_YC_A_YC_diff_10'] = features_df['H_H_YC_10'].sub(features_df['H_A_YC_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_YC_A_YC_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_YC_A_YC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_YC_A_10 minus H_A_YC_A_10.
features_df['H_H_YC_A_YC_A_diff_10'] = features_df['H_H_YC_A_10'].sub(features_df['H_A_YC_A_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_YC_A_YC_A_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_YC_A_YC_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_RC_10 minus H_A_RC_10.
features_df['H_H_RC_A_RC_diff_10'] = features_df['H_H_RC_10'].sub(features_df['H_A_RC_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_RC_A_RC_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_RC_A_RC_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


In [None]:
# Difference feature: H_H_RC_A_10 minus H_A_RC_A_10.
features_df['H_H_RC_A_RC_A_diff_10'] = features_df['H_H_RC_A_10'].sub(features_df['H_A_RC_A_10'])

# Show the first rows so the new column can be inspected.
print("features_df with new difference column H_H_RC_A_RC_A_diff_10:")
display(features_df.head(n=5))

features_df with new difference column H_H_RC_A_RC_A_diff_10:


Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,...,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,...,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,...,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,...,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,...,,,,,,,,,,


# Ratio

In [None]:
# Ratio of H_HRP_10 to H_ARP_10. The +1 in the denominator avoids dividing
# by zero when the away column is 0; the result is limited to [0, 2].
features_df["Points_ratio_10"] = (
    features_df["H_HRP_10"]
    .div(features_df["H_ARP_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_P_5 to (H_A_P_5 + 1), limited to [0, 2].
features_df["Points_ratio_5"] = (
    features_df["H_H_P_5"]
    .div(features_df["H_A_P_5"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_GS_10 to (H_A_GS_10 + 1), limited to [0, 2].
features_df["GS_ratio_10"] = (
    features_df["H_H_GS_10"]
    .div(features_df["H_A_GS_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_GC_10 to (H_A_GC_10 + 1), limited to [0, 2].
features_df["GC_ratio_10"] = (
    features_df["H_H_GC_10"]
    .div(features_df["H_A_GC_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_HGS_10 to (H_A_HGS_10 + 1), limited to [0, 2].
features_df["HT_GS_ratio_10"] = (
    features_df["H_H_HGS_10"]
    .div(features_df["H_A_HGS_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_HGC_10 to (H_A_HGC_10 + 1), limited to [0, 2].
features_df["HT_GC_ratio_10"] = (
    features_df["H_H_HGC_10"]
    .div(features_df["H_A_HGC_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_S_10 to (H_A_S_10 + 1), limited to [0, 2].
features_df["Shots_ratio_10"] = (
    features_df["H_H_S_10"]
    .div(features_df["H_A_S_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_S_A_10 to (H_A_S_A_10 + 1), limited to [0, 2].
features_df["Shots_against_ratio_10"] = (
    features_df["H_H_S_A_10"]
    .div(features_df["H_A_S_A_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_ST_10 to (H_A_ST_10 + 1), limited to [0, 2].
features_df["ST_ratio_10"] = (
    features_df["H_H_ST_10"]
    .div(features_df["H_A_ST_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_ST_A_10 to (H_A_ST_A_10 + 1), limited to [0, 2].
features_df["ST_against_ratio_10"] = (
    features_df["H_H_ST_A_10"]
    .div(features_df["H_A_ST_A_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_C_10 to (H_A_C_10 + 1), limited to [0, 2].
features_df["Corners_ratio_10"] = (
    features_df["H_H_C_10"]
    .div(features_df["H_A_C_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_C_A_10 to (H_A_C_A_10 + 1), limited to [0, 2].
features_df["Corners_against_ratio_10"] = (
    features_df["H_H_C_A_10"]
    .div(features_df["H_A_C_A_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_F_10 to (H_A_F_10 + 1), limited to [0, 2].
features_df["Fouls_ratio_10"] = (
    features_df["H_H_F_10"]
    .div(features_df["H_A_F_10"].add(1))
    .clip(lower=0, upper=2)
)

In [None]:
# Ratio of H_H_F_A_10 to (H_A_F_A_10 + 1), limited to [0, 2].
features_df["Fouls_against_ratio_10"] = (
    features_df["H_H_F_A_10"]
    .div(features_df["H_A_F_A_10"].add(1))
    .clip(lower=0, upper=2)
)

# Ratio/Difference Feature Collection

## Global Variables

In [None]:
# Number of equal-width bins passed as num_bins when binning the *_ratio_* columns.
RATIO_BINS = 20
# Number of equal-width bins passed as num_bins when binning the *_diff_* columns.
DIFFERENCE_BINS = 10

## Global Function

In [None]:
def create_equal_width_bins(df, column_name, num_bins=10):
    """
    Bin a continuous numerical column into equal-width ordinal bins.

    The observed value range of the column (minimum to maximum, ignoring
    missing values) is split into ``num_bins`` intervals of equal width, and
    each value is replaced by the zero-based position of its interval.

    Args:
        df (pd.DataFrame): The DataFrame containing the column to be binned.
        column_name (str): The name of the continuous column (e.g., 'ST_ratio_10').
        num_bins (int): The number of equal-width bins to create (default is 10).

    Returns:
        pd.Series: Bin positions (0 to num_bins - 1) aligned with ``df``'s
                   index. Missing input values stay NaN, so the result is
                   float-typed whenever the column contains NaN. A column
                   with no non-missing values yields an all-NaN Series.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    values = df[column_name]

    # With no non-missing values there is no min/max to derive bin edges from
    # (pd.cut rejects an empty array outright), so return "no bin" for every
    # row instead of failing the whole feature pipeline.
    if values.notna().sum() == 0:
        return pd.Series(
            float("nan"), index=values.index, name=values.name, dtype="float64"
        )

    # labels=False makes pd.cut return the integer bin position rather than
    # an Interval; right=True closes each bin on its upper edge and
    # include_lowest=True keeps the minimum value inside the first bin.
    return pd.cut(
        values,
        bins=num_bins,
        labels=False,
        include_lowest=True,
        right=True,
    )

## Difference

In [None]:
# Equal-width bin position of H_HRP_ARP_diff_10, using DIFFERENCE_BINS bins.
features_df['H_HRP_ARP_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_HRP_ARP_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_P_A_P_diff_5, using DIFFERENCE_BINS bins.
features_df['H_H_P_A_P_diff_5_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_P_A_P_diff_5',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_GS_A_GS_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_GS_A_GS_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_GS_A_GS_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_GC_A_GC_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_GC_A_GC_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_GC_A_GC_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_S_A_S_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_S_A_S_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_S_A_S_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_S_A_S_A_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_S_A_S_A_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_S_A_S_A_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_ST_A_ST_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_ST_A_ST_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_ST_A_ST_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_ST_A_ST_A_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_ST_A_ST_A_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_ST_A_ST_A_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_C_A_C_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_C_A_C_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_C_A_C_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_C_A_C_A_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_C_A_C_A_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_C_A_C_A_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_F_A_F_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_F_A_F_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_F_A_F_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_F_A_F_A_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_F_A_F_A_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_F_A_F_A_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_YC_A_YC_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_YC_A_YC_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_YC_A_YC_diff_10',
    num_bins=DIFFERENCE_BINS,
)

In [None]:
# Equal-width bin position of H_H_YC_A_YC_A_diff_10, using DIFFERENCE_BINS bins.
features_df['H_H_YC_A_YC_A_diff_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='H_H_YC_A_YC_A_diff_10',
    num_bins=DIFFERENCE_BINS,
)

## Ratio

In [None]:
# Equal-width bin position of Points_ratio_10, using RATIO_BINS bins.
features_df['Points_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Points_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Points_ratio_5, using RATIO_BINS bins.
features_df['Points_ratio_5_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Points_ratio_5',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of GS_ratio_10, using RATIO_BINS bins.
features_df['GS_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='GS_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of GC_ratio_10, using RATIO_BINS bins.
features_df['GC_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='GC_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of HT_GS_ratio_10, using RATIO_BINS bins.
features_df['HT_GS_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='HT_GS_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of HT_GC_ratio_10, using RATIO_BINS bins.
features_df['HT_GC_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='HT_GC_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Shots_ratio_10, using RATIO_BINS bins.
features_df['Shots_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Shots_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Shots_against_ratio_10, using RATIO_BINS bins.
features_df['Shots_against_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Shots_against_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of ST_ratio_10, using RATIO_BINS bins.
features_df['ST_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='ST_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of ST_against_ratio_10, using RATIO_BINS bins.
features_df['ST_against_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='ST_against_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Corners_ratio_10, using RATIO_BINS bins.
features_df['Corners_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Corners_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Corners_against_ratio_10, using RATIO_BINS bins.
features_df['Corners_against_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Corners_against_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Fouls_ratio_10, using RATIO_BINS bins.
features_df['Fouls_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Fouls_ratio_10',
    num_bins=RATIO_BINS,
)

In [None]:
# Equal-width bin position of Fouls_against_ratio_10, using RATIO_BINS bins.
features_df['Fouls_against_ratio_10_Range_Bin'] = create_equal_width_bins(
    df=features_df,
    column_name='Fouls_against_ratio_10',
    num_bins=RATIO_BINS,
)

# Visualize Dataset

In [None]:
# Lift the column display limit so every column of the frame is rendered.
pd.options.display.max_columns = None
# Last expression of the cell: the notebook renders the first 10 rows.
features_df.head(n=10)

Unnamed: 0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,Season,BFH,BFD,BFA,1XBH,1XBD,1XBA,BFEH,BFED,BFEA,BFE>2.5,BFE<2.5,BFEAHH,BFEAHA,BFCH,BFCD,BFCA,1XBCH,1XBCD,1XBCA,BFECH,BFECD,BFECA,BFEC>2.5,BFEC<2.5,BFECAHH,BFECAHA,MatchDateTime,MatchOrder,HP,AP,H_HRP_10,H_ARP_10,H_H_P_5,H_A_P_5,H_H2H_H_2,H_H2H_A_2,H_H_GS_10,H_A_GS_10,H_H_GC_10,H_A_GC_10,H_H_HGS_10,H_A_HGS_10,H_H_HGC_10,H_A_HGC_10,H_H_S_10,H_A_S_10,H_H_S_A_10,H_A_S_A_10,H_H_ST_10,H_A_ST_10,H_H_ST_A_10,H_A_ST_A_10,H_H_C_10,H_A_C_10,H_H_C_A_10,H_A_C_A_10,H_H_F_10,H_A_F_10,H_H_F_A_10,H_A_F_A_10,H_H_YC_10,H_A_YC_10,H_H_YC_A_10,H_A_YC_A_10,H_H_RC_10,H_A_RC_10,H_H_RC_A_10,H_A_RC_A_10,HomeTeam_Lag1_Result,AwayTeam_Lag1_Result,HomeTeam_Lag2_Result,AwayTeam_Lag2_Result,HomeTeam_Lag3_Result,AwayTeam_Lag3_Result,HomeTeam_Lag4_Result,AwayTeam_Lag4_Result,HomeTeam_Lag5_Result,AwayTeam_Lag5_Result,H_HRP_ARP_diff_10,H_H_P_A_P_diff_5,H_H_GS_A_GS_diff_10,H_H_GC_A_GC_diff_10,H_H_S_A_S_diff_10,H_H_S_A_S_A_diff_10,H_H_ST_A_ST_diff_10,H_H_ST_A_ST_A_diff_10,H_H_C_A_C_diff_10,H_H_C_A_C_A_diff_10,H_H_F_A_F_diff_10,H_H_F_A_F_A_diff_10,H_H_YC_A_YC_diff_10,H_H_YC_A_YC_A_diff_10,H_H_RC_A_RC_diff_10,H_H_RC_A_RC_A_diff_10,Points_ratio_10,Points_ratio_5,GS_ratio_10,GC_ratio_10,HT_GS_ratio_10,HT_GC_ratio_10,Shots_ratio_10,Shots_against_ratio_10,ST_ratio_10,ST_against_ratio_10,Corners_ratio_10,Corners_against_ratio_10,Fouls_ratio_10,Fouls_against_ratio_10,H_HRP_ARP_diff_10_Range_Bin,H_H_P_A_P_diff_5_Range_
Bin,H_H_GS_A_GS_diff_10_Range_Bin,H_H_GC_A_GC_diff_10_Range_Bin,H_H_S_A_S_diff_10_Range_Bin,H_H_S_A_S_A_diff_10_Range_Bin,H_H_ST_A_ST_diff_10_Range_Bin,H_H_ST_A_ST_A_diff_10_Range_Bin,H_H_C_A_C_diff_10_Range_Bin,H_H_C_A_C_A_diff_10_Range_Bin,H_H_F_A_F_diff_10_Range_Bin,H_H_F_A_F_A_diff_10_Range_Bin,H_H_YC_A_YC_diff_10_Range_Bin,H_H_YC_A_YC_A_diff_10_Range_Bin,Points_ratio_10_Range_Bin,Points_ratio_5_Range_Bin,GS_ratio_10_Range_Bin,GC_ratio_10_Range_Bin,HT_GS_ratio_10_Range_Bin,HT_GC_ratio_10_Range_Bin,Shots_ratio_10_Range_Bin,Shots_against_ratio_10_Range_Bin,ST_ratio_10_Range_Bin,ST_against_ratio_10_Range_Bin,Corners_ratio_10_Range_Bin,Corners_against_ratio_10_Range_Bin,Fouls_ratio_10_Range_Bin,Fouls_against_ratio_10_Range_Bin
0,E0,Fulham,Arsenal,0,3,A,0,1,A,C Kavanagh,5,13,2,6,12,12,2,3,2,2,0,0,6.0,4.33,1.53,5.5,4.25,1.57,6.0,3.9,1.57,6.16,4.51,1.56,6.5,4.2,1.53,6.5,4.2,1.55,6.55,4.55,1.6,5.94,4.34,1.55,1.72,2.1,1.8,2.13,1.84,2.18,1.76,2.1,1.0,1.93,1.97,1.96,1.96,2.0,1.99,1.93,1.95,5.0,4.0,1.66,5.5,4.0,1.62,5.25,3.9,1.67,5.48,3.98,1.69,5.5,3.8,1.65,5.5,3.9,1.67,5.75,4.2,1.71,5.36,3.93,1.67,2.0,1.8,2.06,1.86,2.1,1.92,2.0,1.84,0.75,2.01,1.89,2.02,1.91,2.13,1.92,2.02,1.87,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-12 12:30:00,1,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,E0,Crystal Palace,Southampton,1,0,H,1,0,H,J Moss,5,9,3,5,14,11,7,3,2,1,0,0,3.1,3.25,2.37,3.0,3.2,2.45,3.15,2.95,2.4,3.32,3.29,2.4,3.2,3.2,2.35,3.2,3.2,2.4,3.36,3.36,2.5,3.18,3.22,2.39,2.2,1.66,2.34,1.68,2.36,1.73,2.24,1.67,0.25,1.85,2.05,1.88,2.05,1.88,2.07,1.84,2.03,3.0,3.25,2.4,3.0,3.3,2.4,3.05,2.9,2.45,3.09,3.27,2.54,3.1,3.1,2.45,3.1,3.25,2.45,3.25,3.33,2.55,3.08,3.22,2.47,2.2,1.66,2.26,1.72,2.27,1.78,2.18,1.7,0.25,1.78,2.13,1.79,2.17,1.85,2.18,1.79,2.12,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-12 15:00:00,2,3,3,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,E0,Liverpool,Leeds,4,3,H,3,2,H,M Oliver,22,6,6,3,9,6,9,0,1,0,0,0,1.28,6.0,9.5,1.26,6.25,10.5,1.35,5.0,8.5,1.31,6.25,9.92,1.27,6.0,10.0,1.3,5.75,10.5,1.35,6.5,10.75,1.3,5.96,9.68,1.53,2.5,1.56,2.6,1.56,2.68,1.52,2.53,-1.5,1.95,1.95,1.97,1.95,2.0,2.08,1.9,1.97,1.25,6.0,11.0,1.25,6.25,11.0,1.3,6.0,9.0,1.28,6.34,11.38,1.25,6.0,12.0,1.29,6.0,11.5,1.3,6.75,12.27,1.28,6.16,10.63,1.5,2.62,1.51,2.76,1.53,2.82,1.5,2.62,-1.5,1.85,2.05,1.85,2.08,1.9,2.16,1.84,2.04,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-12 17:30:00,3,3,3,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E0,West Ham,Newcastle,0,2,A,0,0,D,S Attwell,15,15,3,2,13,7,8,7,2,2,0,0,2.15,3.4,3.4,2.15,3.4,3.4,2.15,3.15,3.4,2.18,3.61,3.5,2.15,3.5,3.4,2.15,3.4,3.6,2.24,3.7,3.6,2.15,3.48,3.42,1.9,1.9,2.0,1.91,2.05,1.95,1.97,1.86,-0.5,2.07,1.72,2.17,1.78,2.17,1.81,2.12,1.75,1.95,3.6,3.75,1.95,3.7,3.75,2.05,3.25,3.75,2.04,3.59,3.92,2.0,3.5,3.8,2.0,3.5,3.9,2.07,3.78,3.99,2.01,3.57,3.79,1.9,1.9,2.0,1.92,2.0,2.05,1.91,1.92,-0.5,2.03,1.87,2.04,1.88,2.09,1.91,2.02,1.86,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-12 20:00:00,4,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,E0,West Brom,Leicester,0,3,A,0,0,D,A Taylor,7,13,1,7,12,9,2,5,1,1,0,0,3.8,3.6,1.95,3.7,3.6,2.0,3.85,3.2,2.0,4.0,3.59,2.0,3.8,3.6,1.95,4.0,3.5,1.95,4.0,3.82,2.04,3.87,3.57,1.97,1.9,1.9,2.0,1.91,2.02,2.03,1.92,1.9,0.5,1.91,1.99,1.92,2.0,1.93,2.02,1.88,1.97,3.25,3.4,2.2,3.3,3.4,2.2,3.35,3.0,2.3,3.38,3.38,2.32,3.3,3.3,2.25,3.3,3.3,2.3,3.55,3.5,2.38,3.32,3.33,2.28,2.2,1.66,2.23,1.74,2.28,1.82,2.15,1.73,0.25,1.92,1.98,1.93,1.99,1.95,2.01,1.91,1.97,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-13 14:00:00,5,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,E0,Tottenham,Everton,0,1,A,0,0,D,M Atkinson,9,15,5,4,15,7,5,3,1,0,0,0,1.83,3.6,4.33,1.85,3.6,4.33,1.95,3.25,4.1,1.94,3.57,4.3,1.88,3.5,4.33,1.87,3.5,4.33,1.98,3.68,4.5,1.9,3.55,4.2,1.9,1.9,2.01,1.9,2.04,1.95,1.94,1.88,-0.5,1.91,1.99,1.93,1.99,1.93,2.02,1.9,1.96,2.0,3.4,3.8,2.0,3.4,3.9,2.05,3.1,3.75,2.09,3.5,3.88,2.05,3.3,3.8,2.1,3.4,3.75,2.16,3.5,4.05,2.07,3.39,3.79,2.0,1.8,2.08,1.85,2.15,1.92,2.05,1.79,-0.5,2.09,1.81,2.09,1.85,2.16,1.86,2.08,1.81,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-13 16:30:00,6,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,E0,Sheffield United,Wolves,0,2,A,0,2,A,M Dean,9,11,2,4,13,7,12,5,2,1,0,0,3.25,3.1,2.37,3.4,3.1,2.3,3.35,2.7,2.45,3.41,3.03,2.46,3.3,3.1,2.38,3.4,3.0,2.38,3.46,3.24,2.52,3.36,3.0,2.41,2.62,1.5,2.84,1.48,2.88,1.53,2.68,1.48,0.25,1.86,2.04,1.85,2.06,1.88,2.07,1.84,2.02,3.2,2.9,2.55,3.2,2.9,2.55,3.2,2.6,2.65,3.28,2.9,2.68,3.25,2.8,2.6,3.3,2.88,2.55,3.5,2.97,2.77,3.23,2.86,2.62,3.2,1.36,3.15,1.41,3.2,1.44,3.01,1.39,0.25,1.7,2.1,1.74,2.23,1.86,2.28,1.74,2.18,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-14 18:00:00,7,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,E0,Brighton,Chelsea,1,3,A,0,1,A,C Pawson,13,10,3,5,8,13,4,3,1,0,0,0,5.0,4.33,1.61,5.0,4.1,1.65,5.0,3.7,1.67,5.36,4.34,1.65,5.25,4.0,1.65,5.25,3.9,1.65,5.36,4.35,1.71,5.12,4.13,1.65,1.72,2.1,1.79,2.14,1.84,2.18,1.76,2.08,0.75,2.06,1.84,2.11,1.83,2.11,1.88,2.05,1.82,5.75,4.4,1.53,5.75,4.25,1.55,5.5,4.2,1.6,6.07,4.46,1.57,6.0,4.2,1.55,6.0,4.2,1.57,6.4,4.7,1.6,5.91,4.35,1.56,1.72,2.1,1.71,2.27,1.8,2.27,1.71,2.17,1.0,1.93,1.97,1.94,1.98,2.02,2.02,1.93,1.95,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-14 20:15:00,8,0,0,,,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,E0,Everton,West Brom,5,2,H,2,1,H,M Dean,17,6,7,4,9,11,11,1,1,0,0,1,1.5,4.2,6.5,1.55,4.2,6.0,1.55,4.0,6.0,1.55,4.33,6.46,1.52,4.2,6.5,1.5,4.2,7.0,1.59,4.5,7.0,1.54,4.25,6.3,1.87,2.03,1.87,2.05,1.9,2.13,1.81,2.03,-1.0,1.91,1.99,1.97,1.96,1.99,2.04,1.91,1.97,1.57,4.0,6.0,1.57,4.0,6.0,1.6,3.8,6.25,1.61,4.12,6.41,1.55,3.9,6.5,1.6,3.9,6.5,1.66,4.23,6.85,1.59,4.03,6.15,1.9,1.9,2.03,1.89,2.07,1.94,1.98,1.85,-1.0,2.09,1.81,2.14,1.82,2.15,1.9,2.06,1.83,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-19 12:30:00,9,3,3,0.0,0.0,,,0,0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,15.0,7.0,9.0,13.0,4.0,1.0,5.0,7.0,3.0,2.0,5.0,5.0,7.0,12.0,15.0,9.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,,,,,,,,,0.0,,1.0,-3.0,8.0,-4.0,3.0,-2.0,1.0,0.0,-5.0,6.0,-1.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,1.875,0.642857,2.0,0.625,1.0,0.833333,0.538462,1.5,5.0,,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,0.0,,9.0,0.0,0.0,0.0,18.0,5.0,19.0,5.0,9.0,7.0,4.0,14.0
9,E0,Leeds,Fulham,4,3,H,2,1,H,A Taylor,10,14,7,6,13,18,5,3,1,2,0,0,1.61,3.9,5.75,1.62,4.0,5.5,1.65,3.7,5.5,1.65,4.05,5.61,1.63,3.9,5.5,1.62,3.9,6.0,1.7,4.15,6.0,1.64,3.97,5.5,1.97,1.93,1.96,1.96,1.99,2.0,1.93,1.91,-0.75,1.81,2.09,1.85,2.08,1.86,2.14,1.82,2.07,1.66,3.8,5.25,1.67,4.0,5.0,1.75,3.55,4.9,1.75,3.91,5.12,1.7,3.7,5.25,1.73,3.7,5.25,1.8,4.0,5.7,1.73,3.81,5.05,2.01,1.89,2.03,1.89,2.04,2.05,1.96,1.88,-0.75,1.97,1.93,1.99,1.94,2.01,2.03,1.95,1.93,2020-21,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-09-19 15:00:00,10,3,3,3.0,0.0,,,0,0,3.0,0.0,4.0,3.0,2.0,0.0,3.0,1.0,6.0,5.0,22.0,13.0,3.0,2.0,6.0,6.0,0.0,2.0,9.0,3.0,6.0,12.0,9.0,12.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,3.0,,3.0,1.0,1.0,9.0,1.0,0.0,-2.0,6.0,-6.0,-3.0,-2.0,-1.0,0.0,0.0,2.0,,2.0,1.0,2.0,1.5,1.0,1.571429,1.0,0.857143,0.0,2.0,0.461538,0.692308,6.0,,4.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,19.0,,19.0,9.0,19.0,14.0,9.0,15.0,9.0,7.0,0.0,19.0,3.0,6.0


# Save Dataset

In [None]:
# Write the full feature table to CSV; index=False leaves the row index out of the file.
features_df.to_csv(
    path_or_buf='features_df.csv',
    index=False,
)