In [29]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

path = r'/Users/tom/Documents/Coding/AiCore/Projects/4. Football Match Outcome Predictor /Scraped Datasets/cleaned_populated_results_df.csv'

scraped_df = pd.read_csv(path)

missing_information_list= []

league_list = ['segunda_division', 'ligue_2', 'bundesliga', 'serie_b',
       'premier_league', 'eredivisie', 'serie_a', '2_liga',
       'primeira_liga', 'ligue_1', 'primera_division', 'segunda_liga',
       'championship']
    

In [30]:
# Group into decades

def _split_dataset_into_decades(combined_elo_with_partial_seasons_removed_df):
    '''split_dataset_into_decades 
    Function to split dataset into three datasets each approximately 10 years long
    '''

    combined_elo_with_partial_seasons_removed_df = combined_elo_with_partial_seasons_removed_df.sort_values(by=['Season'], ascending=[True])

    decade_1 = (combined_elo_with_partial_seasons_removed_df["Season"].unique()[0:10])
    decade_2 = (combined_elo_with_partial_seasons_removed_df["Season"].unique()[10:20])
    decade_3 = (combined_elo_with_partial_seasons_removed_df["Season"].unique()[20:32])

    season_grouped_df = combined_elo_with_partial_seasons_removed_df.groupby("Season")

    decade_1_df = pd.DataFrame()
    decade_2_df = pd.DataFrame()
    decade_3_df = pd.DataFrame()

    for season in decade_1:
        decade_df = season_grouped_df.get_group(season)
        decade_1_df = pd.concat([decade_1_df, decade_df], axis=0, ignore_index=True)

    for season in decade_2:
        decade_df = season_grouped_df.get_group(season)
        decade_2_df = pd.concat([decade_2_df, decade_df], axis=0, ignore_index=True)

    for season in decade_3:
        decade_df = season_grouped_df.get_group(season)
        decade_3_df = pd.concat([decade_3_df, decade_df], axis=0, ignore_index=True)


    return decade_1_df, decade_2_df, decade_3_df

def _group_into_divisions(decade_df, division):
    '''_group_into_leagues _summary_

    Arguments:
        decade_df -- _description_
        division -- _description_
    '''

    league_grouped_df = decade_df.groupby("League")
    division_df = league_grouped_df.get_group(division)

    return division_df

def _create_average_feature_columns(split_df):
    '''create_new_feature_columns 
    Function to create columns filled with zeros for each required feature

    Arguments:
        combined_local_elo_df -- _description_
    '''
    # Get an average goals_scored and conceeded

    split_df["home_team_average_goals_scored_per_game"] = 0
    split_df["home_team_average_goals_conceeded_per_game"] = 0
    split_df["away_team_average_goals_scored_per_game"] = 0
    split_df["away_team_average_goals_conceeded_per_game"] = 0

    return split_df

def _create_split_averages_summary_template(split_df):
    '''create_summary_template 
    Function to create auxiliary template summary per decade

    Arguments:
        decade_df -- _description_

    Returns:
        _description_
    '''
    summary_df_template = pd.DataFrame(columns=
    [
        'team_name', 'league', 'total_goals_scored', 'total_goals_conceeded', 'games_played', 'goals_scored_per_game', 
        'goals_conceeded_per_game'
        ])
    team_group = split_df.groupby("Home_Team")
    home_teams = list(split_df["Home_Team"].unique())
    away_teams = list(split_df["Away_Team"].unique())
    missing_teams = list(set(away_teams) - set(home_teams))
    team_list = home_teams + missing_teams

    for team_name in team_list:
        league = team_group.first()["League"].unique()[0]
        team_stats_dict = {
            'team_name': team_name, 
            'league': league, 'total_goals_scored': 0, 
            'total_goals_conceeded': 0, 'games_played': 0, 'goals_scored_per_game':0, 
            'goals_conceeded_per_game':0
            }
        team_stats_dict_df = pd.DataFrame([team_stats_dict])
        summary_df_template = pd.concat([summary_df_template, team_stats_dict_df], ignore_index=True)


    print("Summary Template Complete")
    return summary_df_template

def populate_df_with_averages(split_df, summary_df_template):
    '''populate_df_with_averages 
    Function to compute and add average goals per game to combined dataframe.

    This is achieved by:
    1. Ordering split df by season and round.
    2. Splitting df into seasons
    3. For each season splitting into round
    4a. In each round split, if data exists, inputting into split df
    4b. This is put here so that averages for the round 1 before are put into round 2
    5. For each row in the round df goals per game is computed
    6. This is then added to a auxilary df where a running count of the averages are stored
    7. This is then updated into the split df for each match per round


    Arguments:
        split_df -- _description_

    Returns:
        _description_
    '''

    split_df = split_df.sort_values(['Season', 'Round'], ascending=[True, True])

    season_df_grouped = split_df.groupby("Season")

    for season in split_df["Season"].unique():
        season_df = season_df_grouped.get_group(season)
        round_df_grouped = season_df.groupby("Round")
        for round in season_df["Round"].unique():
            print(f"Starting round {round}, season {season}")

            round_df = round_df_grouped.get_group(round)
            # Set the values of the previous match
            for team in list(summary_df_template["team_name"]):

                populated_season_df = _add_home_team_feature_total_to_df("home_team_average_goals_scored_per_game","goals_scored_per_game", team, season_df, summary_df_template, round)
                populated_season_df = _add_home_team_feature_total_to_df("home_team_average_goals_conceeded_per_game","goals_conceeded_per_game", team, season_df, summary_df_template, round)
        
                populated_season_df = _add_away_team_feature_total_to_df("away_team_average_goals_scored_per_game","goals_scored_per_game", team, season_df, summary_df_template, round)
                populated_season_df = _add_away_team_feature_total_to_df("away_team_average_goals_conceeded_per_game","goals_conceeded_per_game", team, season_df, summary_df_template, round)
            

            for index, row in round_df.iterrows():

                home_team_name = row["Home_Team"]
                away_team_name = row["Away_Team"]
                league = row["League"]
                season=row["Season"]
                round=row["Round"]
                home_goals_scored = row["Home_Goals"]
                away_goals_scored = row["Away_Goals"]

                #Home Team
                summary_df_template = _goals_and_averages_computer(summary_df_template, home_team_name, home_goals_scored, 
                away_goals_scored, league)

                #Away Team
                summary_df_template = _goals_and_averages_computer(summary_df_template, away_team_name, away_goals_scored, 
                home_goals_scored, league)
        
        
        split_df = merge_season_df_with_combined_df(split_df, populated_season_df)

        print(f"Average Goals computed and added to df for season {season}")

    return split_df


def _goals_and_averages_computer(summary_df_template, team, goal_option_1, goal_option_2, league):
    '''_goals_and_averages_computer _summary_

    Arguments:
        summary_df_template -- _description_
        team -- _description_
        goal_option_1 -- _description_
        goal_option_2 -- _description_
        league -- _description_

    Returns:
        _description_
    '''

    summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "total_goals_scored"] += goal_option_1

    summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "total_goals_conceeded"] += goal_option_2

    summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "games_played"] += 1

    summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "goals_scored_per_game"] = summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "total_goals_scored"] / summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "games_played"]

    summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "goals_conceeded_per_game"] = summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "total_goals_conceeded"] / summary_df_template.loc[((summary_df_template.team_name == team) & (
    summary_df_template.league == league)), "games_played"]

    return summary_df_template


def _add_home_team_feature_total_to_df(feature,summary_feature, team, df, summary_df_template, round):
    '''_add_home_team_feature_total_to_df _summary_

    Arguments:
        feature -- _description_
        summary_feature -- _description_
        team -- _description_
        df -- _description_
        summary_df_template -- _description_
        round -- _description_

    Returns:
        _description_
    '''
    
    df.loc[(df.Home_Team == team) & ((
                df.Round == round)), feature] = summary_df_template.loc[(
                    summary_df_template.team_name == team), summary_feature].values[0]
    return df

def _add_away_team_feature_total_to_df(feature,summary_feature, team, df, summary_df_template, round):
    '''_add_away_team_feature_total_to_df _summary_

    Arguments:
        feature -- _description_
        summary_feature -- _description_
        team -- _description_
        df -- _description_
        summary_df_template -- _description_
        round -- _description_

    Returns:
        _description_
    '''
    
    df.loc[((df.Away_Team == team) & (
                df.Round == round)), feature] = summary_df_template.loc[((
                    summary_df_template.team_name == team)), summary_feature].values[0]
    return df

def merge_season_df_with_combined_df(split_df, populated_season_df):
    '''merge_season_df_with_combined_df _summary_

    Arguments:
        df -- _description_
        populated_season_df -- _description_

    Returns:
        _description_
    '''
    split_df.update(populated_season_df, overwrite=True)

    return split_df

def _drop_loader_season(split_df, loader_season):
    '''drop_loader_season 
    Each season includes the year before for the average goals p/game statistic.
    This season is then dropped from the dataset so that the early rounds of each 
    season are no so heavily wegithed on the game prior


    Arguments:
        split_df -- _description_
        loader_season -- _description_

    Returns:
        _description_
    '''
    

    split_df.drop(split_df.index[(split_df.Season == loader_season)], inplace=True)

    return split_df

def _prepare_dataset_for_ML_models(df):


    df.drop(axis=1, columns=['Round','Home_Win', 'Away_Win', 'Season', 'League'], inplace=True)
    df.drop(axis=1, columns=['Home_Goals', 'Away_Goals'], inplace=True)
    
    return df


In [42]:

# for scraped data:

scraped_df_with_feat = _create_average_feature_columns(scraped_df)
scraped_summary_template = _create_split_averages_summary_template(scraped_df_with_feat)
populated_scraped_df = populate_df_with_averages(scraped_df_with_feat, scraped_summary_template)

# drop the seasons collected to generate features for the matches wanted

populated_scraped_df_2021 = _drop_loader_season(populated_scraped_df, 2021)
populated_scraped_df_2022 = _drop_loader_season(populated_scraped_df_2021, 2022)

# only take out round - to be predicted

to_predict_df = populated_scraped_df_2022.loc[populated_scraped_df_2022.Round == 8]


Summary Template Complete
Starting round 1.0, season 2021.0
Starting round 2.0, season 2021.0
Starting round 3.0, season 2021.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[(df.Home_Team == team) & ((
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[((df.Away_Team == team) & (


Starting round 4.0, season 2021.0
Starting round 5.0, season 2021.0
Starting round 6.0, season 2021.0
Starting round 7.0, season 2021.0
Starting round 8.0, season 2021.0
Starting round 9.0, season 2021.0
Starting round 10.0, season 2021.0
Starting round 11.0, season 2021.0
Starting round 12.0, season 2021.0
Starting round 13.0, season 2021.0
Starting round 14.0, season 2021.0
Starting round 15.0, season 2021.0
Starting round 16.0, season 2021.0
Starting round 17.0, season 2021.0
Starting round 18.0, season 2021.0
Starting round 19.0, season 2021.0
Starting round 20.0, season 2021.0
Starting round 21.0, season 2021.0
Starting round 22.0, season 2021.0
Starting round 23.0, season 2021.0
Starting round 24.0, season 2021.0
Starting round 25.0, season 2021.0
Starting round 26.0, season 2021.0
Starting round 27.0, season 2021.0
Starting round 28.0, season 2021.0
Starting round 29.0, season 2021.0
Starting round 30.0, season 2021.0
Starting round 31.0, season 2021.0
Starting round 32.0, seaso

In [43]:
populated_scraped_df_for_ML

Unnamed: 0,Home_Team,Away_Team,ELO_Home,ELO_Away,home_team_total_goals_scored_so_far,home_team_total_goals_conceeded_so_far,home_team_current_win_streak,home_team_current_loss_streak,home_team_total_points_so_far,home_team_current_goal_drought,home_team_total_wins_so_far,away_team_total_goals_scored_so_far,away_team_total_goals_conceeded_so_far,away_team_current_win_streak,away_team_current_loss_streak,away_team_total_points_so_far,away_team_current_goal_drought,away_team_total_wins_so_far,home_team_average_goals_scored_per_game,home_team_average_goals_conceeded_per_game,away_team_average_goals_scored_per_game,away_team_average_goals_conceeded_per_game
0,Fulham,Arsenal,69.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,Crystal Palace,Southampton,74.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,Liverpool,Leeds United,96.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,West Ham,Newcastle,76.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,West Bromwich Albion,Leicester,79.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,Tottenham Hotspur,Leicester,92.0,83.0,12.0,5.0,1.0,0.0,14.0,0.0,4.0,8.0,16.0,0.0,5.0,1.0,0.0,0.0,1.817073,1.097561,1.682927,1.524390
826,Brentford,Arsenal,62.0,95.0,15.0,9.0,1.0,0.0,9.0,0.0,2.0,14.0,7.0,0.0,1.0,15.0,0.0,5.0,1.431818,1.477273,1.585366,1.146341
827,Everton,West Ham,84.0,80.0,4.0,6.0,0.0,2.0,4.0,1.0,0.0,3.0,8.0,0.0,1.0,4.0,0.0,1.0,1.146341,1.463415,1.524390,1.292683
828,Man. Utd,Leeds United,93.0,62.0,8.0,8.0,4.0,0.0,12.0,0.0,4.0,10.0,10.0,0.0,2.0,8.0,0.0,2.0,1.682927,1.329268,1.390244,1.743902


In [41]:
#path = r'/Users/tom/Documents/Coding/AiCore/Projects/4. Football Match Outcome Predictor /Scraped Datasets/populated_scraped_df_for_ML.csv'
#populated_scraped_df_for_ML.to_csv(path, index=False)

In [36]:
scraped_summary_template.sort_values(by=['total_goals_scored'], ascending=[False])

Unnamed: 0,team_name,league,total_goals_scored,total_goals_conceeded,games_played,goals_scored_per_game,goals_conceeded_per_game
9,Man. City,premier_league,202.0,64.0,83,2.433735,0.771084
2,Liverpool,premier_league,177.0,74.0,83,2.13253,0.891566
5,Tottenham Hotspur,premier_league,149.0,90.0,83,1.795181,1.084337
16,Chelsea,premier_league,142.0,78.0,83,1.710843,0.939759
12,Man. Utd,premier_league,138.0,109.0,83,1.662651,1.313253
17,Leicester,premier_league,138.0,125.0,83,1.662651,1.506024
13,Arsenal,premier_league,130.0,94.0,83,1.566265,1.13253
3,West Ham,premier_league,125.0,106.0,83,1.506024,1.277108
11,Leeds United,premier_league,114.0,143.0,83,1.373494,1.722892
18,Aston Villa,premier_league,112.0,110.0,83,1.349398,1.325301
