In [1]:
# importing data manipulation libraries
import pandas as pd
import numpy as np
# from ydata_profiling import ProfileReport

# importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# configure notebook for inline plotting
%matplotlib inline

# config pandas to display more than 20 columns
pd.set_option('display.max_columns',250)

# set grid style 
sns.set_style('darkgrid')

# Data Understanding
The objective in this section is to learn as much as possible about the our match data. Here, we will get a high-level practical understnading of our data and by the end of the process, we should have a clear undertanding of the structure of the dataset, how to clean the data, the target variables and possible modelling techniques.

In [2]:
# read data into pandas dataframe
data = pd.read_csv('./data/Matches.csv', header=0)

In [3]:
# check the shape of dataframe
data.shape

(6460, 215)

The dataset consists of 6,460 observations and 215 features some of which are our target variables.

In [4]:
# check types of columns
data.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64'), dtype('bool')],
      dtype=object)

The result indicates that our dataset consists of categorical (object), boolean and numeric (float and int) features. Before proceeding further, let's confirm that the dtypes match what is expected in the documentation.

### To do (Create table of columns and their definitions) - Use datawrapper and insert link 



## columns to be used in modelling 
- total goal count (will be target variable for objective two)
- team_a_cards_num	team_b_cards_num(objective three)
- odds_ft_1	odds_ft_x	odds_ft_2
- winningTeam (objective one )- re-code column to 1x2
- home_ppg	away_ppg
- team_a_xg_prematch	team_b_xg_prematch	total_xg_prematch

target columns 
- total goal count
- total cards(obtain by summing team a and b cards)
- winningTeam 1x2

features
- odds_ft_1	odds_ft_x	odds_ft_2
- home_ppg	away_ppg
- team_a_xg_prematch	team_b_xg_prematch	total_xg_prematch


In [5]:
# inspect the first few rows
data.head(20)

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,homeGoalCount,awayGoalCount,totalGoalCount,team_a_corners,team_b_corners,totalCornerCount,team_a_offsides,team_b_offsides,team_a_yellow_cards,team_b_yellow_cards,team_a_red_cards,team_b_red_cards,team_a_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_b_shots,team_a_fouls,team_b_fouls,team_a_possession,team_b_possession,refereeID,coach_a_ID,coach_b_ID,stadium_name,stadium_location,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,odds_ft_over05,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_ft_under05,odds_ft_under15,odds_ft_under25,odds_ft_under35,odds_ft_under45,odds_btts_yes,odds_btts_no,odds_team_a_cs_yes,odds_team_a_cs_no,odds_team_b_cs_yes,odds_team_b_cs_no,odds_doublechance_1x,odds_doublechance_12,odds_doublechance_x2,odds_1st_half_result_1,odds_1st_half_result_x,odds_1st_half_result_2,odds_2nd_half_result_1,odds_2nd_half_result_x,odds_2nd_half_result_2,odds_dnb_1,odds_dnb_2,odds_corners_over_75,odds_corners_over_85,odds_corners_over_95,odds_corners_over_105,odds_corners_over_115,odds_corners_under_75,odds_corners_under_85,odds_corners_under_95,odds_corners_under_105,odds_corners_under_115,odds_corners_1,odds_corners_x,odds_corners_2,odds_team_to_score_first_1,odds_team_to_score_first_x,odds_team_to_score_first_2,odds_win_to_nil_1,odds_win_to_nil_2,odds_1st_half_over05,odds_1st_half_over15,odds_1st_half_over25,odds_1st_half_over35,odds_1st_half_under05,odds_1st_half_under15,odds_1st_half_under25,odds_1st_half_under35,odds_2nd_half_over05,odds_2nd_half_over15,odds_2nd_half_over25,odds_2nd_half_over35,odds_2nd_half_under05,odds_2nd_half_under15,odds_2nd_half_under25,odds_2nd_half_under35,odds_btts_1st_half_yes,odds_btts_1st_half_no,odds_btts_2nd_half_yes,odds_btts_2nd_half_no,overallGoalCount,ht_goals_team_a,ht_goals_team_b,goals_2hg_team_a,goals_2hg_team_b,GoalCount_2hg,HTGoalCount,date_unix,winningTeam,no_home_away,btts_potential,btts_fhg_potential,btts_2hg_potential,goalTimingDisabled,attendance,corner_timings_recorded,card_timings_recorded,team_a_fh_corners,team_b_fh_corners,team_a_2h_corners,team_b_2h_corners,corner_fh_count,corner_2h_count,team_a_fh_cards,team_b_fh_cards,team_a_2h_cards,team_b_2h_cards,total_fh_cards,total_2h_cards,attacks_recorded,team_a_dangerous_attacks,team_b_dangerous_attacks,team_a_attacks,team_b_attacks,team_a_xg,team_b_xg,total_xg,team_a_penalties_won,team_b_penalties_won,team_a_penalty_goals,team_b_penalty_goals,team_a_penalty_missed,team_b_penalty_missed,pens_recorded,goal_timings_recorded,team_a_0_10_min_goals,team_b_0_10_min_goals,team_a_corners_0_10_min,team_b_corners_0_10_min,team_a_cards_0_10_min,team_b_cards_0_10_min,throwins_recorded,team_a_throwins,team_b_throwins,freekicks_recorded,team_a_freekicks,team_b_freekicks,goalkicks_recorded,team_a_goalkicks,team_b_goalkicks,o45_potential,o35_potential,o25_potential,o15_potential,o05_potential,o15HT_potential,o05HT_potential,o05_2H_potential,o15_2H_potential,corners_potential,offsides_potential,cards_potential,avg_potential,home_url,home_image,home_name,away_url,away_image,away_name,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,u45_potential,u35_potential,u25_potential,u15_potential,u05_potential,corners_o85_potential,corners_o95_potential,corners_o105_potential,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
0,2155,150,108,2016/2017,complete,19,1,-1,"['45+1', '57']",['47'],2,1,3,5,3,8,1,0,2,2,0,0,4,6,7,9,11,15,7,17,50,50,685.0,196.0,160.0,KCOM Stadium (Hull),,2,2,3.41,3.19,2.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,1,0,1,1,2,1,1471087800,150,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,2,2,0,2,2,-1,0,0,0,0,0.0,0.0,0.0,0,1,0,1,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/hull-city-afc-150,teams/england-hull-city-afc.png,Hull City,/clubs/leicester-city-fc-108,teams/england-leicester-city-fc.png,Leicester City,1.47,0.53,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/leicester-city-fc-vs-hull-city-afc-h2...,9,38,True,True,True,False,False,False,True,"['45+1', '57']",['47']
1,2156,145,154,2016/2017,complete,19,1,-1,[],['82'],0,1,1,7,4,11,2,2,3,2,0,0,6,8,6,7,12,15,10,14,55,45,688.0,197.0,198.0,Turf Moor (Burnley),,3,2,2.45,3.22,3.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,1,1,0,1471096800,154,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,1,0,2,2,1,4,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/burnley-fc-145,teams/england-burnley-fc.png,Burnley,/clubs/swansea-city-afc-154,teams/wales-swansea-city-afc.png,Swansea City,1.74,0.74,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/burnley-fc-vs-swansea-city-afc-h2h-st...,9,38,True,False,False,False,False,False,False,[],['82']
2,2157,143,142,2016/2017,complete,19,1,-1,[],['74'],0,1,1,3,6,9,0,2,2,2,0,0,7,6,5,6,12,12,12,14,54,46,360.0,199.0,200.0,Selhurst Park (London),,2,2,2.2,3.25,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,1,1,0,1471096800,142,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,1,2,1,1,3,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/crystal-palace-fc-143,teams/england-crystal-palace-fc.png,Crystal Palace,/clubs/west-bromwich-albion-fc-142,teams/england-west-bromwich-albion-fc.png,West Bromwich Albion,1.05,0.84,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/west-bromwich-albion-fc-vs-crystal-pa...,9,38,True,False,False,False,False,False,False,[],['74']
3,2158,144,92,2016/2017,complete,19,1,-1,['5'],['59'],1,1,2,5,6,11,4,0,0,0,0,0,7,4,3,2,10,6,10,14,41,59,537.0,201.0,156.0,Goodison Park (Liverpool),,0,0,3.13,3.36,2.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2,1,0,0,1,1,1,1471096800,-1,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/everton-fc-144,teams/england-everton-fc.png,Everton,/clubs/tottenham-hotspur-fc-92,teams/england-tottenham-hotspur-fc.png,Tottenham Hotspur,2.26,1.74,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/tottenham-hotspur-fc-vs-everton-fc-h2...,9,38,True,True,False,False,False,False,True,['5'],['59']
4,2159,147,141,2016/2017,complete,19,1,-1,['11'],['67'],1,1,2,9,6,15,1,3,3,5,0,0,3,2,5,8,8,10,18,13,46,54,693.0,202.0,203.0,Riverside Stadium (Middlesbrough),,3,5,2.49,3.2,3.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2,1,0,0,1,1,1,1471096800,-1,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,2,2,1,3,4,4,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/middlesbrough-fc-147,teams/england-middlesbrough-fc.png,Middlesbrough,/clubs/stoke-city-fc-141,teams/england-stoke-city-fc.png,Stoke City,0.95,0.89,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/stoke-city-fc-vs-middlesbrough-fc-h2h...,9,38,True,True,False,False,False,False,True,['11'],['67']
5,2160,146,155,2016/2017,complete,19,1,-1,['58'],['9'],1,1,2,6,2,8,1,0,1,2,0,1,10,0,8,6,18,6,8,12,66,34,697.0,204.0,205.0,"St. Mary's Stadium (Southampton, Hampshire)",,1,4,1.8,3.64,5.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2,0,1,1,0,1,1,1471096800,-1,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,0,1,3,0,4,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,1,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/southampton-fc-146,teams/england-southampton-fc.png,Southampton,/clubs/watford-fc-155,teams/england-watford-fc.png,Watford,1.26,0.63,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/southampton-fc-vs-watford-fc-h2h-stat...,9,38,True,True,False,False,False,False,True,['58'],['9']
6,2161,93,156,2016/2017,complete,19,1,-1,"['4', '87']",['71'],2,1,3,10,5,15,1,2,1,2,0,0,5,3,8,2,13,5,11,11,72,28,293.0,72.0,206.0,Etihad Stadium (Manchester),,1,2,1.21,7.0,9.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,1,0,1,1,2,1,1471105800,93,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,1,1,1,1,2,-1,0,0,0,0,0.0,0.0,0.0,1,0,1,0,0,0,1,1,1,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/manchester-city-fc-93,teams/england-manchester-city-fc.png,Manchester City,/clubs/sunderland-afc-156,teams/england-sunderland-afc.png,Sunderland,2.11,0.53,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/manchester-city-fc-vs-sunderland-afc-...,9,38,True,True,True,False,False,False,True,"['4', '87']",['71']
7,2162,148,149,2016/2017,complete,19,1,-1,['69'],"['40', '59', '64']",1,3,4,4,2,6,3,4,0,1,0,0,3,5,2,1,5,6,6,10,53,47,526.0,207.0,208.0,"Vitality Stadium (Bournemouth, Dorset)",,0,1,5.21,3.68,1.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,4,0,1,1,2,3,1,1471177800,149,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,0,0,1,0,1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/afc-bournemouth-148,teams/england-afc-bournemouth.png,AFC Bournemouth,/clubs/manchester-united-fc-149,teams/england-manchester-united-fc.png,Manchester United,1.63,1.84,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/afc-bournemouth-vs-manchester-united-...,9,38,True,True,True,True,False,False,True,['69'],"['40', '59', '64']"
8,2163,59,151,2016/2017,complete,19,1,-1,"['31', '64', '75']","['45+1', '49', '56', '63']",3,4,7,5,4,9,4,3,3,3,0,0,2,5,4,5,6,10,13,15,47,53,393.0,145.0,85.0,Emirates Stadium (London),,3,3,2.14,3.44,3.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,7,1,1,2,3,5,2,1471186800,151,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,1,3,2,0,4,2,-1,0,0,0,0,0.0,0.0,0.0,1,0,0,0,1,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/arsenal-fc-59,teams/england-arsenal-fc.png,Arsenal,/clubs/liverpool-fc-151,teams/england-liverpool-fc.png,Liverpool,2.37,1.84,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/arsenal-fc-vs-liverpool-fc-h2h-stats#...,9,38,True,True,True,True,True,True,True,"['31', '64', '75']","['45+1', '49', '56', '63']"
9,2164,152,153,2016/2017,complete,19,1,-1,"['47', '89']",['77'],2,1,3,7,1,8,1,1,5,2,0,0,5,2,7,2,12,4,15,13,57,43,461.0,209.0,210.0,Stamford Bridge (London),,5,2,1.62,4.01,6.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,0,0,2,1,3,0,1471287600,152,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,2,1,3,1,3,4,-1,0,0,0,0,0.0,0.0,0.0,1,0,1,0,0,0,1,1,0,0,-1,-1,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/chelsea-fc-152,teams/england-chelsea-fc.png,Chelsea,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,2.68,1.05,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/chelsea-fc-vs-west-ham-united-fc-h2h-...,9,38,True,True,True,False,False,False,True,"['47', '89']",['77']


In [6]:
#columns to be used for modelling 
#goals
#


In [7]:
# inspect the last rows
data.tail()

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,homeGoalCount,awayGoalCount,totalGoalCount,team_a_corners,team_b_corners,totalCornerCount,team_a_offsides,team_b_offsides,team_a_yellow_cards,team_b_yellow_cards,team_a_red_cards,team_b_red_cards,team_a_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_b_shots,team_a_fouls,team_b_fouls,team_a_possession,team_b_possession,refereeID,coach_a_ID,coach_b_ID,stadium_name,stadium_location,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,odds_ft_over05,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_ft_under05,odds_ft_under15,odds_ft_under25,odds_ft_under35,odds_ft_under45,odds_btts_yes,odds_btts_no,odds_team_a_cs_yes,odds_team_a_cs_no,odds_team_b_cs_yes,odds_team_b_cs_no,odds_doublechance_1x,odds_doublechance_12,odds_doublechance_x2,odds_1st_half_result_1,odds_1st_half_result_x,odds_1st_half_result_2,odds_2nd_half_result_1,odds_2nd_half_result_x,odds_2nd_half_result_2,odds_dnb_1,odds_dnb_2,odds_corners_over_75,odds_corners_over_85,odds_corners_over_95,odds_corners_over_105,odds_corners_over_115,odds_corners_under_75,odds_corners_under_85,odds_corners_under_95,odds_corners_under_105,odds_corners_under_115,odds_corners_1,odds_corners_x,odds_corners_2,odds_team_to_score_first_1,odds_team_to_score_first_x,odds_team_to_score_first_2,odds_win_to_nil_1,odds_win_to_nil_2,odds_1st_half_over05,odds_1st_half_over15,odds_1st_half_over25,odds_1st_half_over35,odds_1st_half_under05,odds_1st_half_under15,odds_1st_half_under25,odds_1st_half_under35,odds_2nd_half_over05,odds_2nd_half_over15,odds_2nd_half_over25,odds_2nd_half_over35,odds_2nd_half_under05,odds_2nd_half_under15,odds_2nd_half_under25,odds_2nd_half_under35,odds_btts_1st_half_yes,odds_btts_1st_half_no,odds_btts_2nd_half_yes,odds_btts_2nd_half_no,overallGoalCount,ht_goals_team_a,ht_goals_team_b,goals_2hg_team_a,goals_2hg_team_b,GoalCount_2hg,HTGoalCount,date_unix,winningTeam,no_home_away,btts_potential,btts_fhg_potential,btts_2hg_potential,goalTimingDisabled,attendance,corner_timings_recorded,card_timings_recorded,team_a_fh_corners,team_b_fh_corners,team_a_2h_corners,team_b_2h_corners,corner_fh_count,corner_2h_count,team_a_fh_cards,team_b_fh_cards,team_a_2h_cards,team_b_2h_cards,total_fh_cards,total_2h_cards,attacks_recorded,team_a_dangerous_attacks,team_b_dangerous_attacks,team_a_attacks,team_b_attacks,team_a_xg,team_b_xg,total_xg,team_a_penalties_won,team_b_penalties_won,team_a_penalty_goals,team_b_penalty_goals,team_a_penalty_missed,team_b_penalty_missed,pens_recorded,goal_timings_recorded,team_a_0_10_min_goals,team_b_0_10_min_goals,team_a_corners_0_10_min,team_b_corners_0_10_min,team_a_cards_0_10_min,team_b_cards_0_10_min,throwins_recorded,team_a_throwins,team_b_throwins,freekicks_recorded,team_a_freekicks,team_b_freekicks,goalkicks_recorded,team_a_goalkicks,team_b_goalkicks,o45_potential,o35_potential,o25_potential,o15_potential,o05_potential,o15HT_potential,o05HT_potential,o05_2H_potential,o15_2H_potential,corners_potential,offsides_potential,cards_potential,avg_potential,home_url,home_image,home_name,away_url,away_image,away_name,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,u45_potential,u35_potential,u25_potential,u15_potential,u05_potential,corners_o85_potential,corners_o95_potential,corners_o105_potential,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
6455,6689328,143,158,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Selhurst Park (London),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,65,25,35,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,20,35,60,80,90,40,60,90,50,12.2,1.8,4.9,2.9,/clubs/crystal-palace-fc-143,teams/england-crystal-palace-fc.png,Crystal Palace,/clubs/aston-villa-fc-158,teams/england-aston-villa-fc.png,Aston Villa,0.9,1.4,0.9,1.4,1.05,2.1,80,65,40,20,10,85,75,55,1.3,1.48,2.78,/england/crystal-palace-fc-vs-aston-villa-fc-h...,9660,20,False,False,False,False,False,False,False,[],[]
6456,6689329,151,223,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Anfield (Liverpool),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,60,30,30,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,25,35,70,85,95,50,60,90,50,11.0,4.2,4.3,3.25,/clubs/liverpool-fc-151,teams/england-liverpool-fc.png,Liverpool,/clubs/wolverhampton-wanderers-fc-223,teams/england-wolverhampton-wanderers-fc.png,Wolverhampton Wanderers,2.6,1.0,2.6,1.0,2.25,1.4,75,65,30,15,5,70,65,50,2.28,1.27,3.55,/england/liverpool-fc-vs-wolverhampton-wandere...,9660,20,False,False,False,False,False,False,False,[],[]
6457,6689330,271,162,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,"Kenilworth Road (Luton, Bedfordshire)",,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,65,15,50,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,20,30,60,80,95,25,65,90,70,9.5,3.7,5.0,3.1,/clubs/luton-town-fc-271,teams/england-luton-town-fc.png,Luton Town,/clubs/fulham-fc-162,teams/england-fulham-fc.png,Fulham,0.8,0.6,0.8,0.6,0.79,1.2,80,70,40,20,5,75,55,50,1.36,1.11,2.47,/england/fulham-fc-vs-luton-town-fc-h2h-stats,9660,19,False,False,False,False,False,False,False,[],[]
6458,6689331,93,153,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Etihad Stadium (Manchester),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,74,21,42,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,32,47,68,95,100,43,90,89,67,11.71,3.43,4.31,3.64,/clubs/manchester-city-fc-93,teams/england-manchester-city-fc.png,Manchester City,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,2.33,1.6,2.33,1.6,2.11,1.7,69,53,32,6,0,79,63,63,1.67,1.21,2.88,/england/manchester-city-fc-vs-west-ham-united...,9660,19,False,False,False,False,False,False,False,[],[]
6459,6689332,251,92,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Bramall Lane (Sheffield),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,65,30,45,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,25,45,70,85,100,45,75,90,55,8.4,3.6,6.6,3.6,/clubs/sheffield-united-fc-251,teams/england-sheffield-united-fc.png,Sheffield United,/clubs/tottenham-hotspur-fc-92,teams/england-tottenham-hotspur-fc.png,Tottenham Hotspur,0.7,1.8,0.7,1.8,0.45,1.95,75,55,30,15,0,80,70,55,1.14,1.47,2.61,/england/tottenham-hotspur-fc-vs-sheffield-uni...,9660,20,False,False,False,False,False,False,False,[],[]


In [8]:
# inspect random sample of rows
data.sample(5)

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,homeGoalCount,awayGoalCount,totalGoalCount,team_a_corners,team_b_corners,totalCornerCount,team_a_offsides,team_b_offsides,team_a_yellow_cards,team_b_yellow_cards,team_a_red_cards,team_b_red_cards,team_a_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_b_shots,team_a_fouls,team_b_fouls,team_a_possession,team_b_possession,refereeID,coach_a_ID,coach_b_ID,stadium_name,stadium_location,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,odds_ft_over05,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_ft_under05,odds_ft_under15,odds_ft_under25,odds_ft_under35,odds_ft_under45,odds_btts_yes,odds_btts_no,odds_team_a_cs_yes,odds_team_a_cs_no,odds_team_b_cs_yes,odds_team_b_cs_no,odds_doublechance_1x,odds_doublechance_12,odds_doublechance_x2,odds_1st_half_result_1,odds_1st_half_result_x,odds_1st_half_result_2,odds_2nd_half_result_1,odds_2nd_half_result_x,odds_2nd_half_result_2,odds_dnb_1,odds_dnb_2,odds_corners_over_75,odds_corners_over_85,odds_corners_over_95,odds_corners_over_105,odds_corners_over_115,odds_corners_under_75,odds_corners_under_85,odds_corners_under_95,odds_corners_under_105,odds_corners_under_115,odds_corners_1,odds_corners_x,odds_corners_2,odds_team_to_score_first_1,odds_team_to_score_first_x,odds_team_to_score_first_2,odds_win_to_nil_1,odds_win_to_nil_2,odds_1st_half_over05,odds_1st_half_over15,odds_1st_half_over25,odds_1st_half_over35,odds_1st_half_under05,odds_1st_half_under15,odds_1st_half_under25,odds_1st_half_under35,odds_2nd_half_over05,odds_2nd_half_over15,odds_2nd_half_over25,odds_2nd_half_over35,odds_2nd_half_under05,odds_2nd_half_under15,odds_2nd_half_under25,odds_2nd_half_under35,odds_btts_1st_half_yes,odds_btts_1st_half_no,odds_btts_2nd_half_yes,odds_btts_2nd_half_no,overallGoalCount,ht_goals_team_a,ht_goals_team_b,goals_2hg_team_a,goals_2hg_team_b,GoalCount_2hg,HTGoalCount,date_unix,winningTeam,no_home_away,btts_potential,btts_fhg_potential,btts_2hg_potential,goalTimingDisabled,attendance,corner_timings_recorded,card_timings_recorded,team_a_fh_corners,team_b_fh_corners,team_a_2h_corners,team_b_2h_corners,corner_fh_count,corner_2h_count,team_a_fh_cards,team_b_fh_cards,team_a_2h_cards,team_b_2h_cards,total_fh_cards,total_2h_cards,attacks_recorded,team_a_dangerous_attacks,team_b_dangerous_attacks,team_a_attacks,team_b_attacks,team_a_xg,team_b_xg,total_xg,team_a_penalties_won,team_b_penalties_won,team_a_penalty_goals,team_b_penalty_goals,team_a_penalty_missed,team_b_penalty_missed,pens_recorded,goal_timings_recorded,team_a_0_10_min_goals,team_b_0_10_min_goals,team_a_corners_0_10_min,team_b_corners_0_10_min,team_a_cards_0_10_min,team_b_cards_0_10_min,throwins_recorded,team_a_throwins,team_b_throwins,freekicks_recorded,team_a_freekicks,team_b_freekicks,goalkicks_recorded,team_a_goalkicks,team_b_goalkicks,o45_potential,o35_potential,o25_potential,o15_potential,o05_potential,o15HT_potential,o05HT_potential,o05_2H_potential,o15_2H_potential,corners_potential,offsides_potential,cards_potential,avg_potential,home_url,home_image,home_name,away_url,away_image,away_name,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,u45_potential,u35_potential,u25_potential,u15_potential,u05_potential,corners_o85_potential,corners_o95_potential,corners_o105_potential,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
1892,49339,217,59,2017/2018,complete,243,38,-1,[],['38'],0,1,1,7,4,11,0,2,1,0,0,0,5,5,12,3,17,8,12,8,47,53,393.0,403.0,145.0,The John Smith's Stadium,"Stadium Way, Huddersfield, West Yorkshire",1,0,5.6,4.41,1.61,1.01,1.12,1.42,2.05,3.7,29.0,7.2,3.0,1.84,1.35,1.5,2.4,9.0,1.07,3.0,1.36,2.4,1.21,1.17,5.3,2.67,1.93,5.0,2.95,1.79,0,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.72,0.0,0.0,0.0,0.0,2.75,19.0,1.44,13.0,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.25,2.16,4.5,9.0,3.8,1.68,1.18,1.05,0,0,0,0,1,0,1,0,0,0,1,1526220000,59,0,53,17,25,0,24122,1,1,1,4,6,0,5,6,0,0,1,0,0,1,1,0,0,0,0,1.73,1.18,2.91,0,0,0,0,0,0,1,1,0,0,1,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,11,28,50,70,86,34,64,78,42,10.22,4.28,3.5,2.5,/clubs/huddersfield-town-fc-217,teams/england-huddersfield-town-fc.png,Huddersfield Town,/clubs/arsenal-fc-59,teams/england-arsenal-fc.png,Arsenal,1.21,0.84,1.28,0.72,1.0,1.62,89,73,50,31,14,67,48,34,1.27,1.54,2.81,/england/arsenal-fc-vs-huddersfield-town-fc-h2...,161,38,True,False,False,False,False,False,False,[],['38']
3995,753229,149,221,2009/2010,complete,53608,20,-1,"['28', '32', '44', '50', '75']",[],5,0,5,-1,-1,-1,-1,-1,0,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,361.0,22031.0,221.0,Old Trafford (Manchester),,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5,3,0,2,0,2,3,1262203200,149,0,56,17,39,0,74560,-1,1,-1,-1,-1,-1,-1,-1,0,0,0,1,0,1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,11,39,78,89,100,33,84,89,73,0.0,0.0,2.89,3.56,/clubs/manchester-united-fc-149,teams/england-manchester-united-fc.png,Manchester United,/clubs/wigan-athletic-fc-221,teams/england-wigan-athletic-fc.png,Wigan Athletic,2.58,0.58,2.44,0.78,2.11,1.06,89,61,22,11,0,0,0,0,0.0,0.0,0.0,/england/manchester-united-fc-vs-wigan-athleti...,3125,38,True,True,True,True,True,False,False,"['28', '32', '44', '50', '75']",[]
5331,1308277,158,157,2021/2022,complete,72035,2,-1,"['45+3', '62']",[],2,0,2,3,4,7,3,2,3,4,0,0,3,2,10,8,13,10,8,11,47,53,721.0,404.0,235.0,Villa Park (Birmingham),,3,4,1.83,3.9,4.15,1.06,1.27,1.74,2.8,4.85,13.0,4.1,2.2,1.5,1.21,1.74,2.2,2.75,1.5,4.9,1.21,1.22,1.19,2.0,2.38,2.3,4.5,2.05,2.55,4.0,0,0,1.15,1.36,1.62,1.97,2.46,4.7,2.9,2.3,1.85,1.54,1.39,9.1,4.05,1.5,9.0,2.75,3.45,8.0,1.33,2.63,7.0,19.0,3.25,1.44,1.1,1.02,1.22,2.0,4.1,8.5,3.9,1.72,1.2,1.04,0,0,0,0,2,1,0,1,0,1,1,1629554400,158,0,0,0,0,0,41964,1,1,2,1,1,3,3,4,1,2,2,2,3,4,1,39,38,83,73,1.31,1.05,2.36,1,0,1,0,0,0,1,1,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,/clubs/aston-villa-fc-158,teams/england-aston-villa-fc.png,Aston Villa,/clubs/newcastle-united-fc-157,teams/england-newcastle-united-fc.png,Newcastle United,1.21,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/newcastle-united-fc-vs-aston-villa-fc...,6135,38,True,True,False,False,False,False,False,"['45+3', '62']",[]
1783,49230,93,108,2017/2018,complete,243,27,-1,"['3', '48', '53', '77', '90']",['24'],5,1,6,8,2,10,1,2,2,3,0,0,12,3,6,0,18,3,10,12,70,30,716.0,72.0,204.0,Etihad Stadium,"Rowsley Street, Manchester",2,3,1.2,7.54,9.99,1.01,1.11,1.39,2.0,3.2,26.0,6.8,3.2,1.83,1.35,1.87,1.83,2.0,1.72,15.0,1.03,1.03,1.1,4.5,1.59,2.95,6.5,1.44,3.6,8.0,0,0,0.0,0.0,0.0,1.8,0.0,0.0,0.0,0.0,1.9,0.0,1.05,18.5,14.5,1.18,23.0,4.5,2.1,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19,1.9,4.0,11.0,4.33,1.8,1.22,1.05,0,0,0,0,6,1,1,4,0,4,2,1518283800,93,0,70,23,46,0,54416,1,1,4,1,4,1,5,5,0,0,2,3,0,5,1,79,20,108,63,2.54,0.55,3.1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,25,11,1,14,10,1,1,10,27,47,77,93,93,39,81,89,77,12.77,4.7,2.76,3.58,/clubs/manchester-city-fc-93,teams/england-manchester-city-fc.png,Manchester City,/clubs/leicester-city-fc-108,teams/england-leicester-city-fc.png,Leicester City,2.63,1.05,2.85,1.08,2.65,1.35,73,54,23,8,8,85,77,50,2.32,1.29,3.61,/england/manchester-city-fc-vs-leicester-city-...,161,38,True,True,True,True,True,True,True,"['3', '48', '53', '77', '90']",['24']
6119,6688990,251,144,2023/2024,complete,100543,4,-1,"['33', '45+3']","['14', '55']",2,2,4,4,6,10,1,2,1,2,0,0,9,7,4,3,13,10,11,13,45,55,690.0,406.0,197.0,Bramall Lane (Sheffield),,1,2,2.67,3.2,2.41,1.08,1.43,2.15,4.4,9.4,8.6,2.94,1.67,1.23,1.07,1.95,1.8,3.1,1.32,2.6,1.42,1.61,1.33,1.39,3.75,2.0,3.2,3.2,2.25,2.75,0,0,1.15,1.28,1.52,1.86,2.3,4.4,3.05,2.3,1.83,1.52,2.0,8.0,2.02,2.3,7.5,1.8,4.75,3.6,1.5,3.4,10.0,23.0,2.5,1.3,1.06,1.0,1.29,2.38,5.5,15.0,3.5,1.53,1.14,1.03,0,0,0,0,4,2,1,0,1,1,3,1693654200,-1,0,25,0,25,0,31124,1,1,2,4,2,2,6,4,0,1,1,1,1,2,1,33,54,82,134,1.69,1.53,3.22,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,27,26,1,15,12,1,7,2,0,50,75,75,100,50,50,100,75,9.0,2.5,7.5,3.0,/clubs/sheffield-united-fc-251,teams/england-sheffield-united-fc.png,Sheffield United,/clubs/everton-fc-144,teams/england-everton-fc.png,Everton,0.7,1.6,0.0,0.0,0.0,0.0,100,50,25,25,0,100,100,75,0.78,0.93,1.71,/england/everton-fc-vs-sheffield-united-fc-h2h...,9660,20,True,True,True,True,False,False,True,"['33', '45+3']","['14', '55']"


### Columns of interest:
This were are columns that were identified to contain interesting, problematic or missing data and warranted further inspection.

#### Status Column
The first column of interest is the status colum which indicates the whether a match was completed. Looking at the head, tail and random sample snapshots of the df, the status of a match can be either complete or incomplete. Let's inspect unique values in the column:

In [9]:
# check unique values in the status column
data['status'].unique()

array(['complete', 'incomplete', 'suspended'], dtype=object)

Based on the output above, match status can `complete`, `incomplete`, or `suspended`. The dataframe snapshots, particularly the tail, shows that most of the incomplete matches are from the 2023/2024 season. This makes sense because, the 2023/2024 is ongoing and there are bound to be incomplete matches. For our analysis we will want to drop entries where the match status. 

In [10]:
# Check list of incomplete matches
data[data['status'] == 'incomplete' ].sample(10)

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,homeGoalCount,awayGoalCount,totalGoalCount,team_a_corners,team_b_corners,totalCornerCount,team_a_offsides,team_b_offsides,team_a_yellow_cards,team_b_yellow_cards,team_a_red_cards,team_b_red_cards,team_a_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_b_shots,team_a_fouls,team_b_fouls,team_a_possession,team_b_possession,refereeID,coach_a_ID,coach_b_ID,stadium_name,stadium_location,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,odds_ft_over05,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_ft_under05,odds_ft_under15,odds_ft_under25,odds_ft_under35,odds_ft_under45,odds_btts_yes,odds_btts_no,odds_team_a_cs_yes,odds_team_a_cs_no,odds_team_b_cs_yes,odds_team_b_cs_no,odds_doublechance_1x,odds_doublechance_12,odds_doublechance_x2,odds_1st_half_result_1,odds_1st_half_result_x,odds_1st_half_result_2,odds_2nd_half_result_1,odds_2nd_half_result_x,odds_2nd_half_result_2,odds_dnb_1,odds_dnb_2,odds_corners_over_75,odds_corners_over_85,odds_corners_over_95,odds_corners_over_105,odds_corners_over_115,odds_corners_under_75,odds_corners_under_85,odds_corners_under_95,odds_corners_under_105,odds_corners_under_115,odds_corners_1,odds_corners_x,odds_corners_2,odds_team_to_score_first_1,odds_team_to_score_first_x,odds_team_to_score_first_2,odds_win_to_nil_1,odds_win_to_nil_2,odds_1st_half_over05,odds_1st_half_over15,odds_1st_half_over25,odds_1st_half_over35,odds_1st_half_under05,odds_1st_half_under15,odds_1st_half_under25,odds_1st_half_under35,odds_2nd_half_over05,odds_2nd_half_over15,odds_2nd_half_over25,odds_2nd_half_over35,odds_2nd_half_under05,odds_2nd_half_under15,odds_2nd_half_under25,odds_2nd_half_under35,odds_btts_1st_half_yes,odds_btts_1st_half_no,odds_btts_2nd_half_yes,odds_btts_2nd_half_no,overallGoalCount,ht_goals_team_a,ht_goals_team_b,goals_2hg_team_a,goals_2hg_team_b,GoalCount_2hg,HTGoalCount,date_unix,winningTeam,no_home_away,btts_potential,btts_fhg_potential,btts_2hg_potential,goalTimingDisabled,attendance,corner_timings_recorded,card_timings_recorded,team_a_fh_corners,team_b_fh_corners,team_a_2h_corners,team_b_2h_corners,corner_fh_count,corner_2h_count,team_a_fh_cards,team_b_fh_cards,team_a_2h_cards,team_b_2h_cards,total_fh_cards,total_2h_cards,attacks_recorded,team_a_dangerous_attacks,team_b_dangerous_attacks,team_a_attacks,team_b_attacks,team_a_xg,team_b_xg,total_xg,team_a_penalties_won,team_b_penalties_won,team_a_penalty_goals,team_b_penalty_goals,team_a_penalty_missed,team_b_penalty_missed,pens_recorded,goal_timings_recorded,team_a_0_10_min_goals,team_b_0_10_min_goals,team_a_corners_0_10_min,team_b_corners_0_10_min,team_a_cards_0_10_min,team_b_cards_0_10_min,throwins_recorded,team_a_throwins,team_b_throwins,freekicks_recorded,team_a_freekicks,team_b_freekicks,goalkicks_recorded,team_a_goalkicks,team_b_goalkicks,o45_potential,o35_potential,o25_potential,o15_potential,o05_potential,o15HT_potential,o05HT_potential,o05_2H_potential,o15_2H_potential,corners_potential,offsides_potential,cards_potential,avg_potential,home_url,home_image,home_name,away_url,away_image,away_name,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,u45_potential,u35_potential,u25_potential,u15_potential,u05_potential,corners_o85_potential,corners_o95_potential,corners_o105_potential,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
6377,6689250,211,143,2023/2024,incomplete,100543,30,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,"The City Ground (Nottingham, Nottinghamshire)","Pavilion Road, Nottingham, Nottinghamshire",-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1711810800,-1,0,65,10,45,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,30,50,85,100,15,60,95,45,7.7,3.6,3.7,2.75,/clubs/nottingham-forest-fc-211,teams/england-nottingham-forest-fc.png,Nottingham Forest,/clubs/crystal-palace-fc-143,teams/england-crystal-palace-fc.png,Crystal Palace,1.2,1.2,1.2,1.2,1.0,1.05,90,70,50,15,0,70,55,35,1.27,1.19,2.46,/england/crystal-palace-fc-vs-nottingham-fores...,9660,20,False,False,False,False,False,False,False,[],[]
6406,6689279,93,271,2023/2024,incomplete,100543,33,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Etihad Stadium (Manchester),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1713016800,-1,0,67,17,45,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,28,50,67,84,100,34,78,84,56,11.22,3.11,3.67,3.5,/clubs/manchester-city-fc-93,teams/england-manchester-city-fc.png,Manchester City,/clubs/luton-town-fc-271,teams/england-luton-town-fc.png,Luton Town,2.33,0.78,2.33,0.78,2.11,0.79,73,50,33,17,0,73,62,50,1.67,0.94,2.61,/england/manchester-city-fc-vs-luton-town-fc-h...,9660,19,False,False,False,False,False,False,False,[],[]
6400,6689273,148,149,2023/2024,incomplete,100543,33,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,"Vitality Stadium (Bournemouth, Dorset)",,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1713016800,-1,0,37,21,16,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,16,48,75,90,27,54,85,59,9.79,3.87,4.38,2.28,/clubs/afc-bournemouth-148,teams/england-afc-bournemouth.png,AFC Bournemouth,/clubs/manchester-united-fc-149,teams/england-manchester-united-fc.png,Manchester United,1.33,1.3,1.33,1.3,1.32,1.55,100,84,52,26,11,84,74,69,1.41,1.22,2.63,/england/afc-bournemouth-vs-manchester-united-...,9660,19,False,False,False,False,False,False,False,[],[]
6358,6689231,153,145,2023/2024,incomplete,100543,28,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,London Stadium (London),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1709996400,-1,0,55,20,25,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,30,50,85,95,30,80,85,50,8.8,4.1,5.5,2.7,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,/clubs/burnley-fc-145,teams/england-burnley-fc.png,Burnley,1.8,0.8,1.8,0.8,1.7,0.55,90,70,50,15,5,65,55,50,1.32,1.08,2.4,/england/burnley-fc-vs-west-ham-united-fc-h2h-...,9660,20,False,False,False,False,False,False,False,[],[]
6366,6689239,271,211,2023/2024,incomplete,100543,29,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,"Kenilworth Road (Luton, Bedfordshire)",,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1710601200,-1,0,65,25,40,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,25,30,60,80,95,50,70,80,60,9.3,3.7,5.2,3.0,/clubs/luton-town-fc-271,teams/england-luton-town-fc.png,Luton Town,/clubs/nottingham-forest-fc-211,teams/england-nottingham-forest-fc.png,Nottingham Forest,0.8,0.8,0.8,0.8,0.79,1.0,75,70,40,20,5,85,70,65,1.36,1.06,2.42,/england/nottingham-forest-fc-vs-luton-town-fc...,9660,19,False,False,False,False,False,False,False,[],[]
6327,6689200,211,153,2023/2024,incomplete,100543,25,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,"The City Ground (Nottingham, Nottinghamshire)","Pavilion Road, Nottingham, Nottinghamshire",-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1708182000,-1,0,75,15,55,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,25,40,65,95,100,20,65,100,75,8.8,4.4,4.2,3.25,/clubs/nottingham-forest-fc-211,teams/england-nottingham-forest-fc.png,Nottingham Forest,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,1.2,1.6,1.2,1.6,1.0,1.7,75,60,35,5,0,85,65,50,1.27,1.21,2.48,/england/west-ham-united-fc-vs-nottingham-fore...,9660,20,False,False,False,False,False,False,False,[],[]
6315,6689188,93,144,2023/2024,incomplete,100543,24,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Etihad Stadium (Manchester),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1707568200,-1,0,54,26,22,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,22,42,58,85,100,58,85,84,47,11.51,3.13,4.51,3.29,/clubs/manchester-city-fc-93,teams/england-manchester-city-fc.png,Manchester City,/clubs/everton-fc-144,teams/england-everton-fc.png,Everton,2.33,1.6,2.33,1.6,2.11,1.3,79,58,42,16,0,64,58,53,1.67,1.32,2.99,/england/manchester-city-fc-vs-everton-fc-h2h-...,9660,19,False,False,False,False,False,False,False,[],[]
6362,6689235,145,218,2023/2024,incomplete,100543,29,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Turf Moor (Burnley),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1710601200,-1,0,42,21,21,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,15,26,58,84,100,41,79,85,53,11.48,3.33,4.16,2.92,/clubs/burnley-fc-145,teams/england-burnley-fc.png,Burnley,/clubs/brentford-fc-218,teams/england-brentford-fc.png,Brentford,0.3,0.78,0.3,0.78,0.55,1.0,85,75,42,16,0,85,80,59,1.27,1.33,2.6,/england/burnley-fc-vs-brentford-fc-h2h-stats,9660,19,False,False,False,False,False,False,False,[],[]
6452,6689325,209,149,2023/2024,incomplete,100543,38,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,The American Express Community Stadium (Falmer...,,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1716130800,-1,0,65,25,25,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,35,55,80,95,30,70,90,50,11.5,4.0,4.8,2.8,/clubs/brighton-hove-albion-fc-209,teams/england-brighton-hove-albion-fc.png,Brighton & Hove Albion,/clubs/manchester-united-fc-149,teams/england-manchester-united-fc.png,Manchester United,1.9,1.3,1.9,1.3,1.55,1.55,90,65,45,20,5,85,65,45,2.0,1.22,3.22,/england/manchester-united-fc-vs-brighton-hove...,9660,20,False,False,False,False,False,False,False,[],[]
6336,6689209,151,271,2023/2024,incomplete,100543,26,-1,[],[],0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,-1.0,-1.0,Anfield (Liverpool),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1708786800,-1,0,59,26,38,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,21,48,74,84,95,36,63,85,59,11.71,4.28,2.96,3.37,/clubs/liverpool-fc-151,teams/england-liverpool-fc.png,Liverpool,/clubs/luton-town-fc-271,teams/england-luton-town-fc.png,Luton Town,2.6,0.78,2.6,0.78,2.25,0.79,79,52,26,16,5,84,74,52,2.28,0.94,3.22,/england/liverpool-fc-vs-luton-town-fc-h2h-stats,9660,19,False,False,False,False,False,False,False,[],[]


In [11]:
# Check seasons from which matches are incomplete
data[data['status'] == 'incomplete' ]['season'].unique()

array(['2023/2024'], dtype=object)

In [12]:
# Check list of incomplete matches
data[data['status'] == 'suspended' ]['season'].unique()

array(['2023/2024'], dtype=object)

The above inspection confirms that all matches with the `incomplete` or `suspended` status are the current season. 

#### game_week and revised_game_week columns
The `revised_game_week` column indicates indicates whether the game-week (the week of the season when the match was played) was revised or not. We observe that for inspected sections of the dataframe, no matches appear to have revised match weeks.

In [13]:
# check for unique values in the 'revised_game_week' column
data['revised_game_week'].unique()

array([-1], dtype=int64)

 -1 is the only unique value in the `revised_game_week` column. Thus this column does not provide any useful information and will be dropped.
 As mentioned before, the `game_week`, indicates the week of the season when the match was played. Typically, the EPL games run from from game-week 1 to game-week 38 (assuming a team plays a single EPL game every week - which is usually the case). 

In [14]:
# check for unique values in the 'game_week' column
data['game_week'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38,  0], dtype=int64)

The above output mostly confirms our that there are typically 38 game weeks in a season. However, there appears to be some entries where the column value is 0.

In [15]:
# check for entries where game_week == 0
data[data['game_week'] == 0]

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,homeGoalCount,awayGoalCount,totalGoalCount,team_a_corners,team_b_corners,totalCornerCount,team_a_offsides,team_b_offsides,team_a_yellow_cards,team_b_yellow_cards,team_a_red_cards,team_b_red_cards,team_a_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_b_shots,team_a_fouls,team_b_fouls,team_a_possession,team_b_possession,refereeID,coach_a_ID,coach_b_ID,stadium_name,stadium_location,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,odds_ft_over05,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_ft_under05,odds_ft_under15,odds_ft_under25,odds_ft_under35,odds_ft_under45,odds_btts_yes,odds_btts_no,odds_team_a_cs_yes,odds_team_a_cs_no,odds_team_b_cs_yes,odds_team_b_cs_no,odds_doublechance_1x,odds_doublechance_12,odds_doublechance_x2,odds_1st_half_result_1,odds_1st_half_result_x,odds_1st_half_result_2,odds_2nd_half_result_1,odds_2nd_half_result_x,odds_2nd_half_result_2,odds_dnb_1,odds_dnb_2,odds_corners_over_75,odds_corners_over_85,odds_corners_over_95,odds_corners_over_105,odds_corners_over_115,odds_corners_under_75,odds_corners_under_85,odds_corners_under_95,odds_corners_under_105,odds_corners_under_115,odds_corners_1,odds_corners_x,odds_corners_2,odds_team_to_score_first_1,odds_team_to_score_first_x,odds_team_to_score_first_2,odds_win_to_nil_1,odds_win_to_nil_2,odds_1st_half_over05,odds_1st_half_over15,odds_1st_half_over25,odds_1st_half_over35,odds_1st_half_under05,odds_1st_half_under15,odds_1st_half_under25,odds_1st_half_under35,odds_2nd_half_over05,odds_2nd_half_over15,odds_2nd_half_over25,odds_2nd_half_over35,odds_2nd_half_under05,odds_2nd_half_under15,odds_2nd_half_under25,odds_2nd_half_under35,odds_btts_1st_half_yes,odds_btts_1st_half_no,odds_btts_2nd_half_yes,odds_btts_2nd_half_no,overallGoalCount,ht_goals_team_a,ht_goals_team_b,goals_2hg_team_a,goals_2hg_team_b,GoalCount_2hg,HTGoalCount,date_unix,winningTeam,no_home_away,btts_potential,btts_fhg_potential,btts_2hg_potential,goalTimingDisabled,attendance,corner_timings_recorded,card_timings_recorded,team_a_fh_corners,team_b_fh_corners,team_a_2h_corners,team_b_2h_corners,corner_fh_count,corner_2h_count,team_a_fh_cards,team_b_fh_cards,team_a_2h_cards,team_b_2h_cards,total_fh_cards,total_2h_cards,attacks_recorded,team_a_dangerous_attacks,team_b_dangerous_attacks,team_a_attacks,team_b_attacks,team_a_xg,team_b_xg,total_xg,team_a_penalties_won,team_b_penalties_won,team_a_penalty_goals,team_b_penalty_goals,team_a_penalty_missed,team_b_penalty_missed,pens_recorded,goal_timings_recorded,team_a_0_10_min_goals,team_b_0_10_min_goals,team_a_corners_0_10_min,team_b_corners_0_10_min,team_a_cards_0_10_min,team_b_cards_0_10_min,throwins_recorded,team_a_throwins,team_b_throwins,freekicks_recorded,team_a_freekicks,team_b_freekicks,goalkicks_recorded,team_a_goalkicks,team_b_goalkicks,o45_potential,o35_potential,o25_potential,o15_potential,o05_potential,o15HT_potential,o05HT_potential,o05_2H_potential,o15_2H_potential,corners_potential,offsides_potential,cards_potential,avg_potential,home_url,home_image,home_name,away_url,away_image,away_name,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,u45_potential,u35_potential,u25_potential,u15_potential,u05_potential,corners_o85_potential,corners_o95_potential,corners_o105_potential,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
4180,753648,59,142,2008/2009,complete,53612,0,-1,['4'],[],1,0,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,145.0,422.0,Emirates Stadium (London),,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,0,0,1,1218887100,59,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.00,0.00,/clubs/arsenal-fc-59,teams/england-arsenal-fc.png,Arsenal,/clubs/west-bromwich-albion-fc-142,teams/england-west-bromwich-albion-fc.png,West Bromwich Albion,2.00,0.42,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/arsenal-fc-vs-west-bromwich-albion-fc...,3131,38,True,False,False,False,False,False,False,['4'],[]
4181,753649,153,221,2008/2009,complete,53612,0,-1,"['4', '10']",['48'],2,1,3,-1,-1,-1,-1,-1,2,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,32139.0,235.0,Boleyn Ground (London),"Green Street, London",2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,2,0,0,1,1,2,1218895200,153,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,1,0,1,1,1,2,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,2,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.00,0.00,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,/clubs/wigan-athletic-fc-221,teams/england-wigan-athletic-fc.png,Wigan Athletic,1.53,0.84,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/west-ham-united-fc-vs-wigan-athletic-...,3131,38,True,True,True,False,False,False,True,"['4', '10']",['48']
4182,753650,147,92,2008/2009,complete,53612,0,-1,"['71', '87']",['90'],2,1,3,-1,-1,-1,-1,-1,1,2,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,10655.0,598.0,Riverside Stadium (Middlesbrough),,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,0,0,2,1,3,0,1218895200,147,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,0,1,2,0,3,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.00,0.00,/clubs/middlesbrough-fc-147,teams/england-middlesbrough-fc.png,Middlesbrough,/clubs/tottenham-hotspur-fc-92,teams/england-tottenham-hotspur-fc.png,Tottenham Hotspur,1.26,0.84,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/tottenham-hotspur-fc-vs-middlesbrough...,3131,38,True,True,True,False,False,False,True,"['71', '87']",['90']
4183,753651,150,162,2008/2009,complete,53612,0,-1,"['23', '81']",['8'],2,1,3,-1,-1,-1,-1,-1,3,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,492.0,3610.0,KCOM Stadium (Hull),,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,3,1,1,1,0,1,2,1218895200,150,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,2,0,1,0,2,1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,1,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.00,0.00,/clubs/hull-city-afc-150,teams/england-hull-city-afc.png,Hull City,/clubs/fulham-fc-162,teams/england-fulham-fc.png,Fulham,0.74,0.89,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/hull-city-afc-vs-fulham-fc-h2h-stats#...,3131,38,True,True,True,False,False,False,True,"['23', '81']",['8']
4184,753652,144,216,2008/2009,complete,53612,0,-1,"['44', '64']","['21', '66', '90']",2,3,5,-1,-1,-1,-1,-1,2,2,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,206.0,21130.0,Goodison Park (Liverpool),,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5,1,1,1,2,3,2,1218895200,216,0,0,0,0,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,1,2,1,1,3,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.00,0.00,/clubs/everton-fc-144,teams/england-everton-fc.png,Everton,/clubs/blackburn-rovers-fc-216,teams/england-blackburn-rovers-fc.png,Blackburn Rovers,1.58,0.84,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0,0.0,0.0,0.0,/england/everton-fc-vs-blackburn-rovers-fc-h2h...,3131,38,True,True,True,True,True,False,True,"['44', '64']","['21', '66', '90']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4935,755509,153,158,2007/2008,complete,53621,0,-1,"['8', '88']","['14', '58']",2,2,4,-1,-1,-1,-1,-1,2,2,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,32139.0,2746.0,Boleyn Ground (London),"Green Street, London",2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,4,1,1,1,1,2,2,1210514400,-1,0,61,17,28,0,-1,-1,1,-1,-1,-1,-1,-1,-1,2,1,0,1,3,1,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,1,0,-1,-1,2,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,9,28,56,86,97,28,78,86,48,0.0,0.0,3.29,2.94,/clubs/west-ham-united-fc-153,teams/england-west-ham-united-fc.png,West Ham United,/clubs/aston-villa-fc-158,teams/england-aston-villa-fc.png,Aston Villa,1.47,1.42,1.50,1.44,1.30,1.59,92,72,45,14,3,0,0,0,0.0,0.0,0.0,/england/west-ham-united-fc-vs-aston-villa-fc-...,3137,38,True,True,True,True,False,False,True,"['8', '88']","['14', '58']"
4936,755510,92,151,2007/2008,complete,53621,0,-1,[],"['69', '74']",0,2,2,-1,-1,-1,-1,-1,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,598.0,230.0,White Hart Lane (London),"Bill Nicholson Way, 748 High Road, London",1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2,0,0,0,2,2,0,1210514400,151,0,58,17,33,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,0,1,1,0,2,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,17,39,58,84,89,28,75,86,56,0.0,0.0,2.72,3.20,/clubs/tottenham-hotspur-fc-92,teams/england-tottenham-hotspur-fc.png,Tottenham Hotspur,/clubs/liverpool-fc-151,teams/england-liverpool-fc.png,Liverpool,1.53,1.79,1.61,1.72,1.24,1.97,84,61,42,17,11,0,0,0,0.0,0.0,0.0,/england/tottenham-hotspur-fc-vs-liverpool-fc-...,3137,38,True,True,False,False,False,False,False,[],"['69', '74']"
4937,755511,156,59,2007/2008,complete,53621,0,-1,[],['24'],0,1,1,-1,-1,-1,-1,-1,1,1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,32140.0,145.0,Stadium of Light (Sunderland),,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,1,0,0,0,1,1210514400,59,0,64,17,31,0,-1,-1,1,-1,-1,-1,-1,-1,-1,0,0,1,1,0,2,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,12,28,53,81,95,28,73,84,39,0.0,0.0,3.97,2.75,/clubs/sunderland-afc-156,teams/england-sunderland-afc.png,Sunderland,/clubs/arsenal-fc-59,teams/england-arsenal-fc.png,Arsenal,1.58,1.89,1.67,1.83,1.05,2.16,89,72,47,20,6,0,0,0,0.0,0.0,0.0,/england/arsenal-fc-vs-sunderland-afc-h2h-stat...,3137,38,True,False,False,False,False,False,False,[],['24']
4938,755512,206,216,2007/2008,complete,53621,0,-1,"['32', '73', '90+1', '90+3']",['49'],4,1,5,-1,-1,-1,-1,-1,3,2,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,10853.0,203.0,St Andrew's Trillion Trophy Stadium (Birmingham),,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,5,1,0,3,1,4,1,1210514400,206,0,67,14,42,0,-1,-1,1,-1,-1,-1,-1,-1,-1,2,1,1,1,3,2,-1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,1,1,0,0,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,9,28,50,78,95,20,61,86,47,0.0,0.0,4.00,2.67,/clubs/birmingham-city-fc-206,teams/england-birmingham-city-fc.png,Birmingham City,/clubs/blackburn-rovers-fc-216,teams/england-blackburn-rovers-fc.png,Blackburn Rovers,1.37,1.42,1.28,1.50,0.86,1.57,92,73,50,22,6,0,0,0,0.0,0.0,0.0,/england/birmingham-city-fc-vs-blackburn-rover...,3137,38,True,True,True,True,True,False,True,"['32', '73', '90+1', '90+3']",['49']


In [16]:
# check seasons where game_week == 0 occur
data[data['game_week'] == 0]['season'].unique()

array(['2008/2009', '2007/2008'], dtype=object)

In [17]:
# check season unique game_week values for the 2008/2009 season
data[data['season'] == '2008/2009']['game_week'].unique()

array([0], dtype=int64)

In [18]:
# check season unique game_week values for the 2007/2008 season
data[data['season'] == '2007/2008']['game_week'].unique()

array([0], dtype=int64)

In [19]:
# check season unique game_week values for the 2009/2010 season
data[data['season'] == '2009/2010']['game_week'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 17, 14, 15, 16,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38], dtype=int64)

Further inspection shows that matches from the `2007/2008` and `2008/2009` seasons have all values in the `game_week` column as 0. `game_week` information seems to have been stardardized starting from `2009/2010` season. If `game_week` data from `2007/2008` and `2008/2009` seasons is to be used during EDA or modelling, it will be important to indicate the correct match-week information.

#### homeGoals and awayGoals columns
This columns consist of arrays that store goal timing data (when goals were scored during the match). Upon closer scrutiny of the dataset, it was observed that the columns `homeGoals` and `homeGoals_timings` contained identical information, as did `awayGoals` and `awayGoals_timings`. In the interest of eliminating redundancy and retaining more descriptive columns, it makes sense to remove the `homeGoals` and `awayGoals` columns.

In [20]:
data[['homeGoals', 'awayGoals', 'homeGoals_timings', 'awayGoals_timings' ]].head()

Unnamed: 0,homeGoals,awayGoals,homeGoals_timings,awayGoals_timings
0,"['45+1', '57']",['47'],"['45+1', '57']",['47']
1,[],['82'],[],['82']
2,[],['74'],[],['74']
3,['5'],['59'],['5'],['59']
4,['11'],['67'],['11'],['67']


#### corner & offside columns
Dataframe snapshots (head, tail, sample) showed that some seasons, particularly seasons priror to the `2013/2014` season had missing data (encoded as -1) in the following columns 
- `team_a_corners`
- `team_b_corners`
- `totalCornerCount`
- `team_a_offsides`
- `team_b_offsides`
- `team_a_shotsOnTarget`
- `team_b_shotsOnTarget`
- `team_a_shotsOffTarget`
- `team_b_shotsOffTarget`
- `team_a_shots`
- `team_b_shots`
- `team_a_fouls`
- `team_b_fouls`
- `team_a_possession`
- `team_b_possession`

In [21]:
# group by season and get descriptive stats for columns suspected to contain missing data
missing_data_columns = [
                        'team_a_corners',
                        'team_b_corners',
                        'totalCornerCount',
                        'team_a_offsides',
                        'team_b_offsides',
                        'team_a_shotsOnTarget',
                        'team_b_shotsOnTarget',
                        'team_a_shotsOffTarget',
                        'team_b_shotsOffTarget',
                        'team_a_shots',
                        'team_b_shots',
                        'team_a_fouls',
                        'team_b_fouls',
                        'team_a_possession',
                        'team_b_possession'
                        ]

data.groupby('season')[missing_data_columns].describe()

Unnamed: 0_level_0,team_a_corners,team_a_corners,team_a_corners,team_a_corners,team_a_corners,team_a_corners,team_a_corners,team_a_corners,team_b_corners,team_b_corners,team_b_corners,team_b_corners,team_b_corners,team_b_corners,team_b_corners,team_b_corners,totalCornerCount,totalCornerCount,totalCornerCount,totalCornerCount,totalCornerCount,totalCornerCount,totalCornerCount,totalCornerCount,team_a_offsides,team_a_offsides,team_a_offsides,team_a_offsides,team_a_offsides,team_a_offsides,team_a_offsides,team_a_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_b_offsides,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_a_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_b_shotsOnTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_a_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_b_shotsOffTarget,team_a_shots,team_a_shots,team_a_shots,team_a_shots,team_a_shots,team_a_shots,team_a_shots,team_a_shots,team_b_shots,team_b_shots,team_b_shots,team_b_shots,team_b_shots,team_b_shots,team_b_shots,team_b_shots,team_a_fouls,team_a_fouls,team_a_fouls,team_a_fouls,team_a_fouls,team_a_fouls,team_a_fouls,team_a_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_b_fouls,team_a_possession,team_a_possession,team_a_possession,team_a_possession,team_a_possession,team_a_possession,team_a_possession,team_a_possession,team_b_possession,team_b_possession,team_b_possession,team_b_possession,team_b_possession,team_b_possession,team_b_possession,team_b_possession
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
season,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2
2007/2008,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2008/2009,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2009/2010,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2010/2011,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2011/2012,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,380.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2012/2013,380.0,6.252632,3.158833,0.0,4.0,6.0,8.0,15.0,380.0,4.792105,2.784511,0.0,3.0,4.0,6.0,17.0,380.0,11.044737,3.681315,0.0,9.0,11.0,13.25,23.0,380.0,2.339474,1.654296,0.0,1.0,2.0,3.0,9.0,380.0,2.134211,1.703756,0.0,1.0,2.0,3.0,9.0,380.0,8.581579,3.743233,2.0,6.0,8.0,11.0,21.0,380.0,7.015789,3.243791,0.0,5.0,7.0,8.0,21.0,380.0,6.555263,3.23725,0.0,4.0,6.0,9.0,21.0,380.0,4.971053,2.649579,0.0,3.0,4.0,6.0,16.0,380.0,15.136842,5.403235,2.0,11.0,15.0,18.0,36.0,380.0,11.986842,4.688991,2.0,9.0,11.0,14.0,31.0,380.0,10.265789,3.144772,2.0,8.0,10.0,12.0,23.0,380.0,10.697368,3.568477,2.0,8.0,10.0,13.0,23.0,380.0,52.013158,7.738287,27.0,47.0,53.0,57.0,74.0,380.0,47.986842,7.738287,26.0,43.0,47.0,53.0,73.0
2013/2014,380.0,6.092105,2.962286,0.0,4.0,6.0,8.0,14.0,380.0,4.665789,2.695915,0.0,3.0,4.0,6.0,14.0,380.0,10.757895,3.522909,2.0,8.0,11.0,13.0,21.0,380.0,2.073684,1.67705,0.0,1.0,2.0,3.0,10.0,380.0,1.986842,1.760707,0.0,1.0,2.0,3.0,10.0,380.0,5.163158,2.461248,0.0,3.0,5.0,6.0,14.0,380.0,4.213158,2.125194,0.0,3.0,4.0,6.0,13.0,380.0,6.660526,3.335745,0.0,4.0,6.0,9.0,22.0,380.0,5.5,3.222202,0.0,3.0,5.0,7.0,20.0,380.0,11.823684,4.708715,2.0,8.0,11.0,15.0,33.0,380.0,9.713158,4.437706,2.0,7.0,9.0,12.0,30.0,380.0,10.313158,3.1813,2.0,8.0,10.0,13.0,20.0,380.0,10.839474,3.48582,2.0,8.0,11.0,13.0,24.0,380.0,52.226316,8.941401,30.0,46.0,52.0,59.0,76.0,380.0,47.773684,8.941401,24.0,41.0,48.0,54.0,70.0
2014/2015,380.0,6.042105,3.243555,0.0,4.0,6.0,8.0,18.0,380.0,4.663158,2.568782,0.0,3.0,4.0,6.0,13.0,380.0,10.705263,3.521135,1.0,8.0,11.0,13.0,23.0,380.0,1.939474,1.617523,0.0,1.0,2.0,3.0,9.0,380.0,1.844737,1.54257,0.0,1.0,2.0,3.0,8.0,380.0,4.589474,2.450544,0.0,3.0,4.0,6.0,14.0,380.0,3.739474,2.070784,0.0,2.0,4.0,5.0,11.0,380.0,5.307895,2.803586,0.0,3.0,5.0,7.0,15.0,380.0,4.357895,2.485486,0.0,3.0,4.0,6.0,13.0,380.0,9.897368,4.216752,2.0,7.0,9.0,13.0,29.0,380.0,8.097368,3.438643,0.0,6.0,8.0,10.0,19.0,380.0,10.971053,3.349746,4.0,8.0,11.0,13.0,23.0,380.0,11.071053,3.472881,1.0,9.0,11.0,14.0,20.0,380.0,52.026316,8.840083,27.0,47.0,52.0,58.0,77.0,380.0,47.973684,8.840083,23.0,42.0,48.0,53.0,73.0
2015/2016,380.0,5.931579,3.247983,0.0,4.0,5.0,8.0,17.0,380.0,4.876316,2.50971,0.0,3.0,5.0,7.0,14.0,380.0,10.807895,3.604811,1.0,8.0,10.0,13.0,25.0,380.0,1.994737,1.515086,0.0,1.0,2.0,3.0,8.0,380.0,1.834211,1.548303,0.0,1.0,1.5,3.0,10.0,380.0,4.636842,2.424525,0.0,3.0,4.0,6.0,13.0,380.0,3.915789,2.293334,0.0,2.0,4.0,5.0,14.0,380.0,5.478947,2.922405,0.0,3.0,5.0,7.0,15.0,380.0,4.226316,2.263941,0.0,3.0,4.0,6.0,11.0,380.0,10.115789,4.211088,2.0,7.0,10.0,12.25,24.0,380.0,8.142105,3.34014,2.0,6.0,8.0,10.0,18.0,380.0,9.863158,3.481911,2.0,7.0,10.0,12.0,21.0,380.0,11.105263,3.339928,4.0,9.0,11.0,13.0,22.0,380.0,52.123684,8.212094,27.0,47.0,52.0,58.0,76.0,380.0,47.876316,8.212094,24.0,42.0,48.0,53.0,73.0
2016/2017,380.0,5.673684,3.085217,0.0,3.0,5.0,8.0,19.0,380.0,4.742105,2.705767,0.0,3.0,4.0,6.0,15.0,380.0,10.415789,3.442826,2.0,8.0,10.0,13.0,23.0,380.0,1.957895,1.612391,0.0,1.0,2.0,3.0,9.0,380.0,1.736842,1.461633,0.0,1.0,2.0,2.0,9.0,380.0,5.857895,3.039896,0.0,4.0,5.0,8.0,19.0,380.0,4.726316,2.599745,0.0,3.0,4.0,6.0,17.0,380.0,7.468421,3.966423,0.0,5.0,7.0,10.0,25.0,380.0,6.110526,3.218252,0.0,4.0,6.0,8.0,19.0,380.0,13.326316,5.921549,0.0,9.0,12.0,17.0,37.0,380.0,10.836842,4.763506,0.0,7.0,10.0,14.0,27.0,380.0,10.718421,3.421518,2.0,8.0,11.0,13.0,23.0,380.0,11.2,3.527801,2.0,9.0,11.0,14.0,23.0,380.0,50.242105,11.558926,-1.0,44.0,51.0,58.0,74.0,380.0,48.147368,11.411671,-1.0,42.0,48.0,56.0,73.0


The output above confirms that seasons prior to the 2013/2014 season indeed are missing data in the columns identified above. Given that shots, shots on target, and corner kicks are occasions in a football/soccer match when the probability of scoring is high, and therefore probably have significant match outcome prediction power, we will likely retain only data from 2013/2014 season onwards.

#### Referee ID column
The 'refereeID' column is a unique identifier for the referee who officiated a match. Dataframe snapshots revealed that the referee column had some missing values:

In [22]:
data[data['refereeID'].isnull()].shape

(2784, 215)

In [23]:
data[data['refereeID'].isnull()]['season'].unique()

array(['2016/2017', '2015/2016', '2014/2015', '2013/2014', '2017/2018',
       '2012/2013', '2008/2009', '2007/2008', '2023/2024'], dtype=object)

Output indicates that 2,784 entries in the refereeID column have null values. Given that Referees are randomly selected and are often assumed to have very little influence on the match outcome (assuming match officiation is fair), we can safely drop the referee column.

#### Stadium Location
Examination of dataframe shows that the stadium location column has some missing values. since this column has limited relevance, we will add to the list of columns to be dropped.

In [24]:
data[data['stadium_location'].isnull()]['season'].unique()

array(['2016/2017', '2015/2016', '2014/2015', '2013/2014', '2018/2019',
       '2019/2020', '2011/2012', '2010/2011', '2009/2010', '2008/2009',
       '2007/2008', '2020/2021', '2021/2022', '2022/2023', '2023/2024'],
      dtype=object)

#### odds columns
The dataframe snapshots (head, tail, sample) revealed that the values of most of the odds columns, with the exception of `odds_ft_1`, `odds_ft_x`, & `odds_ft_1`  columns, were zeros. Odds represent the probability of a given match outcome, therefore, are not expected to be zero.Given that this columns essentially are missing data, we will drop them from the dataset.

In [25]:
# get list of odds columns 
list_of_columns = list(data.columns)

# odds columns
odds_columns = [column for column in list_of_columns if column.startswith('odds')]


In [26]:
# compute the % zero values for each column and store the results in a dictionary
zero_counts = {}
for column in odds_columns:
    zero_counts[column] = (data[column] == 0).sum()/6460

In [27]:
zero_counts

{'odds_ft_1': 0.3238390092879257,
 'odds_ft_x': 0.3238390092879257,
 'odds_ft_2': 0.3238390092879257,
 'odds_ft_over05': 0.6174922600619195,
 'odds_ft_over15': 0.6174922600619195,
 'odds_ft_over25': 0.6164086687306501,
 'odds_ft_over35': 0.6174922600619195,
 'odds_ft_over45': 0.6174922600619195,
 'odds_ft_under05': 0.6174922600619195,
 'odds_ft_under15': 0.6174922600619195,
 'odds_ft_under25': 0.6167182662538699,
 'odds_ft_under35': 0.6173374613003096,
 'odds_ft_under45': 0.6174922600619195,
 'odds_btts_yes': 0.6174922600619195,
 'odds_btts_no': 0.6174922600619195,
 'odds_team_a_cs_yes': 0.6188854489164086,
 'odds_team_a_cs_no': 0.6188854489164086,
 'odds_team_b_cs_yes': 0.6339009287925697,
 'odds_team_b_cs_no': 0.6335913312693499,
 'odds_doublechance_1x': 0.6195046439628483,
 'odds_doublechance_12': 0.6195046439628483,
 'odds_doublechance_x2': 0.6195046439628483,
 'odds_1st_half_result_1': 0.6185758513931888,
 'odds_1st_half_result_x': 0.6187306501547988,
 'odds_1st_half_result_2': 0.

The output above reveals that 32% of entries in the `odds_ft_1`, `odds_ft_x`, & `odds_ft_1`  columns have invalid odd data (zeros) while the other odd columns consists of between 62% and 100%  invalid data.Therefore, to ensure model integrity all odds columns except `odds_ft_1`, `odds_ft_x`, & `odds_ft_1`  will be dropped from the dataset.Entries with invalid data in the `odds_ft_1`, `odds_ft_x`, & `odds_ft_1`  columns will be dropped. 

#### winning_team column
Inspection revealed that this column indicates winning team using the teamid and draws using -1. For EDA it might be useful to create column that indicates home win with `1`, draws with `x` and away wins with 2, which is the industry standard.

#### no_home_away column
We observed tha the `no_home_away` appears to consist of mostly 0 which makes sense as this would only be 1 if a match was played on a neutral ground. This rarely happens in the EPL.

In [28]:
data['no_home_away'].unique()

array([0], dtype=int64)

The above output confirms that no games were played on neutral ground and thus this column can be dropped.
#### Attendance column
In this column we noted that zero attendance or missing attendance was encoded with -1. 

In [29]:
data[data['attendance']==-1]['attendance'].count()

1822

In [30]:
# List of seasons to filter out
season_not_relevant = ['2007/2008', '2008/2009', '2009/2010', '2010/2011', 
                       '2011/2012', '2012/2013', '2023/2024']

# Filter out rows where the "season" column is in the list
filtered_data = data[~data['season'].isin(season_not_relevant)].copy()


# check missing attendance for relevant seasons
filtered_data[filtered_data['attendance']==-1]['attendance'].count()

729

In [31]:
filtered_data[filtered_data['attendance']==-1].groupby('season')['attendance'].count()

season
2013/2014     16
2014/2015     16
2015/2016     16
2016/2017     16
2020/2021     81
2021/2022    263
2022/2023    321
Name: attendance, dtype: int64

Further inspection reveals that for the relevant seasons, seasons to be considered to have sufficient data, very few matches between `2013` and `2019` had missing fan attendance information. While missing attendance data was expected for the `2020/2021` due covid restrictions, missing data in `2021/2022` and `2022/2023` is unusual. 
Since attendance data may be useful in predicting match outcome, missing data could be obtained from past attendance data or stadium capacity can be used with the assumption that the stadium was full.

#### goalTimingDisabled, corner_timings_recorded, & card_timings_recorded columns
This columns consisted mostly of 1's or 0's. API documentation revealed that this columns were used to indicate whether goal timing, corner timing or card timing data was captured. This information has little relevance and can be safely dropped dropped.

#### team_a_fh_corners	team_b_fh_corners	team_a_2h_corners	team_b_2h_corners	corner_fh_count	corner_2h_count	team_a_fh_cards	team_b_fh_cards	team_a_2h_cards	team_b_2h_cards	total_fh_cards	total_2h_cards 

- Review this columns (2016/2017) season showed some missing data for these this season

#### attacks_recorded column
The attacks_recorded column tracks whether or not attack data was captured encoded as -1 (attack data not recorded) and 1 (attack data was recorded). This is a metadata column and thus can be dropped as it does not have any predictive relevance. 

In [32]:
filtered_data['attacks_recorded'].unique()

array([-1,  1], dtype=int64)

In [33]:
filtered_data['attacks_recorded'].value_counts(normalize=True)

 1    0.690263
-1    0.309737
Name: attacks_recorded, dtype: float64

The output above shows that approximately 56% of matches are missing attack data. The affected columns include:
- `team_a_dangerous_attacks`	
- `team_b_dangerous_attacks`	
- `team_a_attacks`	
- `team_b_attacks`

Given that a large fraction of the attack data is missing, and because this data cannot be imputed, dropping the affected columns is a sensible approach. Additionally, given that live match predictions, which rely on attack data,  is not part of the project scope, dropping attack data does not significantly affect the success of the project. 

In [34]:
data['competition_id'].unique()

array([   9,   10,   11,   12,  161,  246, 1625, 2012, 3119, 3121, 3125,
       3131, 3137, 4759, 6135, 7704, 9660], dtype=int64)

ffffffffffffff

In [35]:
# Updating the list of columns to keep, including the additional columns requested
updated_columns_to_keep = [
    'id', 'homeID', 'awayID', 'season', 'homeGoalCount', 'awayGoalCount', 'totalGoalCount',
    'home_ppg', 'away_ppg', 'pre_match_home_ppg', 'pre_match_away_ppg', 
    'pre_match_teamA_overall_ppg', 'pre_match_teamB_overall_ppg', 
    'team_a_cards_num', 'team_b_cards_num', 'odds_ft_1', 'odds_ft_x', 'odds_ft_2',
    'winningTeam', 'team_a_xg_prematch', 'team_b_xg_prematch', 'total_xg_prematch'
]

# Keeping only the updated set of columns in the filtered_data DataFrame
filtered_data = filtered_data[updated_columns_to_keep]

# Creating a new column for the total number of cards
filtered_data['total_cards'] = filtered_data['team_a_cards_num'] + filtered_data['team_b_cards_num']

# Re-coding the 'winningTeam' column to 1x2 format (assuming 1 for home win, 2 for away win, and 0 for draw)
# Modify the logic below as per the actual coding of the 'winningTeam' column in your dataset
filtered_data['winningTeam_1x2'] = filtered_data['winningTeam'].map({1: 1, 2: 2, 'draw': 0})

# Display the first few rows of the modified DataFrame
filtered_data.head()


Unnamed: 0,id,homeID,awayID,season,homeGoalCount,awayGoalCount,totalGoalCount,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,winningTeam,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,total_cards,winningTeam_1x2
0,2155,150,108,2016/2017,2,1,3,1.47,0.53,0.0,0.0,0.0,0.0,2,2,3.41,3.19,2.39,150,0.0,0.0,0.0,4,
1,2156,145,154,2016/2017,0,1,1,1.74,0.74,0.0,0.0,0.0,0.0,3,2,2.45,3.22,3.26,154,0.0,0.0,0.0,5,
2,2157,143,142,2016/2017,0,1,1,1.05,0.84,0.0,0.0,0.0,0.0,2,2,2.2,3.25,3.8,142,0.0,0.0,0.0,4,
3,2158,144,92,2016/2017,1,1,2,2.26,1.74,0.0,0.0,0.0,0.0,0,0,3.13,3.36,2.45,-1,0.0,0.0,0.0,0,
4,2159,147,141,2016/2017,1,1,2,0.95,0.89,0.0,0.0,0.0,0.0,3,5,2.49,3.2,3.21,-1,0.0,0.0,0.0,8,


In [36]:
print(filtered_data.columns)


Index(['id', 'homeID', 'awayID', 'season', 'homeGoalCount', 'awayGoalCount',
       'totalGoalCount', 'home_ppg', 'away_ppg', 'pre_match_home_ppg',
       'pre_match_away_ppg', 'pre_match_teamA_overall_ppg',
       'pre_match_teamB_overall_ppg', 'team_a_cards_num', 'team_b_cards_num',
       'odds_ft_1', 'odds_ft_x', 'odds_ft_2', 'winningTeam',
       'team_a_xg_prematch', 'team_b_xg_prematch', 'total_xg_prematch',
       'total_cards', 'winningTeam_1x2'],
      dtype='object')


### Feature engineering

In [37]:
# Feature 1: Goal Difference
filtered_data.loc[:, 'goal_difference_per_game'] = abs(filtered_data['homeGoalCount'] - filtered_data['awayGoalCount'])

# Feature 2: Card Difference
filtered_data.loc[:, 'card_difference'] = filtered_data['team_a_cards_num'] - filtered_data['team_b_cards_num']

# Feature 3: Combined Team Strength
filtered_data.loc[:, 'combined_team_strength'] = (filtered_data['home_ppg'] + filtered_data['away_ppg']) / 2


In [38]:
filtered_data.head()

Unnamed: 0,id,homeID,awayID,season,homeGoalCount,awayGoalCount,totalGoalCount,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,winningTeam,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,total_cards,winningTeam_1x2,goal_difference_per_game,card_difference,combined_team_strength
0,2155,150,108,2016/2017,2,1,3,1.47,0.53,0.0,0.0,0.0,0.0,2,2,3.41,3.19,2.39,150,0.0,0.0,0.0,4,,1,0,1.0
1,2156,145,154,2016/2017,0,1,1,1.74,0.74,0.0,0.0,0.0,0.0,3,2,2.45,3.22,3.26,154,0.0,0.0,0.0,5,,1,1,1.24
2,2157,143,142,2016/2017,0,1,1,1.05,0.84,0.0,0.0,0.0,0.0,2,2,2.2,3.25,3.8,142,0.0,0.0,0.0,4,,1,0,0.945
3,2158,144,92,2016/2017,1,1,2,2.26,1.74,0.0,0.0,0.0,0.0,0,0,3.13,3.36,2.45,-1,0.0,0.0,0.0,0,,0,0,2.0
4,2159,147,141,2016/2017,1,1,2,0.95,0.89,0.0,0.0,0.0,0.0,3,5,2.49,3.2,3.21,-1,0.0,0.0,0.0,8,,0,-2,0.92


In [39]:
# Inspect unique values in the 'winningTeam' column
unique_values_in_winningTeam = filtered_data['winningTeam'].unique()
unique_values_in_winningTeam


array([150, 154, 142,  -1,  93, 149, 151, 152,  92, 144, 145, 147, 153,
       108,  59, 148, 143, 155, 146, 141, 156, 158, 159, 157, 160, 162,
       161, 217, 209, 223, 251, 222, 218, 211], dtype=int64)

## Encode Categorical Variables

In [40]:
# Check data types of columns in filtered_data
data_types = filtered_data.dtypes

# Identify potential categorical columns (usually of type 'object' or 'category')
categorical_columns = data_types[data_types == 'object'].index.tolist()

# Display data types and potential categorical columns
print("Data Types:\n", data_types)

Data Types:
 id                               int64
homeID                           int64
awayID                           int64
season                          object
homeGoalCount                    int64
awayGoalCount                    int64
totalGoalCount                   int64
home_ppg                       float64
away_ppg                       float64
pre_match_home_ppg             float64
pre_match_away_ppg             float64
pre_match_teamA_overall_ppg    float64
pre_match_teamB_overall_ppg    float64
team_a_cards_num                 int64
team_b_cards_num                 int64
odds_ft_1                      float64
odds_ft_x                      float64
odds_ft_2                      float64
winningTeam                      int64
team_a_xg_prematch             float64
team_b_xg_prematch             float64
total_xg_prematch              float64
total_cards                      int64
winningTeam_1x2                float64
goal_difference_per_game         int64
card_differe

Potential Categorical Columns: 'season' as it is an object.

In [46]:
# Assuming 'season' is the only categorical column
# One-hot encoding the 'season' column
encoded_seasons = pd.get_dummies(filtered_data['season'], prefix='season')

# Concatenating the encoded columns back to the original DataFrame and dropping the original 'season' column
filtered_data_encoded = pd.concat([filtered_data.drop('season', axis=1), encoded_seasons], axis=1)

# Display the first few rows to verify
filtered_data_encoded.head()


Unnamed: 0,id,homeID,awayID,homeGoalCount,awayGoalCount,totalGoalCount,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,winningTeam,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,total_cards,winningTeam_1x2,goal_difference_per_game,card_difference,combined_team_strength,season_2013/2014,season_2014/2015,season_2015/2016,season_2016/2017,season_2017/2018,season_2018/2019,season_2019/2020,season_2020/2021,season_2021/2022,season_2022/2023
0,2155,150,108,2,1,3,1.47,0.53,0.0,0.0,0.0,0.0,2,2,3.41,3.19,2.39,150,0.0,0.0,0.0,4,,1,0,1.0,0,0,0,1,0,0,0,0,0,0
1,2156,145,154,0,1,1,1.74,0.74,0.0,0.0,0.0,0.0,3,2,2.45,3.22,3.26,154,0.0,0.0,0.0,5,,1,1,1.24,0,0,0,1,0,0,0,0,0,0
2,2157,143,142,0,1,1,1.05,0.84,0.0,0.0,0.0,0.0,2,2,2.2,3.25,3.8,142,0.0,0.0,0.0,4,,1,0,0.945,0,0,0,1,0,0,0,0,0,0
3,2158,144,92,1,1,2,2.26,1.74,0.0,0.0,0.0,0.0,0,0,3.13,3.36,2.45,-1,0.0,0.0,0.0,0,,0,0,2.0,0,0,0,1,0,0,0,0,0,0
4,2159,147,141,1,1,2,0.95,0.89,0.0,0.0,0.0,0.0,3,5,2.49,3.2,3.21,-1,0.0,0.0,0.0,8,,0,-2,0.92,0,0,0,1,0,0,0,0,0,0


## Scaling 

To ensure that the numerical features in our dataset have the same scale and our models are sensitive to the magnitude of inputs.

Standard Scaling using scikit-learn

In [49]:
from sklearn.preprocessing import StandardScaler

# Selecting numerical columns for scaling (excluding one-hot encoded season columns and identifier columns)
numerical_columns = filtered_data_encoded.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(['id', 'homeID', 'awayID'])  # Exclude identifier columns

# Applying Standard Scaler
scaler = StandardScaler()
filtered_data_encoded[numerical_columns] = scaler.fit_transform(filtered_data_encoded[numerical_columns])

# Display the first few rows of the scaled DataFrame
filtered_data_encoded.head()


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,id,homeID,awayID,homeGoalCount,awayGoalCount,totalGoalCount,home_ppg,away_ppg,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,team_a_cards_num,team_b_cards_num,odds_ft_1,odds_ft_x,odds_ft_2,winningTeam,team_a_xg_prematch,team_b_xg_prematch,total_xg_prematch,total_cards,winningTeam_1x2,goal_difference_per_game,card_difference,combined_team_strength,season_2013/2014,season_2014/2015,season_2015/2016,season_2016/2017,season_2017/2018,season_2018/2019,season_2019/2020,season_2020/2021,season_2021/2022,season_2022/2023
0,-0.732438,0.125591,-1.016149,0.359963,-0.181914,0.154549,-0.218973,-1.298951,-1.941841,-1.597538,-2.110245,-2.134071,0.252393,0.07805,0.226944,-0.723651,-0.589057,0.651371,-1.28624,-1.291992,-1.330041,0.212528,,-0.353164,0.128468,-1.100589,0,0,0,1,0,0,0,0,0,0
1,-0.732437,-0.01033,0.234329,-1.156511,-0.181914,-1.043993,0.299874,-0.88208,-1.941841,-1.597538,-2.110245,-2.134071,1.016611,0.07805,-0.226025,-0.702015,-0.354013,0.710356,-1.28624,-1.291992,-1.330041,0.697577,,-0.353164,0.706196,-0.41299,0,0,0,1,0,0,0,0,0,0
2,-0.732435,-0.064699,-0.091883,-1.156511,-0.181914,-1.043993,-1.026068,-0.68357,-1.941841,-1.597538,-2.110245,-2.134071,0.252393,0.07805,-0.343986,-0.680378,-0.208124,0.533401,-1.28624,-1.291992,-1.330041,0.212528,,-0.353164,0.128468,-1.258164,0,0,0,1,0,0,0,0,0,0
3,-0.732434,-0.037514,-1.451098,-0.398274,-0.181914,-0.444722,1.299136,1.103022,-1.941841,-1.597538,-2.110245,-2.134071,-1.276043,-1.368725,0.094828,-0.601046,-0.572847,-1.57532,-1.28624,-1.291992,-1.330041,-1.727669,,-1.154849,0.128468,1.764409,0,0,0,1,0,0,0,0,0,0
4,-0.732433,0.044039,-0.119067,-0.398274,-0.181914,-0.444722,-1.218234,-0.584315,-1.941841,-1.597538,-2.110245,-2.134071,1.016611,2.248211,-0.207151,-0.716439,-0.367522,-1.57532,-1.28624,-1.291992,-1.330041,2.152725,,-1.154849,-1.026987,-1.329789,0,0,0,1,0,0,0,0,0,0


Min-Max Scaler

In [43]:
# from sklearn.preprocessing import MinMaxScaler

# # Selecting numerical columns for scaling
# # Exclude one-hot encoded season columns and any other non-numerical columns
# numerical_columns = filtered_data_encoded.select_dtypes(include=['int64', 'float64']).columns

# # Applying MinMax Scaler
# scaler = MinMaxScaler()
# filtered_data_encoded[numerical_columns] = scaler.fit_transform(filtered_data_encoded[numerical_columns])

# # Display the first few rows of the scaled DataFrame
# filtered_data_encoded.head()


If you're unsure, you can try both MinMaxScaler and StandardScaler, train your model with each, and evaluate their performance using cross-validation or a separate validation set. This empirical approach will help you determine which scaler works better for your specific dataset and prediction task.

## Splitting the Dataset

In [50]:
from sklearn.model_selection import train_test_split

# Assuming 'totalGoalCount', 'total_cards', and 'winningTeam_1x2' are your target columns
target_columns = ['totalGoalCount', 'total_cards', 'winningTeam_1x2']

# Features (excluding the target columns)
X = filtered_data_encoded.drop(target_columns, axis=1)

# Target variables
y = filtered_data_encoded[target_columns]

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (3040, 33)
Shape of X_test: (760, 33)
Shape of y_train: (3040, 3)
Shape of y_test: (760, 3)


## Modeling

### Model Selection

1. Linear Regression (Total Goal Count, Total Cards):

Pros: Simple, interpretable, and fast. Can provide insights into linear relationships between features and target variables.

Cons: Assumes a linear relationship, which might not be suitable for complex relationships in the data.


2. Random Forest Regressor (Total Goal Count, Total Cards):

Pros: More flexible than linear regression, can capture non-linear relationships, handles complex datasets well, and less prone to overfitting.

Cons: Less interpretable compared to linear regression.


1. Logistic Regression (Winning Team Prediction):

Pros: Simple, interpretable, and fast. Suitable for binary classification tasks.

Cons: Assumes a linear relationship, may not handle complex relationships as well as non-linear models.


2. Random Forest Classifier (Winning Team Prediction):

Pros: Flexible, handles non-linear relationships, robust to overfitting, and can capture complex decision boundaries.

Cons: More complex and less interpretable compared to logistic regression.

#### Base models


#### Total Goal Count

In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train['totalGoalCount'])

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
mse_lr = mean_squared_error(y_test['totalGoalCount'], y_pred_lr)
r2_lr = r2_score(y_test['totalGoalCount'], y_pred_lr)

# Print metrics with up to 4 significant figures
print("Linear Regression - Mean Squared Error: {:.4g}".format(mse_lr))
print("Linear Regression - R^2 Score: {:.4g}".format(r2_lr))


Linear Regression - Mean Squared Error: 3.329e-30
Linear Regression - R^2 Score: 1


Our model's near-zero MSE and perfect R² score suggest exceptional accuracy but raise overfitting concerns. It's reasonable to re-examine your data and model complexity, and conduct independent tests or cross-validation for validation on unseen data.

#### Total Cards

In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train a Linear Regression model for Total Cards
lr_model = LinearRegression()
lr_model.fit(X_train, y_train['total_cards'])  # Change target to 'total_cards'

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model for Total Cards
mse_lr = mean_squared_error(y_test['total_cards'], y_pred_lr)  # Change to 'total_cards'
r2_lr = r2_score(y_test['total_cards'], y_pred_lr)  # Change to 'total_cards'

# Print metrics with up to 4 significant figures
print("Linear Regression for Total Cards - Mean Squared Error: {:.4g}".format(mse_lr))
print("Linear Regression for Total Cards - R^2 Score: {:.4g}".format(r2_lr))


Linear Regression for Total Cards - Mean Squared Error: 1.211e-30
Linear Regression for Total Cards - R^2 Score: 1


The Linear Regression model for Total Cards, with an MSE of 1.211e-30 and an R² score of 1, indicates near-perfect predictions but suggests potential overfitting to our training data.

#### Winning Team

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a Logistic Regression model for Winning Team Prediction
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train['winningTeam_1x2'])

# Predict on the test set
y_pred_logreg = logreg_model.predict(X_test)

# Evaluate the model
accuracy_logreg = accuracy_score(y_test['winningTeam_1x2'], y_pred_logreg)
report_logreg = classification_report(y_test['winningTeam_1x2'], y_pred_logreg)

# Print evaluation metrics
print("Logistic Regression for Winning Team - Accuracy:", accuracy_logreg)
print("Classification Report:\n", report_logreg)


ValueError: Input y contains NaN.

#### Second Model

#### Total Goal Count

In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train['totalGoalCount'])

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test['totalGoalCount'], y_pred_rf)
r2_rf = r2_score(y_test['totalGoalCount'], y_pred_rf)

# Print metrics with up to 4 significant figures
print("Random Forest Regressor - Mean Squared Error: {:.4g}".format(mse_rf))
print("Random Forest Regressor - R^2 Score: {:.4g}".format(r2_rf))


Random Forest Regressor - Mean Squared Error: 0.001853
Random Forest Regressor - R^2 Score: 0.9982


Our Random Forest model shows an MSE of 0.001853 and an R² of 0.9982, indicating high accuracy and excellent fit to the data. The low MSE suggests minimal error in predictions, while the R² close to 1 reflects the model's effectiveness in explaining the variability of the target variable.

#### Total Cards

In [61]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train a Random Forest Regressor for Total Cards
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train['total_cards'])  # Change target to 'total_cards'

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model for Total Cards
mse_rf = mean_squared_error(y_test['total_cards'], y_pred_rf)  # Change to 'total_cards'
r2_rf = r2_score(y_test['total_cards'], y_pred_rf)  # Change to 'total_cards'

# Print metrics with up to 4 significant figures
print("Random Forest Regressor for Total Cards - Mean Squared Error: {:.4g}".format(mse_rf))
print("Random Forest Regressor for Total Cards - R^2 Score: {:.4g}".format(r2_rf))


Random Forest Regressor for Total Cards - Mean Squared Error: 0.003896
Random Forest Regressor for Total Cards - R^2 Score: 0.9962


The Random Forest model for Total Cards shows an MSE of 0.003896 and an R² of 0.9962, indicating high accuracy and a strong fit to the data.

#### Winning Team

In [64]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest Classifier for Winning Team Prediction
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train['winningTeam_1x2'])

# Predict on the test set
y_pred_rf_class = rf_classifier.predict(X_test)

# Evaluate the model
accuracy_rf_class = accuracy_score(y_test['winningTeam_1x2'], y_pred_rf_class)
report_rf_class = classification_report(y_test['winningTeam_1x2'], y_pred_rf_class)

# Print evaluation metrics
print("Random Forest Classifier for Winning Team - Accuracy:", accuracy_rf_class)
print("Classification Report:\n", report_rf_class)


ValueError: Input y contains NaN.