# Overview

Objective:  
- consolidate game data files obtained from [footystats](https://footystats.org/).  

Note:  
- main data items are those relevant to match identification and goal timings.

# Setup

## Imports

In [1]:
# data wrangling
import json
import numpy as np
import pandas as pd
# files, system
import os

# Read and consolidate files

In [2]:
RAW_DATA_PATH = '../data/raw/footystats'

get names of individual csv files (to be consolidated in next step)

## Read

In [3]:
os.listdir(RAW_DATA_PATH)

['australia-a-league-matches-2016-to-2017-stats.csv',
 'australia-a-league-matches-2019-to-2020-stats.csv',
 'russia-russian-premier-league-matches-2019-to-2020-stats.csv',
 'netherlands-eredivisie-matches-2018-to-2019-stats.csv',
 'russia-russian-premier-league-matches-2016-to-2017-stats.csv',
 'italy-serie-a-matches-2018-to-2019-stats.csv',
 'italy-serie-b-matches-2017-to-2018-stats.csv',
 'spain-segunda-division-matches-2018-to-2019-stats.csv',
 'germany-bundesliga-matches-2015-to-2016-stats.csv',
 'france-ligue-2-matches-2016-to-2017-stats.csv',
 'france-ligue-2-matches-2019-to-2020-stats.csv',
 'japan-j1-league-matches-2016-to-2016-stats.csv',
 'usa-mls-matches-2018-to-2018-stats.csv',
 'australia-a-league-matches-2015-to-2016-stats.csv',
 'england-premier-league-matches-2017-to-2018-stats.csv',
 'russia-russian-premier-league-matches-2015-to-2016-stats.csv',
 'argentina-primera-division-matches-2015-to-2015-stats.csv',
 'spain-la-liga-matches-2018-to-2019-stats.csv',
 'germany-bu

## Consolidate

consolidate data into one pandas DataFrame, with a new column identifying the country

In [4]:
COUNTRY_INDEX = 0

df = pd.DataFrame()

for filename in os.listdir(RAW_DATA_PATH):
    df_individual = pd.read_csv('/'.join([RAW_DATA_PATH, filename]), parse_dates=['date_GMT'])
    country = filename.split('-')[COUNTRY_INDEX]
    df_individual['country'] = country
    df = pd.concat([df, df_individual])
df

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Pre-Match PPG (Home),Pre-Match PPG (Away),home_ppg,...,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name,country
0,1475830200,2016-10-07 08:50:00,complete,15805.0,Brisbane Roar,Melbourne Victory FC,,0.00,0.00,1.86,...,3.56,3.02,0.0,0.0,0.0,0.0,0.0,0.0,Suncorp Stadium,australia
1,1475908500,2016-10-08 06:35:00,complete,10034.0,Wellington Phoenix,Melbourne City FC,,0.00,0.00,1.36,...,3.67,2.41,0.0,0.0,0.0,0.0,0.0,0.0,Westpac Stadium,australia
2,1475916600,2016-10-08 08:50:00,complete,61880.0,Western Sydney Wanderers,Sydney FC,,0.00,0.00,1.36,...,3.51,3.47,0.0,0.0,0.0,0.0,0.0,0.0,ANZ Stadium (Sydney),australia
3,1475924400,2016-10-08 11:00:00,complete,9501.0,Perth Glory FC,Central Coast Mariners,,0.00,0.00,1.93,...,4.24,5.89,0.0,0.0,0.0,0.0,0.0,0.0,nib Stadium,australia
4,1475992800,2016-10-09 06:00:00,complete,9145.0,Newcastle Jets FC,Adelaide United,,0.00,0.00,1.15,...,3.75,1.96,0.0,0.0,0.0,0.0,0.0,0.0,McDonald Jones Stadium,australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,1495373400,2017-05-21 13:30:00,complete,38423.0,Kaiserslautern,Nürnberg,,1.63,1.31,1.71,...,3.82,5.14,0.0,0.0,0.0,0.0,0.0,0.0,Fritz-Walter-Stadion,germany
302,1495373400,2017-05-21 13:30:00,complete,13090.0,Greuther Fürth,Union Berlin,,1.81,1.31,1.71,...,3.55,3.48,0.0,0.0,0.0,0.0,0.0,0.0,Sportpark Ronhof Thomas Sommer,germany
303,1495373400,2017-05-21 13:30:00,complete,37320.0,Fortuna Düsseldorf,Erzgebirge Aue,,1.00,0.88,1.12,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,ESPRIT arena (Düsseldorf),germany
304,1495373400,2017-05-21 13:30:00,complete,12470.0,Sandhausen,Hannover 96,,1.44,1.38,1.41,...,4.00,1.66,0.0,0.0,0.0,0.0,0.0,0.0,BWT-Stadion am Hardtwald,germany


## Filter columns

In [5]:
df.columns

Index(['timestamp', 'date_GMT', 'status', 'attendance', 'home_team_name',
       'away_team_name', 'referee', 'Pre-Match PPG (Home)',
       'Pre-Match PPG (Away)', 'home_ppg', 'away_ppg', 'home_team_goal_count',
       'away_team_goal_count', 'total_goal_count', 'total_goals_at_half_time',
       'home_team_goal_count_half_time', 'away_team_goal_count_half_time',
       'home_team_goal_timings', 'away_team_goal_timings',
       'home_team_corner_count', 'away_team_corner_count',
       'home_team_yellow_cards', 'home_team_red_cards',
       'away_team_yellow_cards', 'away_team_red_cards',
       'home_team_first_half_cards', 'home_team_second_half_cards',
       'away_team_first_half_cards', 'away_team_second_half_cards',
       'home_team_shots', 'away_team_shots', 'home_team_shots_on_target',
       'away_team_shots_on_target', 'home_team_shots_off_target',
       'away_team_shots_off_target', 'home_team_fouls', 'away_team_fouls',
       'home_team_possession', 'away_team_possession

Not all columns are relevant to our research, so we keep relevant columns.

In [6]:
cols = ['timestamp', 'date_GMT', 'status', 'home_team_name',
       'away_team_name', 'home_team_goal_count',
       'away_team_goal_count', 'home_team_goal_timings', 
        'away_team_goal_timings', 'country']

In [7]:
df = df[cols]
df

Unnamed: 0,timestamp,date_GMT,status,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,home_team_goal_timings,away_team_goal_timings,country
0,1475830200,2016-10-07 08:50:00,complete,Brisbane Roar,Melbourne Victory FC,1,1,90'6,83,australia
1,1475908500,2016-10-08 06:35:00,complete,Wellington Phoenix,Melbourne City FC,0,1,,31,australia
2,1475916600,2016-10-08 08:50:00,complete,Western Sydney Wanderers,Sydney FC,0,4,,51558589,australia
3,1475924400,2016-10-08 11:00:00,complete,Perth Glory FC,Central Coast Mariners,3,3,32735,568486,australia
4,1475992800,2016-10-09 06:00:00,complete,Newcastle Jets FC,Adelaide United,1,1,17,29,australia
...,...,...,...,...,...,...,...,...,...,...
301,1495373400,2017-05-21 13:30:00,complete,Kaiserslautern,Nürnberg,1,0,20,,germany
302,1495373400,2017-05-21 13:30:00,complete,Greuther Fürth,Union Berlin,1,2,66,3878,germany
303,1495373400,2017-05-21 13:30:00,complete,Fortuna Düsseldorf,Erzgebirge Aue,1,0,39,,germany
304,1495373400,2017-05-21 13:30:00,complete,Sandhausen,Hannover 96,1,1,57,60,germany


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110503 entries, 0 to 305
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   timestamp               110503 non-null  int64         
 1   date_GMT                110503 non-null  datetime64[ns]
 2   status                  110503 non-null  object        
 3   home_team_name          110503 non-null  object        
 4   away_team_name          110503 non-null  object        
 5   home_team_goal_count    110503 non-null  int64         
 6   away_team_goal_count    110503 non-null  int64         
 7   home_team_goal_timings  83272 non-null   object        
 8   away_team_goal_timings  72300 non-null   object        
 9   country                 110503 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 9.3+ MB


# Filter out non-relevant matches

Relevant matches must have `status` complete.

In [9]:
# status - complete
df = df[df['status'] == 'complete'].copy()

Research decision: use matches from May, 2015 on.

In [10]:
# date - from May, 2015
df = df[df['date_GMT'] >= '2015-05'].copy()
df

Unnamed: 0,timestamp,date_GMT,status,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,home_team_goal_timings,away_team_goal_timings,country
0,1475830200,2016-10-07 08:50:00,complete,Brisbane Roar,Melbourne Victory FC,1,1,90'6,83,australia
1,1475908500,2016-10-08 06:35:00,complete,Wellington Phoenix,Melbourne City FC,0,1,,31,australia
2,1475916600,2016-10-08 08:50:00,complete,Western Sydney Wanderers,Sydney FC,0,4,,51558589,australia
3,1475924400,2016-10-08 11:00:00,complete,Perth Glory FC,Central Coast Mariners,3,3,32735,568486,australia
4,1475992800,2016-10-09 06:00:00,complete,Newcastle Jets FC,Adelaide United,1,1,17,29,australia
...,...,...,...,...,...,...,...,...,...,...
301,1495373400,2017-05-21 13:30:00,complete,Kaiserslautern,Nürnberg,1,0,20,,germany
302,1495373400,2017-05-21 13:30:00,complete,Greuther Fürth,Union Berlin,1,2,66,3878,germany
303,1495373400,2017-05-21 13:30:00,complete,Fortuna Düsseldorf,Erzgebirge Aue,1,0,39,,germany
304,1495373400,2017-05-21 13:30:00,complete,Sandhausen,Hannover 96,1,1,57,60,germany


# Export

In [11]:
FOOTYSTATS_FILEPATH = '../data/raw/footystats/footystats.csv'

In [12]:
df.to_csv(FOOTYSTATS_FILEPATH, index=False)