#### This script combines all raw files collected by Yang (including the new 2000 - 2009 files) into one aggregate file.
#### The resulting DataFrame will be the base dataset used for SL.
#### _Note: other columns will be added to this result DF in order to create an enhanced SL dataset_
#### _date: 20230621_


- Columns to keep from each raw file are: ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','HTHG','HTAG','HTR',
'HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']

- New column added: season (int, 1-22; one value per raw file, numbered in the order the files are listed)


In [11]:
import pandas as pd
import numpy as np
import os

# Console display settings: use a 650-character line width for both pandas
# and numpy output, and let pandas show up to 22 columns before truncating.
desired_width = 650
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 22)
pd.set_option('display.width', desired_width)

In [12]:
# Columns kept from every raw file, in the order they appear in the output.
# NOTE(review): the abbreviations presumably follow the data provider's
# column notation -- confirm against the raw files' documentation.
sel_cols = [
    'Date', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR',
    'HTHG', 'HTAG', 'HTR',
    'HS', 'AS',
    'HST', 'AST',
    'HF', 'AF',
    'HC', 'AC',
    'HY', 'AY',
    'HR', 'AR',
]

In [13]:
len(sel_cols)  # sanity check: number of columns selected from each raw file

21

In [14]:
# final_path = "C:\\Users\\Erick\\PycharmProjects\\pythonProject1\\inputs\\"
raw_path = "INPUTS/raw_files/"
# os.listdir() returns directory entries in arbitrary order, and the season
# number given to each file is derived from its position in this list, so the
# names are sorted to make the numbering deterministic.  Non-CSV entries
# (e.g. notebook checkpoint folders) are skipped because every entry is later
# passed to pd.read_csv.
raw_files_list = sorted(
    file_name
    for file_name in os.listdir(raw_path)
    if file_name.lower().endswith(".csv")
)
raw_files_list

['2000-2001.csv',
 '2001-2002.csv',
 '2002-2003.csv',
 '2003-2004.csv',
 '2004-2005.csv',
 '2005-2006.csv',
 '2006-2007.csv',
 '2007-2008.csv',
 '2008-2009.csv',
 '2009-2010.csv',
 '2010-2011.csv',
 '2011-2012.csv',
 '2012-2013.csv',
 '2013-2014.csv',
 '2014-2015.csv',
 '2015-2016.csv',
 '2016-2017.csv',
 '2017-2018.csv',
 '2018-2019.csv',
 '2019-2020.csv',
 '2020-2021.csv',
 '2021-2022.csv']

In [15]:
def modify_date_col(date_column):
    """Convert day-first date strings into ``datetime.date`` values.

    Each value is split on ``/`` as day, month, year, rewritten as an ISO
    ``YYYY-MM-DD`` string and parsed with an explicit format, so the result
    does not depend on pandas' date-format inference and columns mixing
    2-digit and 4-digit years are handled.

    Parameters
    ----------
    date_column : iterable of str
        Dates written as ``dd/mm/yy`` or ``dd/mm/yyyy``.  Missing values
        (``None`` / ``NaN``) and blank strings are allowed.

    Returns
    -------
    pandas.Series
        Object-dtype Series of ``datetime.date`` values, with ``NaT`` where
        the input was missing.  When ``date_column`` is a Series its index is
        preserved, so assigning the result back to a DataFrame column aligns
        row-for-row even if the frame does not use a default RangeIndex.

    Raises
    ------
    ValueError
        If a non-missing value is not three ``/``-separated integers or does
        not describe a valid calendar date.
    """
    # Keep the caller's index; a fresh RangeIndex would misalign on assignment.
    index = date_column.index if isinstance(date_column, pd.Series) else None

    iso_dates = []
    for date_val in date_column:
        if pd.isna(date_val):
            iso_dates.append(None)
            continue

        text = str(date_val).strip()
        if not text:
            iso_dates.append(None)
            continue

        parts = text.split("/")
        if len(parts) != 3:
            raise ValueError("expected a dd/mm/yy date, got {!r}".format(date_val))
        day, month, year = (int(part) for part in parts)

        # Expand 2-digit years with the same pivot as strptime's %y:
        # 00-68 -> 2000-2068, 69-99 -> 1969-1999.
        if len(parts[2].strip()) <= 2:
            year += 2000 if year <= 68 else 1900

        iso_dates.append("{:04d}-{:02d}-{:02d}".format(year, month, day))

    parsed = pd.to_datetime(
        pd.Series(iso_dates, index=index, dtype=object),
        format="%Y-%m-%d",
    )
    # Drop the time component; only the calendar date is kept.
    return parsed.dt.date

In [16]:
# Build one frame per raw file, tag it with its season number, and stack them.
frames = []
# Iterate in sorted file-name order: os.listdir() gives no ordering guarantee,
# and the season number (i + 1) is taken from the position in this sequence.
for i, file_name in enumerate(sorted(raw_files_list)):
    df1 = pd.read_csv(os.path.join(raw_path, file_name))
    # .copy() makes the column subset an independent frame before it is modified.
    df1 = df1[sel_cols].copy()
    df1["Date"] = modify_date_col(df1["Date"])
    df1.insert(loc=0, column='season', value=i + 1)
    frames.append(df1)

# ignore_index=True renumbers rows 0..n-1 across all seasons.
result = pd.concat(frames, ignore_index=True)
print(result.shape)
result.sample(5)

(8020, 22)


Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
2114,6,2006-01-21,Birmingham,Portsmouth,5,0,H,2,0,H,9,10,6,4,11,23,7,2,1,1,0,0
6744,18,2018-03-03,Swansea,West Ham,4,1,H,2,0,H,9,7,8,2,9,14,4,5,0,3,0,0
556,2,2001-12-23,Chelsea,Bolton,5,1,H,2,1,H,16,10,5,4,7,21,4,4,2,2,0,0
7290,21,2020-10-03,Leeds,Man City,1,1,D,0,1,A,12,23,7,2,11,12,7,10,2,1,0,0
1289,4,2003-12-07,Southampton,Charlton,3,2,H,2,0,H,18,20,14,14,12,12,8,9,0,0,0,0


In [20]:
# Write the aggregated frame to disk without the row index.
output_csv = 'INPUTS/sl_base_dataset_20230621.csv'
result.to_csv(output_csv, index=False)

In [9]:
# with pd.ExcelWriter('INPUTS/erick_june21.xlsx') as writer:  
#     result.to_excel(writer, sheet_name='all_seasons', index=False)