# Import relevant libraries

In [1]:
import os
import glob
import re
import numpy as np
import pandas as pd
import unicodedata

In [2]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; later cells modify filtered frames in place, which triggers it.
pd.set_option('mode.chained_assignment', None)

# Set up paths

In [3]:
# Data directories, expressed relative to the notebook's working directory.
# raw_data_path is read by the loading cells; cleaned_data_path receives the
# CSV files written at the end of the notebook.
base_path = os.path.join('..', 'datasets')
raw_data_path, cleaned_data_path = (
    os.path.join(base_path, subdir) for subdir in ('raw_data', 'cleaned_data')
)

# Load data

In [4]:
# Collect the sub-folders of the raw data directory.
# os.listdir returns entries in arbitrary order, so the list is sorted: the
# load order determines the row order of the merged table, and it should be
# the same on every run and on every platform.
folders = sorted(
    os.path.join(raw_data_path, name)
    for name in os.listdir(raw_data_path)
    if os.path.isdir(os.path.join(raw_data_path, name))
)
print(folders)

['../datasets/raw_data/1992', '../datasets/raw_data/1993', '../datasets/raw_data/1994', '../datasets/raw_data/1995', '../datasets/raw_data/1996', '../datasets/raw_data/1997', '../datasets/raw_data/1998', '../datasets/raw_data/1999', '../datasets/raw_data/2000', '../datasets/raw_data/2001', '../datasets/raw_data/2002', '../datasets/raw_data/2003', '../datasets/raw_data/2004', '../datasets/raw_data/2005', '../datasets/raw_data/2006', '../datasets/raw_data/2007', '../datasets/raw_data/2008', '../datasets/raw_data/2009', '../datasets/raw_data/2010', '../datasets/raw_data/2011', '../datasets/raw_data/2012', '../datasets/raw_data/2013', '../datasets/raw_data/2014', '../datasets/raw_data/2015', '../datasets/raw_data/2016', '../datasets/raw_data/2017', '../datasets/raw_data/2018', '../datasets/raw_data/2019', '../datasets/raw_data/2020']


In [5]:
# Load every CSV found in the raw data folders and merge them into one table.
# The frames are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies all previously loaded rows each time.
frames = []
reference_columns = None  # columns of the first file, used as the schema to check against

for folder in folders:

    # glob returns matches in arbitrary order; sort for a reproducible row order
    file_paths = sorted(glob.glob(os.path.join(folder, '*.csv')))

    for file_path in file_paths:

        frame = pd.read_csv(file_path)

        # drop the 'transfer_period' column when present so that all frames
        # share the same set of columns
        if 'transfer_period' in frame.columns:
            frame = frame.drop(columns='transfer_period')

        if reference_columns is None:
            reference_columns = frame.columns
        else:
            # Index.equals returns False when the number of columns differs,
            # whereas an element-wise `==` raises ValueError before the
            # assertion message can be shown.
            assert reference_columns.equals(frame.columns), 'Columns do not match!'

        frames.append(frame)

# Fail here with a clear message instead of leaving a non-DataFrame behind
# for the following cells to trip over.
if not frames:
    raise FileNotFoundError(f'No CSV files found in: {folders}')

# The index is intentionally not reset here; a later cell renumbers the rows.
denorm_table = pd.concat(frames)

In [6]:
# Drop the columns that are not needed downstream, then renumber the rows
# 0..n-1 so the index no longer carries the per-file row numbers left over
# from concatenation.
denorm_table = (
    denorm_table
    .drop(columns=['league_name', 'season'])
    .reset_index(drop=True)
)
print(f'row counts: {len(denorm_table)}')
denorm_table.head()

row counts: 168641


Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,year
0,Fortuna Sittard,René Hofman,31.0,Right Winger,Roda JC,?,in,,1992
1,Fortuna Sittard,Peter Schmitz,21.0,Forward,Helmond Sport,?,in,,1992
2,Fortuna Sittard,Kenneth Nysaether,22.0,Centre-Forward,Lillestrøm SK,?,in,,1992
3,Fortuna Sittard,Arno van Zwam,22.0,Goalkeeper,Fortuna U19,-,in,0.0,1992
4,Fortuna Sittard,Jerry Taihuttu,22.0,Centre-Forward,Helmond Sport,Loan,out,0.0,1992


# Preprocessing

In [7]:
# Discard rows that are exact duplicates of an earlier row (the first
# occurrence is kept) and renumber the remaining rows.
denorm_table = denorm_table.drop_duplicates().reset_index(drop=True)
print(f'row counts: {len(denorm_table)}')
denorm_table.head()

row counts: 164703


Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,year
0,Fortuna Sittard,René Hofman,31.0,Right Winger,Roda JC,?,in,,1992
1,Fortuna Sittard,Peter Schmitz,21.0,Forward,Helmond Sport,?,in,,1992
2,Fortuna Sittard,Kenneth Nysaether,22.0,Centre-Forward,Lillestrøm SK,?,in,,1992
3,Fortuna Sittard,Arno van Zwam,22.0,Goalkeeper,Fortuna U19,-,in,0.0,1992
4,Fortuna Sittard,Jerry Taihuttu,22.0,Centre-Forward,Helmond Sport,Loan,out,0.0,1992


In [8]:
# fix player_name by removing accent from alphabets

def strip_accents(text):
    """Return `text` with accents removed, keeping only ASCII characters.

    The string is decomposed with Unicode NFD normalisation, which turns an
    accented letter into its base letter followed by combining marks; every
    non-ASCII code point is then discarded, e.g. 'René' -> 'Rene'.

    Note: letters without a canonical decomposition (for example 'ø' or 'ß')
    are removed entirely rather than transliterated.

    Parameters
    ----------
    text : str, bytes or other
        Text to clean. Bytes are decoded as UTF-8 first.

    Returns
    -------
    str
        The ASCII-only version of `text`. Values that are not text (None, a
        float NaN standing for a missing name, ...) are returned unchanged
        instead of raising TypeError inside unicodedata.normalize.
    """
    # Explicit replacement for the Python 2 `unicode(text, 'utf-8')` call,
    # which can never succeed on Python 3 because the name does not exist.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    if not isinstance(text, str):
        return text

    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

# Replace every player name with its accent-free version
denorm_table['player_name'] = denorm_table['player_name'].map(strip_accents)

In [9]:
# Count the rows each player has within a year:
#   num_transfer - running number (1, 2, ...) of the row inside its
#                  (player_name, year) group
#   max_transfer - highest num_transfer of that group, i.e. the total number
#                  of rows the player has in that year
# cumcount() numbers the rows by itself, so no helper 'count' column is needed.
# The aggregation is given by name ('max'): passing np.max to transform is
# deprecated in recent pandas releases and emits a FutureWarning.
group_keys = ['player_name', 'year']
denorm_table['num_transfer'] = denorm_table.groupby(group_keys).cumcount() + 1
denorm_table['max_transfer'] = (
    denorm_table.groupby(group_keys)['num_transfer'].transform('max')
)
print(f'row counts: {len(denorm_table)}')
denorm_table.head()

row counts: 164703


Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,year,num_transfer,max_transfer
0,Fortuna Sittard,Rene Hofman,31.0,Right Winger,Roda JC,?,in,,1992,1,2
1,Fortuna Sittard,Peter Schmitz,21.0,Forward,Helmond Sport,?,in,,1992,1,1
2,Fortuna Sittard,Kenneth Nysaether,22.0,Centre-Forward,Lillestrøm SK,?,in,,1992,1,1
3,Fortuna Sittard,Arno van Zwam,22.0,Goalkeeper,Fortuna U19,-,in,0.0,1992,1,1
4,Fortuna Sittard,Jerry Taihuttu,22.0,Centre-Forward,Helmond Sport,Loan,out,0.0,1992,1,1


In [10]:
# Exclude players that have more than 8 rows in a single year, which the
# notebook treats as "transferred over 4 times a year" (presumably because a
# move is recorded once per club involved - verify against the raw data).
# A flagged player is removed by name, so all of their rows in every year go.
too_many_rows = denorm_table['max_transfer'] > 8
remove_players = denorm_table.loc[too_many_rows, 'player_name'].unique()

keep_row = ~denorm_table['player_name'].isin(remove_players)
denorm_table = (
    denorm_table[keep_row]
    .drop(columns=['num_transfer', 'max_transfer'])  # helper columns are no longer needed
    .reset_index(drop=True)
)
print(f'row counts: {len(denorm_table)}')
denorm_table.head()

row counts: 163001


Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,year
0,Fortuna Sittard,Rene Hofman,31.0,Right Winger,Roda JC,?,in,,1992
1,Fortuna Sittard,Peter Schmitz,21.0,Forward,Helmond Sport,?,in,,1992
2,Fortuna Sittard,Kenneth Nysaether,22.0,Centre-Forward,Lillestrøm SK,?,in,,1992
3,Fortuna Sittard,Arno van Zwam,22.0,Goalkeeper,Fortuna U19,-,in,0.0,1992
4,Fortuna Sittard,Jerry Taihuttu,22.0,Centre-Forward,Helmond Sport,Loan,out,0.0,1992


In [11]:
# Save the cleaned transfer table to csv.
# The output folder is created first so the cell also works on a fresh
# checkout where the cleaned-data folder does not exist yet.
os.makedirs(cleaned_data_path, exist_ok=True)
denorm_table.to_csv(os.path.join(cleaned_data_path, 'node_edge_data.csv'), index=False)

For performance analysis:
- Build a table of the transfer records that carry a paid fee (fee text starting with `£`), i.e. players who were bought at least once. This table is used for performance evaluation.

In [12]:
# Get a table of players that were bought at least once: keep the rows whose
# fee text starts with '£'.
pattern = r"£"

# Vectorised check instead of a Python loop over `denorm_table['fee'][i]`,
# which was slow and only correct while the index is exactly 0..n-1.
# Series.str.match anchors at the start of the string like re.match does.
# Fees are converted to text first; na=False makes any value that is still
# missing count as "no match".
slicer = denorm_table['fee'].astype(str).str.match(pattern, na=False)

perf_eval_table = denorm_table[slicer].reset_index(drop=True)
print(f'number of observation: {len(perf_eval_table)}')
perf_eval_table.head()

number of observation: 26654


Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,year
0,Ajax Amsterdam,Marc Overmars,19.0,Left Winger,Willem II,£817Th.,in,0.817,1992
1,Ajax Amsterdam,Jari Litmanen,21.0,Attacking Midfield,MYPA,£13Th.,in,0.013,1992
2,Ajax Amsterdam,Aron Winter,25.0,Defensive Midfield,Lazio,£1.17m,out,1.17,1992
3,Sparta Rotterdam,Winston Bogarde,21.0,Centre-Back,Excelsior,£5Th.,in,0.005,1992
4,Willem II Tilburg,Marc Overmars,19.0,Left Winger,Ajax,£817Th.,out,0.817,1992


In [13]:
# The raw fee text has served its purpose as a filter; drop it from the
# evaluation table (the fee_cleaned column is kept).
perf_eval_table = perf_eval_table.drop(columns=['fee'])

In [14]:
# Remove rows that have a missing value in any column, then renumber the rows
perf_eval_table = perf_eval_table.dropna().reset_index(drop=True)

In [15]:
# Save the performance-evaluation table to csv.
# The output folder is created first so the write does not fail when the
# cleaned-data folder is missing.
os.makedirs(cleaned_data_path, exist_ok=True)
perf_eval_table.to_csv(os.path.join(cleaned_data_path, 'perf_eval_table.csv'), index=False)