# Import relevant libraries

In [1]:
import os
import re
import sqlite3
import numpy as np
import pandas as pd
import datetime
import tarfile

In [2]:
# silence pandas' chained-assignment warning (treated here as a false positive)
pd.set_option('mode.chained_assignment', None)

# Unzip .tar.gz file

In [3]:
# directory layout: <base>/raw_data for inputs, <base>/cleaned_data for outputs
base_path = os.path.join('..', 'datasets')
raw_data_path, cleaned_data_path = (
    os.path.join(base_path, folder) for folder in ('raw_data', 'cleaned_data')
)

In [4]:
# set file path: location of the compressed archive inside the raw data folder
# (the name `file_path` is reused further down for the CSV destinations)
file_path = os.path.join(raw_data_path, 'fifa_data.tar.gz')

In [5]:
# extract .tar.gz into the raw data folder
# the `with` block closes the archive handle even if extraction raises
# NOTE(review): `extractall` trusts the member paths stored in the archive;
# only run this on archives from a trusted source
with tarfile.open(file_path, "r:gz") as tar:
    tar.extractall(path = raw_data_path)

# Connect to database

In [6]:
# set db_path: `database.sqlite` inside the raw data folder
# (presumably the file produced by extracting the archive there — verify)
db_path = os.path.join(raw_data_path, 'database.sqlite')

In [7]:
# connect to the database
# NOTE(review): this connection is never closed in the visible cells —
# confirm whether it should be closed once all tables have been loaded
connect = sqlite3.connect(db_path)

# ETL

In [8]:
# cursor on the open connection, used below to run raw SQL statements
cursor = connect.cursor()

In [9]:
# get the names of all tables from the SQLite schema catalogue
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

# every fetched row is a 1-tuple `(name,)`; take the name directly instead of
# regex-parsing the repr of the result list, which would split any table name
# containing non-word characters
# NOTE: the list holds *table* names; the variable keeps the name `columns`
# because the loading loop in a later cell iterates over it
columns = [row[0] for row in cursor.fetchall()]
print(columns)

['sqlite_sequence', 'Player_Attributes', 'Player', 'Match', 'League', 'Country', 'Team', 'Team_Attributes']


In [10]:
# load each table into a module-level DataFrame named after the lower-cased table
for table_name in columns:
    frame_name = table_name.lower()
    query = f"SELECT * from {table_name}"
    # `vars()` at cell (module) level is the notebook's global namespace
    vars()[frame_name] = pd.read_sql_query(query, connect)
    print(f'The table `{frame_name}` was created.')

The table `sqlite_sequence` was created.
The table `player_attributes` was created.
The table `player` was created.
The table `match` was created.
The table `league` was created.
The table `country` was created.
The table `team` was created.
The table `team_attributes` was created.


# Preprocessing

In [11]:
# keep only the player identifier and the player name
player = player.loc[:, ['player_api_id', 'player_name']]
player.head()

Unnamed: 0,player_api_id,player_name
0,505942,Aaron Appindangoye
1,155782,Aaron Cresswell
2,162549,Aaron Doran
3,30572,Aaron Galindo
4,23780,Aaron Hughes


In [12]:
# list the available columns of `player_attributes` before choosing which to keep
print(player_attributes.columns)

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')


In [13]:
# keep only the player identifier, the rating date and the overall rating
player_attributes = player_attributes.loc[:, ['player_api_id', 'date', 'overall_rating']]
player_attributes.head()

Unnamed: 0,player_api_id,date,overall_rating
0,505942,2016-02-18 00:00:00,67.0
1,505942,2015-11-19 00:00:00,67.0
2,505942,2015-09-21 00:00:00,62.0
3,505942,2015-03-20 00:00:00,61.0
4,505942,2007-02-22 00:00:00,61.0


In [14]:
# derive the 4-digit year (kept as a string, e.g. '2016') from `date`,
# store it in a new column `year`, and remove the `date` column.
# `DataFrame.drop` returns a new frame rather than modifying in place, so the
# result is assigned back; otherwise `date` would silently stay in the table.
player_attributes = (
    player_attributes
    .assign(year = pd.to_datetime(player_attributes['date']).dt.strftime('%Y'))
    .drop(columns = 'date')
)
player_attributes

Unnamed: 0,player_api_id,overall_rating,year
0,505942,67.0,2016
1,505942,67.0,2015
2,505942,62.0,2015
3,505942,61.0,2015
4,505942,61.0,2007
...,...,...,...
183973,39902,83.0,2009
183974,39902,78.0,2009
183975,39902,77.0,2008
183976,39902,78.0,2007


In [15]:
# average overall rating per player and year, flattened back into columns
group = player_attributes.groupby(['player_api_id', 'year'])
df = group['overall_rating'].mean().reset_index()

In [16]:
# attach player names to the yearly ratings (left join keeps every rating row),
# then discard the join key, which is no longer needed
denorm_table = (
    df.merge(player, on = 'player_api_id', how = 'left')
      .drop(columns = 'player_api_id')
)
denorm_table

Unnamed: 0,year,overall_rating,player_name
0,2007,63.000000,"Patryk Rachwal,18"
1,2008,60.000000,"Patryk Rachwal,18"
2,2010,60.000000,"Patryk Rachwal,18"
3,2011,58.500000,"Patryk Rachwal,18"
4,2012,58.000000,"Patryk Rachwal,18"
...,...,...,...
73054,2007,56.000000,Rees Greenwood
73055,2015,56.000000,Rees Greenwood
73056,2016,57.333333,Rees Greenwood
73057,2007,58.000000,Alexandre Azevedo


In [17]:
# set destination path
file_path = os.path.join(cleaned_data_path,'player_rating.csv')
# make sure the output folder exists: `to_csv` does not create missing directories
os.makedirs(cleaned_data_path, exist_ok = True)
# save csv (without the positional index)
denorm_table.to_csv(file_path, index =  False)

## Get the top players (define $z_{rating} \ge 1$)

In [18]:
def z_score(array):
    """Standardise `array`: subtract its mean and divide by its standard deviation.

    The mean and standard deviation are obtained with `np.mean` and `np.std`
    (NumPy's default `ddof=0`). The result has the same shape/type as
    `array - scalar` yields for the given input.
    """
    centre = np.mean(array)
    spread = np.std(array)
    return (array - centre) / spread

In [19]:
# standardised overall rating, computed over all player-year rows
denorm_table['z'] = denorm_table['overall_rating'].pipe(z_score)

In [20]:
# top players: rows whose standardised rating is at least 1
top_player = denorm_table.loc[denorm_table['z'].ge(1)]

In [21]:
# set destination path
file_path = os.path.join(cleaned_data_path,'top_player_fifa.csv')
# make sure the output folder exists: `to_csv` does not create missing directories
os.makedirs(cleaned_data_path, exist_ok = True)
# save csv (without the positional index)
top_player.to_csv(file_path, index =  False)