# Transfers Capstone - Data Wrangling

### Data Collection: 2016-2019 Transfers Data
Data sourced from: https://github.com/ewenme/transfers

### I. Loading Packages
 - Loading packages and setting up the environment

In [1]:
# Load python packages
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Checking my working directory
os.getcwd()

'/home/tdraths/sb_assignments/Transfers_Capstone'

In [3]:
# Ensuring I have the folders containing my datasets
os.listdir("/home/tdraths/sb_assignments/Transfers_Capstone")

['data', '.ipynb_checkpoints', 'figures', 'models', 'Data_Wrangling.ipynb']

In [4]:
# My preferred method to begin loading a csv into a dataframe
path = "/home/tdraths/sb_assignments/Transfers_Capstone/data/original_data_sources"
os.chdir(path)

### II. Reading CSV into Dataframe
 - Transfer data for the 2019 season of the English Championship
 - Data source: https://github.com/ewenme/transfers

In [5]:
# Reading CSV into DataFrame
ec_2019 = pd.read_csv('english_championship_2019.csv') 

In [6]:
# Quick look at a small segment of data
ec_2019.head() 

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
0,Cardiff City,Robert Glatzel,25,Centre-Forward,1.FC Heidenheim,£5.40m,in,5.4,Championship,2019,2019/2020
1,Cardiff City,Aden Flint,30,Centre-Back,Middlesbrough,£4.01m,in,4.01,Championship,2019,2019/2020
2,Cardiff City,Will Vaulks,25,Defensive Midfield,Rotherham,£2.12m,in,2.12,Championship,2019,2019/2020
3,Cardiff City,Gavin Whyte,23,Right Winger,Oxford United,£1.98m,in,1.98,Championship,2019,2019/2020
4,Cardiff City,Isaac Vassell,25,Centre-Forward,Birmingham,£1.98m,in,1.98,Championship,2019,2019/2020


In [7]:
ec_2019.columns 
# I'll keep all of these columns intact, as I think they'll be useful later

Index(['club_name', 'player_name', 'age', 'position', 'club_involved_name',
       'fee', 'transfer_movement', 'fee_cleaned', 'league_name', 'year',
       'season'],
      dtype='object')

In [8]:
ec_2019.dtypes 
#'Fee' is feature with object types, 'fee_cleaned' contains float types. 
# I will use 'fee_cleaned' for the best look at the 

club_name              object
player_name            object
age                     int64
position               object
club_involved_name     object
fee                    object
transfer_movement      object
fee_cleaned           float64
league_name            object
year                    int64
season                 object
dtype: object

In [9]:
ec_2019.info()
# 'Fee_cleaned' looks like it has some null values in it I'll deal with later.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 975 entries, 0 to 974
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   club_name           975 non-null    object 
 1   player_name         975 non-null    object 
 2   age                 975 non-null    int64  
 3   position            975 non-null    object 
 4   club_involved_name  975 non-null    object 
 5   fee                 975 non-null    object 
 6   transfer_movement   975 non-null    object 
 7   fee_cleaned         688 non-null    float64
 8   league_name         975 non-null    object 
 9   year                975 non-null    int64  
 10  season              975 non-null    object 
dtypes: float64(1), int64(2), object(8)
memory usage: 83.9+ KB


In [10]:
ec_2019.shape

(975, 11)

In [11]:
ec_2019.nunique()

club_name              24
player_name           645
age                    24
position               15
club_involved_name    304
fee                   127
transfer_movement       2
fee_cleaned            58
league_name             1
year                    1
season                  1
dtype: int64

In [12]:
# Looking at number of records per club
ec_2019.club_name.value_counts()

Bristol City            60
Huddersfield Town       55
Nottingham Forest       55
Reading FC              52
Leeds United            50
Charlton Athletic       45
Birmingham City         45
Stoke City              44
Queens Park Rangers     43
Swansea City            42
Wigan Athletic          42
Brentford FC            41
Hull City               40
Cardiff City            39
Derby County            38
Barnsley FC             38
Preston North End       35
Luton Town              35
Fulham FC               34
West Bromwich Albion    34
Sheffield Wednesday     31
Millwall FC             28
Middlesbrough FC        27
Blackburn Rovers        22
Name: club_name, dtype: int64

In [13]:
ec_2019.agg([min, max]).T
# A better look at the difference between 'fee' and 'fee_cleaned' data types
# Fee_cleaned is a float type, with 1.0 = £1mil.

Unnamed: 0,min,max
club_name,Barnsley FC,Wigan Athletic
player_name,Aapo Halme,Álvaro Giménez
age,16,40
position,Attacking Midfield,Second Striker
club_involved_name,1. FC Köln,Zulte Waregem
fee,-,£990Th.
transfer_movement,in,out
fee_cleaned,0,24.3
league_name,Championship,Championship
year,2019,2019


In [14]:
# It's pretty clear from the .info method that the only column with null values is 'fee_cleaned'
# I want to ensure that's the case, and I want to see exactly what proportion of 'fee_cleaned' needs to be handled
nulls = pd.DataFrame(ec_2019.isnull().sum().sort_values(ascending=False)/len(ec_2019), columns = ['percent'])
percent_null = nulls['percent'] > 0
nulls[percent_null]

Unnamed: 0,percent
fee_cleaned,0.294359


In [15]:
# I know from the data that 'fee_cleaned' null values are null because there was no fee associated with the transfer
# I'll fill with 0, and then later I'll check that the only values in 'fee_cleaned' are between 0 and some higher float.
ec_2019.fee_cleaned.fillna(0, inplace=True)

In [16]:
# Checking for duplicate rows.
duplicateRowsDF_2019 = ec_2019[ec_2019.duplicated()]
duplicateRowsDF_2019

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season


In [91]:
df1 = ec_2019.groupby(['year', 'club_name'], as_index=False).sum()
df1.head()

Unnamed: 0.1,year,club_name,Unnamed: 0,age,fee_cleaned,club_total
0,2019,Barnsley FC,34599,866,9.78,0.0
1,2019,Birmingham City,30105,1125,29.61,0.0
2,2019,Blackburn Rovers,13013,573,8.87,0.0
3,2019,Brentford FC,18614,912,67.16,0.0
4,2019,Bristol City,18390,1459,71.38,0.0


In [17]:
# Saving ec_2019 back to a csv with no nulls in 'fee_cleaned'
ec_2019.to_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2019_output.csv")

### III. Reading CSV into Dataframe
 - Transfer data for the 2018 season of the English Championship
 - Data source: https://github.com/ewenme/transfers

In [18]:
ec_2018 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/original_data_sources/english_championship_2018.csv")

In [19]:
ec_2018.head()

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
0,Swansea City,Bersant Celina,21,Attacking Midfield,Man City,£3.06m,in,3.06,Championship,2018,2018/2019
1,Swansea City,Joel Asoro,19,Centre-Forward,Sunderland,£2.07m,in,2.07,Championship,2018,2018/2019
2,Swansea City,Declan John,23,Left-Back,Rangers,£801k,in,0.801,Championship,2018,2018/2019
3,Swansea City,Barrie McKay,23,Left Winger,Nottm Forest,£504k,in,0.504,Championship,2018,2018/2019
4,Swansea City,Yan Dhanda,19,Left Midfield,Liverpool U23,Free Transfer,in,0.0,Championship,2018,2018/2019


In [20]:
ec_2018.columns

Index(['club_name', 'player_name', 'age', 'position', 'club_involved_name',
       'fee', 'transfer_movement', 'fee_cleaned', 'league_name', 'year',
       'season'],
      dtype='object')

In [21]:
ec_2018.dtypes

club_name              object
player_name            object
age                     int64
position               object
club_involved_name     object
fee                    object
transfer_movement      object
fee_cleaned           float64
league_name            object
year                    int64
season                 object
dtype: object

In [22]:
ec_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1105 entries, 0 to 1104
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   club_name           1105 non-null   object 
 1   player_name         1105 non-null   object 
 2   age                 1105 non-null   int64  
 3   position            1105 non-null   object 
 4   club_involved_name  1105 non-null   object 
 5   fee                 1105 non-null   object 
 6   transfer_movement   1105 non-null   object 
 7   fee_cleaned         1040 non-null   float64
 8   league_name         1105 non-null   object 
 9   year                1105 non-null   int64  
 10  season              1105 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 95.1+ KB


In [23]:
ec_2018.shape

(1105, 11)

In [24]:
ec_2018.nunique()

club_name              24
player_name           615
age                    23
position               15
club_involved_name    295
fee                   141
transfer_movement       2
fee_cleaned            77
league_name             1
year                    1
season                  1
dtype: int64

In [25]:
ec_2018.club_name.value_counts()

Nottingham Forest       74
Reading FC              58
Wigan Athletic          57
Sheffield United        55
Aston Villa             55
Leeds United            54
Ipswich Town            54
Middlesbrough FC        48
Norwich City            48
Preston North End       47
Derby County            46
Birmingham City         45
Millwall FC             45
Bolton Wanderers        44
Rotherham United        44
Stoke City              43
Bristol City            42
Queens Park Rangers     41
Blackburn Rovers        41
Swansea City            40
West Bromwich Albion    38
Brentford FC            32
Hull City               32
Sheffield Wednesday     22
Name: club_name, dtype: int64

In [26]:
ec_2018.agg([min, max]).T

Unnamed: 0,min,max
club_name,Aston Villa,Wigan Athletic
player_name,Aaron Drinan,Örjan Nyland
age,17,39
position,Attacking Midfield,Second Striker
club_involved_name,1. FC Köln,Östersund
fee,-,£990k
transfer_movement,in,out
fee_cleaned,0,22.5
league_name,Championship,Championship
year,2018,2018


In [27]:
nulls = pd.DataFrame(ec_2018.isnull().sum().sort_values(ascending=False)/len(ec_2018), columns = ['percent'])
percent_null = nulls['percent'] > 0
nulls[percent_null]

Unnamed: 0,percent
fee_cleaned,0.058824


In [28]:
ec_2018.fee_cleaned.fillna(0, inplace=True)

In [29]:
duplicateRowsDF_2018 = ec_2018[ec_2018.duplicated()]
duplicateRowsDF_2018
# Looks like we have a couple of duplicate rows
# I'll leave them alone for now, because I think they may reflect the same players going on loan twice in a season

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
262,Derby County,Max Lowe,21,Left-Back,Aberdeen FC,Loan,out,0.0,Championship,2018,2018/2019
775,Nottingham Forest,Zach Clough,23,Second Striker,Rochdale,Loan,out,0.0,Championship,2018,2018/2019


In [30]:
ec_2018.to_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2018_output.csv")

### IV. Reading CSV into Dataframe
 - Transfer data for the 2017 season of the English Championship
 - Data source: https://github.com/ewenme/transfers

In [31]:
ec_2017 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/original_data_sources/english_championship_2017.csv")

In [32]:
ec_2017.head()

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
0,Hull City,Kevin Stewart,23,Defensive Midfield,Liverpool,£4.05m,in,4.05,Championship,2017,2017/2018
1,Hull City,Nouha Dicko,25,Centre-Forward,Wolves,£3.42m,in,3.42,Championship,2017,2017/2018
2,Hull City,Jon Toral,22,Attacking Midfield,Arsenal U23,£2.97m,in,2.97,Championship,2017,2017/2018
3,Hull City,Stephen Kingsley,23,Left-Back,Swansea,£2.97m,in,2.97,Championship,2017,2017/2018
4,Hull City,Jackson Irvine,24,Defensive Midfield,Burton Albion,£1.94m,in,1.94,Championship,2017,2017/2018


In [33]:
ec_2017.columns

Index(['club_name', 'player_name', 'age', 'position', 'club_involved_name',
       'fee', 'transfer_movement', 'fee_cleaned', 'league_name', 'year',
       'season'],
      dtype='object')

In [34]:
ec_2017.dtypes

club_name              object
player_name            object
age                     int64
position               object
club_involved_name     object
fee                    object
transfer_movement      object
fee_cleaned           float64
league_name            object
year                    int64
season                 object
dtype: object

In [35]:
ec_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   club_name           1156 non-null   object 
 1   player_name         1156 non-null   object 
 2   age                 1156 non-null   int64  
 3   position            1156 non-null   object 
 4   club_involved_name  1156 non-null   object 
 5   fee                 1156 non-null   object 
 6   transfer_movement   1156 non-null   object 
 7   fee_cleaned         1089 non-null   float64
 8   league_name         1156 non-null   object 
 9   year                1156 non-null   int64  
 10  season              1156 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 99.5+ KB


In [36]:
ec_2017.shape

(1156, 11)

In [37]:
ec_2017.nunique()

club_name              24
player_name           688
age                    22
position               15
club_involved_name    319
fee                   144
transfer_movement       2
fee_cleaned            88
league_name             1
year                    1
season                  1
dtype: int64

In [38]:
ec_2017.club_name.value_counts()

Barnsley FC                71
Wolverhampton Wanderers    67
Nottingham Forest          64
Norwich City               62
Cardiff City               58
Leeds United               57
Birmingham City            56
Middlesbrough FC           55
Hull City                  53
Burton Albion              51
Millwall FC                50
Bristol City               47
Sunderland AFC             47
Fulham FC                  47
Sheffield United           45
Bolton Wanderers           44
Reading FC                 43
Ipswich Town               43
Queens Park Rangers        36
Aston Villa                36
Derby County               36
Preston North End          33
Sheffield Wednesday        28
Brentford FC               27
Name: club_name, dtype: int64

In [39]:
ec_2017.agg([min, max]).T

Unnamed: 0,min,max
club_name,Aston Villa,Wolverhampton Wanderers
player_name,Aapo Halme,Álvaro Negredo
age,17,38
position,Attacking Midfield,Second Striker
club_involved_name,1.FSV Mainz 05,Zulte Waregem
fee,-,£990k
transfer_movement,in,out
fee_cleaned,0,25.65
league_name,Championship,Championship
year,2017,2017


In [40]:
nulls = pd.DataFrame(ec_2017.isnull().sum().sort_values(ascending=False)/len(ec_2017), columns = ['percent'])
percent_null = nulls['percent'] > 0
nulls[percent_null]

Unnamed: 0,percent
fee_cleaned,0.057958


In [41]:
ec_2017.fee_cleaned.fillna(0, inplace=True)

In [42]:
duplicateRowsDF_2017 = ec_2017[ec_2017.duplicated()]
duplicateRowsDF_2017
# Same as 2018. I'll leave them for now and ensure later that they are 'double loans' in a season

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
481,Preston North End,Eoin Doyle,29,Centre-Forward,Oldham Athletic,Loan,out,0.0,Championship,2017,2017/2018
797,Bristol City,Max O'Leary,21,Goalkeeper,Solihull Moors,Loan,out,0.0,Championship,2017,2017/2018


In [43]:
ec_2017.to_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2017_output.csv")

### V. Reading CSV into Dataframe
 - Transfer data for the 2016 season of the English Championship
 - Data source: https://github.com/ewenme/transfers

In [44]:
ec_2016 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/original_data_sources/english_championship_2016.csv")

In [45]:
ec_2016.head()

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
0,Newcastle United,Dwight Gayle,25,Centre-Forward,Crystal Palace,£10.80m,in,10.8,Championship,2016,2016/2017
1,Newcastle United,Matt Ritchie,26,Right Midfield,Bournemouth,£10.80m,in,10.8,Championship,2016,2016/2017
2,Newcastle United,Matz Sels,24,Goalkeeper,KAA Gent,£5.94m,in,5.94,Championship,2016,2016/2017
3,Newcastle United,Grant Hanley,24,Centre-Back,Blackburn,£5.94m,in,5.94,Championship,2016,2016/2017
4,Newcastle United,Ciaran Clark,26,Centre-Back,Aston Villa,£5.40m,in,5.4,Championship,2016,2016/2017


In [46]:
ec_2016.columns

Index(['club_name', 'player_name', 'age', 'position', 'club_involved_name',
       'fee', 'transfer_movement', 'fee_cleaned', 'league_name', 'year',
       'season'],
      dtype='object')

In [47]:
ec_2016.dtypes

club_name              object
player_name            object
age                     int64
position               object
club_involved_name     object
fee                    object
transfer_movement      object
fee_cleaned           float64
league_name            object
year                    int64
season                 object
dtype: object

In [48]:
ec_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   club_name           1203 non-null   object 
 1   player_name         1203 non-null   object 
 2   age                 1203 non-null   int64  
 3   position            1203 non-null   object 
 4   club_involved_name  1203 non-null   object 
 5   fee                 1203 non-null   object 
 6   transfer_movement   1203 non-null   object 
 7   fee_cleaned         1117 non-null   float64
 8   league_name         1203 non-null   object 
 9   year                1203 non-null   int64  
 10  season              1203 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 103.5+ KB


In [49]:
ec_2016.shape

(1203, 11)

In [50]:
ec_2016.nunique()

club_name              24
player_name           725
age                    23
position               15
club_involved_name    341
fee                   153
transfer_movement       2
fee_cleaned            95
league_name             1
year                    1
season                  1
dtype: int64

In [51]:
ec_2016.club_name.value_counts()

Wigan Athletic             82
Barnsley FC                66
Fulham FC                  62
Reading FC                 60
Queens Park Rangers        60
Rotherham United           58
Cardiff City               58
Nottingham Forest          57
Birmingham City            57
Burton Albion              57
Wolverhampton Wanderers    54
Bristol City               52
Aston Villa                49
Newcastle United           49
Leeds United               45
Blackburn Rovers           44
Brighton & Hove Albion     42
Huddersfield Town          42
Sheffield Wednesday        41
Preston North End          39
Norwich City               36
Derby County               36
Ipswich Town               33
Brentford FC               24
Name: club_name, dtype: int64

In [52]:
ec_2016.agg([min, max]).T

Unnamed: 0,min,max
club_name,Aston Villa,Wolverhampton Wanderers
player_name,Aaron Collins,Álex López
age,16,38
position,Attacking Midfield,Second Striker
club_involved_name,1.FC K'lautern,York City
fee,-,£900k
transfer_movement,in,out
fee_cleaned,0,31.5
league_name,Championship,Championship
year,2016,2016


In [53]:
nulls = pd.DataFrame(ec_2016.isnull().sum().sort_values(ascending=False)/len(ec_2016), columns = ['percent'])
percent_null = nulls['percent'] > 0
nulls[percent_null]

Unnamed: 0,percent
fee_cleaned,0.071488


In [54]:
ec_2016.fee_cleaned.fillna(0, inplace=True)

In [55]:
duplicateRowsDF_2016 = ec_2016[ec_2016.duplicated()]
duplicateRowsDF_2016

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
634,Blackburn Rovers,Marvin Emnes,28,Centre-Forward,Swansea,Loan,in,0.0,Championship,2016,2016/2017


In [56]:
ec_2016.to_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2016_output.csv")

In [57]:
os.listdir("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs")

['english_championship_2019_output.csv',
 'elc_spi_output.csv',
 'english_championship_2018_output.csv',
 'english_championship_2017_output.csv',
 'english_championship_2016_output.csv']

### VI. Reading SPI CSV into Dataframe
 - Match data from 2016 through 2020 used to determine Soccer Power Index (SPI) rankings
 - Data sourced from: https://github.com/fivethirtyeight/data/tree/master/soccer-spi

In [58]:
os.chdir("/home/tdraths/sb_assignments/Transfers_Capstone/data/original_data_sources")

In [59]:
spi_df = pd.read_csv('spi_matches.csv')
spi_df.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,...,,,2.0,0.0,,,,,,
1,2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,...,,,2.0,0.0,,,,,,
2,2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,...,,,1.0,1.0,,,,,,
3,2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,...,,,0.0,0.0,,,,,,
4,2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,...,,,1.0,2.0,,,,,,


 - Reading SPI csv into DataFrame
 - It contains match-specific data for every major soccer league in the world, going back to 2016
 - I'm only jusing English Championship data for now
 - I can imagine some usefulness in comparing EC transfer spending with PremLeague/LaLiga/etc spending

In [60]:
spi_df.loc[spi_df['league'] == 'English League Championship'].head() 
#English League Championship ID is 2412

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
2992,2017,2017-08-04,2412,English League Championship,Sunderland,Derby County,50.39,40.83,0.5266,0.2184,...,,,1.0,1.0,2.24,1.23,1.92,1.38,1.05,1.05
2994,2017,2017-08-04,2412,English League Championship,Nottingham Forest,Millwall,35.55,28.23,0.5149,0.2186,...,,,1.0,0.0,0.45,3.49,1.26,2.73,1.05,0.0
3004,2017,2017-08-05,2412,English League Championship,Sheffield United,Brentford,27.72,39.7,0.3031,0.4486,...,,,1.0,0.0,0.72,1.84,0.97,1.43,1.05,0.0
3005,2017,2017-08-05,2412,English League Championship,Queens Park Rangers,Reading,36.33,34.9,0.442,0.2823,...,,,2.0,0.0,2.15,0.29,1.27,0.51,2.1,0.0
3006,2017,2017-08-05,2412,English League Championship,Fulham,Norwich City,43.0,42.6,0.4434,0.3142,...,,,1.0,1.0,1.19,1.71,2.35,1.88,1.05,1.05


In [61]:
#Filtering spi_df for "English League Championship"
elc = spi_df[spi_df.league.eq('English League Championship')]
elc.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
2992,2017,2017-08-04,2412,English League Championship,Sunderland,Derby County,50.39,40.83,0.5266,0.2184,...,,,1.0,1.0,2.24,1.23,1.92,1.38,1.05,1.05
2994,2017,2017-08-04,2412,English League Championship,Nottingham Forest,Millwall,35.55,28.23,0.5149,0.2186,...,,,1.0,0.0,0.45,3.49,1.26,2.73,1.05,0.0
3004,2017,2017-08-05,2412,English League Championship,Sheffield United,Brentford,27.72,39.7,0.3031,0.4486,...,,,1.0,0.0,0.72,1.84,0.97,1.43,1.05,0.0
3005,2017,2017-08-05,2412,English League Championship,Queens Park Rangers,Reading,36.33,34.9,0.442,0.2823,...,,,2.0,0.0,2.15,0.29,1.27,0.51,2.1,0.0
3006,2017,2017-08-05,2412,English League Championship,Fulham,Norwich City,43.0,42.6,0.4434,0.3142,...,,,1.0,1.0,1.19,1.71,2.35,1.88,1.05,1.05


In [62]:
elc.columns
# I am not concerned with the match-specific columns toward the last half of this list.
# I will likely only keep features from 'season' through 'spi2'

Index(['season', 'date', 'league_id', 'league', 'team1', 'team2', 'spi1',
       'spi2', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2',
       'importance1', 'importance2', 'score1', 'score2', 'xg1', 'xg2', 'nsxg1',
       'nsxg2', 'adj_score1', 'adj_score2'],
      dtype='object')

In [63]:
elc.dtypes

season           int64
date            object
league_id        int64
league          object
team1           object
team2           object
spi1           float64
spi2           float64
prob1          float64
prob2          float64
probtie        float64
proj_score1    float64
proj_score2    float64
importance1    float64
importance2    float64
score1         float64
score2         float64
xg1            float64
xg2            float64
nsxg1          float64
nsxg2          float64
adj_score1     float64
adj_score2     float64
dtype: object

In [64]:
elc_nulls = pd.DataFrame(elc.isnull().sum().sort_values(ascending=False)/len(elc), columns = ['percent'])
elc_percent_null = elc_nulls['percent'] > 0
elc_nulls[elc_percent_null]
# High percentages of null values, but only in the columns that I am unlikely to need.

Unnamed: 0,percent
importance2,0.363473
importance1,0.363473
adj_score2,0.238866
nsxg2,0.238866
nsxg1,0.238866
xg2,0.238866
xg1,0.238866
adj_score1,0.238866
score2,0.237517
score1,0.237517


In [65]:
# Dropping those unnecessary columns. I'm only focusing on the actual SPI ranking for clubs
# The match specific data isn't important for my questions.
columns = ['importance2', 'importance1', 'adj_score2', 'nsxg2', 'nsxg1', 'xg2', 'xg1', 'adj_score1', 'score2', 'score1', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2']
elc.drop(columns, inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [66]:
elc.head()
# Having a look at the new DF before saving it back to a .csv

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2
2992,2017,2017-08-04,2412,English League Championship,Sunderland,Derby County,50.39,40.83
2994,2017,2017-08-04,2412,English League Championship,Nottingham Forest,Millwall,35.55,28.23
3004,2017,2017-08-05,2412,English League Championship,Sheffield United,Brentford,27.72,39.7
3005,2017,2017-08-05,2412,English League Championship,Queens Park Rangers,Reading,36.33,34.9
3006,2017,2017-08-05,2412,English League Championship,Fulham,Norwich City,43.0,42.6


In [67]:
elc.to_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/elc_spi_output.csv")

### Using CONCAT to  combine clean .cvs Files

In [68]:
ec_2019 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2019_output.csv")
ec_2018 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2018_output.csv")
ec_2017 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2017_output.csv")
ec_2016 = pd.read_csv("/home/tdraths/sb_assignments/Transfers_Capstone/data/data_cleaning_outputs/english_championship_2016_output.csv")

In [69]:
ec_all_years = pd.concat([ec_2016, ec_2017, ec_2018, ec_2019])

In [70]:
ec_all_years.columns

Index(['Unnamed: 0', 'club_name', 'player_name', 'age', 'position',
       'club_involved_name', 'fee', 'transfer_movement', 'fee_cleaned',
       'league_name', 'year', 'season'],
      dtype='object')

In [71]:
ec_all_years.dtypes

Unnamed: 0              int64
club_name              object
player_name            object
age                     int64
position               object
club_involved_name     object
fee                    object
transfer_movement      object
fee_cleaned           float64
league_name            object
year                    int64
season                 object
dtype: object

In [72]:
ec_all_years.sort_values('club_name')

Unnamed: 0.1,Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
173,173,Aston Villa,Tammy Abraham,21,Centre-Forward,Chelsea,"End of loanMay 31, 2019",out,0.0,Championship,2018,2018/2019
104,104,Aston Villa,Ross McCormack,30,Second Striker,Nottm Forest,"End of loanMay 31, 2017",in,0.0,Championship,2016,2016/2017
105,105,Aston Villa,José Crespo,29,Centre-Back,Rayo Vallecano,"End of loanJun 30, 2016",in,0.0,Championship,2016,2016/2017
106,106,Aston Villa,Callum Robinson,21,Left Winger,Preston NE,"End of loanJun 30, 2016",in,0.0,Championship,2016,2016/2017
107,107,Aston Villa,Janoi Donacien,22,Centre-Back,Newport County,"End of loanJun 30, 2016",in,0.0,Championship,2016,2016/2017
...,...,...,...,...,...,...,...,...,...,...,...,...
698,698,Wolverhampton Wanderers,Jonathan Flatt,22,Goalkeeper,Cheltenham Town,Loan,out,0.0,Championship,2017,2017/2018
699,699,Wolverhampton Wanderers,Prince Oniangué,29,Defensive Midfield,SCO Angers,Loan,out,0.0,Championship,2017,2017/2018
700,700,Wolverhampton Wanderers,Michal Zyro,25,Right Winger,Charlton,Loan,out,0.0,Championship,2017,2017/2018
694,694,Wolverhampton Wanderers,Jack Ruddy,19,Goalkeeper,Oldham Athletic,Loan,out,0.0,Championship,2017,2017/2018


In [73]:
newcastle = ec_all_years[ec_all_years['club_name'] == 'Newcastle United']
newcastle.head()

Unnamed: 0.1,Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,fee_cleaned,league_name,year,season
0,0,Newcastle United,Dwight Gayle,25,Centre-Forward,Crystal Palace,£10.80m,in,10.8,Championship,2016,2016/2017
1,1,Newcastle United,Matt Ritchie,26,Right Midfield,Bournemouth,£10.80m,in,10.8,Championship,2016,2016/2017
2,2,Newcastle United,Matz Sels,24,Goalkeeper,KAA Gent,£5.94m,in,5.94,Championship,2016,2016/2017
3,3,Newcastle United,Grant Hanley,24,Centre-Back,Blackburn,£5.94m,in,5.94,Championship,2016,2016/2017
4,4,Newcastle United,Ciaran Clark,26,Centre-Back,Aston Villa,£5.40m,in,5.4,Championship,2016,2016/2017


In [84]:
ec_all_years.nunique()

Unnamed: 0            1203
club_name               35
player_name           1753
age                     25
position                15
club_involved_name     602
fee                    436
transfer_movement        2
fee_cleaned            198
league_name              1
year                     4
season                   4
dtype: int64

In [86]:
ec_all_years['club_name'].unique()

array(['Newcastle United', 'Norwich City', 'Aston Villa',
       'Brighton & Hove Albion', 'Derby County', 'Sheffield Wednesday',
       'Ipswich Town', 'Cardiff City', 'Brentford FC', 'Birmingham City',
       'Preston North End', 'Queens Park Rangers', 'Leeds United',
       'Wolverhampton Wanderers', 'Blackburn Rovers', 'Nottingham Forest',
       'Reading FC', 'Bristol City', 'Huddersfield Town', 'Fulham FC',
       'Rotherham United', 'Wigan Athletic', 'Burton Albion',
       'Barnsley FC', 'Hull City', 'Middlesbrough FC', 'Sunderland AFC',
       'Sheffield United', 'Bolton Wanderers', 'Millwall FC',
       'Swansea City', 'Stoke City', 'West Bromwich Albion', 'Luton Town',
       'Charlton Athletic'], dtype=object)