In [1]:
# import modules
import time
from io import StringIO

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

In [2]:
# Scrape the "stats_standard" table from the FBref Premier League 2024-2025
# stats page with headless Chrome, then save the raw table to CSV.

# Set up headless browser
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

# try/finally guarantees the Chrome process is shut down even when the page
# load or the page-source read raises.
try:
    # Load the page
    url = "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats"
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    # Scroll to the Player Standard Stats table
    try:
        table_element = driver.find_element(By.ID, "stats_standard")
        ActionChains(driver).move_to_element(table_element).perform()
        time.sleep(3)  # let JS populate the table
    except Exception as e:
        # Deliberately best-effort: a failed scroll is reported, and the
        # "table is None" check below decides whether the scrape succeeded.
        print("Could not scroll to table:", e)

    # Get page source
    html = driver.page_source
finally:
    driver.quit()

# Parse the correct table
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", {"id": "stats_standard"})
if table is None:
    raise Exception("Player Standard Stats table not found in page source!")

# Convert to DataFrame. The markup is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# Save to CSV
df.to_csv("player_standard_stats_pl_2024_25.csv", index=False)
print("✓ Player Standard Stats saved to 'player_standard_stats_pl_2024_25.csv'")


✓ Player Standard Stats saved to 'player_standard_stats_pl_2024_25.csv'


In [3]:
# Read the CSV written by the scraping cell back into `df`
raw_csv_path = 'player_standard_stats_pl_2024_25.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first five rows of the freshly loaded table
df.head(n=5)

Unnamed: 0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time.1,Playing Time.2,...,Per 90 Minutes.1,Per 90 Minutes.2,Per 90 Minutes.3,Per 90 Minutes.4,Per 90 Minutes.5,Per 90 Minutes.6,Per 90 Minutes.7,Per 90 Minutes.8,Per 90 Minutes.9,Unnamed: 36_level_0
0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
1,1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Matches
2,2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,...,0.00,0.00,0.00,0.00,0.12,0.00,0.12,0.12,0.12,Matches
3,3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,...,0.14,0.14,0.00,0.14,0.07,0.05,0.12,0.07,0.12,Matches
4,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,...,0.06,0.13,0.06,0.13,0.06,0.01,0.07,0.06,0.07,Matches


The original table had two levels of header rows, so I will need to clean that up.

In [5]:
# Row 0 holds the second header level (the real column names)
df.iloc[0]

Unnamed: 0_level_0           Rk
Unnamed: 1_level_0       Player
Unnamed: 2_level_0       Nation
Unnamed: 3_level_0          Pos
Unnamed: 4_level_0        Squad
Unnamed: 5_level_0          Age
Unnamed: 6_level_0         Born
Playing Time                 MP
Playing Time.1           Starts
Playing Time.2              Min
Playing Time.3              90s
Performance                 Gls
Performance.1               Ast
Performance.2               G+A
Performance.3              G-PK
Performance.4                PK
Performance.5             PKatt
Performance.6              CrdY
Performance.7              CrdR
Expected                     xG
Expected.1                 npxG
Expected.2                  xAG
Expected.3             npxG+xAG
Progression                PrgC
Progression.1              PrgP
Progression.2              PrgR
Per 90 Minutes              Gls
Per 90 Minutes.1            Ast
Per 90 Minutes.2            G+A
Per 90 Minutes.3           G-PK
Per 90 Minutes.4         G+A-PK
Per 90 M

Starting with `Rk` and `Matches` in row 0: neither column holds useful information. `Rk` is just the player's row number when the table is sorted alphabetically, and `Matches` is a hyperlink on the webpage to each player's individual match statistics. Both will be removed.

I'll also remove the per 90 minute stats to make renaming the columns easier, then add them back in later as calculated fields.

In [6]:
# 'Rk' (first column) and 'Matches' (last column) carry no usable data
non_stat_columns = ['Unnamed: 0_level_0', 'Unnamed: 36_level_0']

# 'Per 90 Minutes', 'Per 90 Minutes.1', ..., 'Per 90 Minutes.9'
per_90_columns = ['Per 90 Minutes'] + [f'Per 90 Minutes.{suffix}' for suffix in range(1, 10)]

df.drop(columns=non_stat_columns + per_90_columns, inplace=True)
df.iloc[0]

Unnamed: 1_level_0      Player
Unnamed: 2_level_0      Nation
Unnamed: 3_level_0         Pos
Unnamed: 4_level_0       Squad
Unnamed: 5_level_0         Age
Unnamed: 6_level_0        Born
Playing Time                MP
Playing Time.1          Starts
Playing Time.2             Min
Playing Time.3             90s
Performance                Gls
Performance.1              Ast
Performance.2              G+A
Performance.3             G-PK
Performance.4               PK
Performance.5            PKatt
Performance.6             CrdY
Performance.7             CrdR
Expected                    xG
Expected.1                npxG
Expected.2                 xAG
Expected.3            npxG+xAG
Progression               PrgC
Progression.1             PrgP
Progression.2             PrgR
Name: 0, dtype: object

In [7]:
# Preview the table after removing the unused columns
df.head(n=5)

Unnamed: 0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time.1,Playing Time.2,Playing Time.3,...,Performance.5,Performance.6,Performance.7,Expected,Expected.1,Expected.2,Expected.3,Progression,Progression.1,Progression.2
0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,1.0,...,0,0,0,0.0,0.0,0.0,0.0,1,8,3
2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,1.9,...,0,1,0,0.2,0.2,0.0,0.2,0,8,0
3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,21.8,...,0,7,0,1.6,1.6,1.0,2.6,14,76,10
4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,15.7,...,0,4,0,0.9,0.9,0.2,1.2,5,42,1


In [8]:
# Promote row 0 (the second header level) to be the column names.
# A single positional assignment replaces 25 separate in-place renames and
# cannot misfire if a new label happens to equal a not-yet-renamed old label.
# .tolist() keeps the resulting columns Index unnamed, as before.
column_names = df.iloc[0].tolist()
df.columns = column_names

In [9]:
# Preview the table with the new column names
df.head(n=5)

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,1.0,...,0,0,0,0.0,0.0,0.0,0.0,1,8,3
2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,1.9,...,0,1,0,0.2,0.2,0.0,0.2,0,8,0
3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,21.8,...,0,7,0,1.6,1.6,1.0,2.6,14,76,10
4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,15.7,...,0,4,0,0.9,0.9,0.2,1.2,5,42,1


In [10]:
# remove the row labelled 0: its values are now the column names
df.drop(labels=0, axis='index', inplace=True)

In [11]:
# Preview the table now that the duplicate header row is gone
df.head(n=5)

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,1.0,...,0,0,0,0.0,0.0,0.0,0.0,1,8,3
2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,1.9,...,0,1,0,0.2,0.2,0.0,0.2,0,8,0
3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,21.8,...,0,7,0,1.6,1.6,1.0,2.6,14,76,10
4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,15.7,...,0,4,0,0.9,0.9,0.2,1.2,5,42,1
5,Simon Adingra,ci CIV,"FW,MF",Brighton,22,2002,29,12,1097,12.2,...,0,0,0,2.5,2.5,2.5,4.9,50,18,136


### Data Cleaning

The `Player` column should have only unique players

In [12]:
# Number of distinct values in the Player column
df['Player'].nunique()

563

In [13]:
# (rows, columns) — compare the row count with the unique-player count
df.shape

(596, 25)

Let's see where the duplicates are

In [15]:
# How many names occur once, twice, ... in the Player column
appearances_per_player = df['Player'].value_counts()
appearances_per_player.value_counts()

count
1     550
2      12
22      1
Name: count, dtype: int64

Check to see what is repeated 22 times!

In [21]:
# Occurrences of each Player value, most frequent first
df['Player'].value_counts()

Player
Player                22
Reiss Nelson           2
Carlos Alcaraz         2
Joachim Andersen       2
Evan Ferguson          2
                      ..
Alejandro Garnacho     1
Andrés García          1
Idrissa Gana Gueye     1
Cody Gakpo             1
Martin Ødegaard        1
Name: count, Length: 563, dtype: int64

Since the table came from a webpage, it looks like the header row (starting with the `Player` label) is repeated at intervals throughout the table. Let's make sure.

In [22]:
# Rows whose Player value is the literal header text 'Player'
df[df['Player'].eq('Player')]

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
26,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
52,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
78,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
104,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
130,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
156,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
182,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
208,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
234,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
260,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR


So since these are all repeats of the column names, all of these rows will be removed.

In [23]:
# Find the repeated in-table header rows by content (Player == 'Player')
# rather than by a hard-coded list of index labels. This still works if the
# table grows or is re-scraped, and re-running the cell is a harmless no-op
# instead of a KeyError.
is_repeated_header = df['Player'] == 'Player'
rows_to_drop = df.index[is_repeated_header]

df.drop(index=rows_to_drop, inplace=True)

# Confirm that the 22-times-repeated 'Player' entry is gone
df['Player'].value_counts().value_counts()

count
1    550
2     12
Name: count, dtype: int64

Now, let's check to see the other duplicates

In [29]:
# Names that occur exactly twice, then every row belonging to those names
appearance_counts = df['Player'].value_counts()
players_twice = appearance_counts[appearance_counts.eq(2)].index
df.loc[df['Player'].isin(players_twice)]

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
13,Carlos Alcaraz,ar ARG,MF,Southampton,21,2002,1,0,10,0.1,...,0,0,0,0.1,0.1,0.1,0.2,4,6,1
14,Carlos Alcaraz,ar ARG,"FW,MF",Everton,21,2002,15,7,764,8.5,...,0,4,0,1.7,1.7,1.6,3.4,19,42,21
24,Joachim Andersen,dk DEN,DF,Crystal Palace,28,1996,1,1,90,1.0,...,0,1,0,0.0,0.0,0.0,0.0,1,4,1
25,Joachim Andersen,dk DEN,DF,Fulham,28,1996,29,29,2583,28.7,...,0,6,1,1.0,1.0,0.5,1.5,16,112,2
42,Jordan Ayew,gh GHA,"FW,MF",Crystal Palace,32,1991,1,0,21,0.2,...,0,1,0,0.5,0.5,0.0,0.5,0,0,1
43,Jordan Ayew,gh GHA,"FW,MF",Leicester City,32,1991,30,19,1638,18.2,...,1,5,0,3.2,2.4,1.1,3.5,49,46,86
95,Trevoh Chalobah,eng ENG,DF,Crystal Palace,25,1999,12,12,1060,11.8,...,0,1,0,1.2,1.2,0.4,1.6,11,34,6
96,Trevoh Chalobah,eng ENG,DF,Chelsea,25,1999,13,11,910,10.1,...,0,2,0,0.8,0.8,1.2,2.0,4,37,2
143,Axel Disasi,fr FRA,DF,Aston Villa,26,1998,7,5,490,5.4,...,0,2,0,0.1,0.1,0.4,0.5,10,16,5
144,Axel Disasi,fr FRA,DF,Chelsea,26,1998,6,4,364,4.0,...,0,1,0,0.3,0.3,0.1,0.4,6,16,2


These were all players who were transferred during the season to other Premier League clubs. I will not be removing or modifying their data and will allow the duplicate entries to exist.

`Nation` column modification

From the above tables, the nations are listed twice, so to get a better feel for each one, let's see how all of them are entered.

In [31]:
# Distinct raw Nation values, in order of first appearance
df['Nation'].unique()

array(['eng ENG', 'us USA', 'ci CIV', 'ng NGA', 'dz ALG', 'no NOR',
       'ch SUI', 'nl NED', 'ar ARG', 'br BRA', 'py PAR', nan, 'mx MEX',
       'fr FRA', 'dk DEN', 'jm JAM', 'es ESP', 'se SWE', 'gh GHA',
       'cm CMR', 'tr TUR', 'pl POL', 'de GER', 'uy URU', 'gw GNB',
       'ml MLI', 'nir NIR', 'cl CHI', 'wls WAL', 'al ALB', 'au AUS',
       'ec ECU', 'sct SCO', 'it ITA', 'pt POR', 'ie IRL', 'be BEL',
       'bd BAN', 'cz CZE', 'zm ZAM', 'at AUT', 'co COL', 'bf BFA',
       'ro ROU', 'sk SVK', 'jp JPN', 'sn SEN', 'hr CRO', 'iq IRQ',
       'kr KOR', 'ca CAN', 'hu HUN', 'ma MAR', 'uz UZB', 'ga GAB',
       'rs SRB', 'eg EGY', 'gr GRE', 'gm GAM', 'ua UKR', 'zw ZIM',
       'xk KVX', 'is ISL', 'ms MSR', 'cd COD', 'nz NZL'], dtype=object)

I'll modify each one to be the three letter abbreviation.

In [32]:
# Every raw value has the form '<lowercase code> <UPPERCASE code>' (e.g.
# 'eng ENG', 'nir NIR'); keep only the last whitespace-separated token.
# This replaces a hand-written 65-entry mapping that would silently leave any
# nation missing from the dict unconverted. Missing values stay NaN, and
# already-clean values such as 'ENG' pass through unchanged.
df['Nation'] = df['Nation'].str.split().str[-1]

In [34]:
# verify that every Nation value is now just the uppercase code
df['Nation'].unique()

array(['ENG', 'USA', 'CIV', 'NGA', 'ALG', 'NOR', 'SUI', 'NED', 'ARG',
       'BRA', 'PAR', nan, 'MEX', 'FRA', 'DEN', 'JAM', 'ESP', 'SWE', 'GHA',
       'CMR', 'TUR', 'POL', 'GER', 'URU', 'GNB', 'MLI', 'NIR', 'CHI',
       'WAL', 'ALB', 'AUS', 'ECU', 'SCO', 'ITA', 'POR', 'IRL', 'BEL',
       'BAN', 'CZE', 'ZAM', 'AUT', 'COL', 'BFA', 'ROU', 'SVK', 'JPN',
       'SEN', 'CRO', 'IRQ', 'KOR', 'CAN', 'HUN', 'MAR', 'UZB', 'GAB',
       'SRB', 'EGY', 'GRE', 'GAM', 'UKR', 'ZIM', 'KVX', 'ISL', 'MSR',
       'COD', 'NZL'], dtype=object)

In [44]:
# Show the current column labels
df.columns

Index(['Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts',
       'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY',
       'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR'],
      dtype='object')

`Pos` column - I'll modify this to be only the player's main position.

In [45]:
# Distinct position strings, in order of first appearance
df['Pos'].unique()

array(['DF', 'MF', 'FW,MF', 'GK', 'FW', 'FW,DF', 'DF,MF', 'MF,DF',
       'MF,FW', 'DF,FW'], dtype=object)

In [48]:
# Number of rows for each position string
df['Pos'].value_counts()

Pos
DF       186
MF       112
FW        85
FW,MF     60
GK        44
MF,FW     44
DF,MF     16
MF,DF     13
FW,DF      7
DF,FW      7
Name: count, dtype: int64

In [49]:
# Multi-position entries list the positions comma-separated (e.g. 'FW,MF');
# keep the first one. On the values present here this gives exactly the same
# result as the explicit mapping it replaces ('FW,MF' -> 'FW', 'MF,DF' -> 'MF',
# ...), and it also covers combinations the mapping did not list.
df['Pos'] = df['Pos'].str.split(',').str[0]
df['Pos'].value_counts()

Pos
DF    209
MF    169
FW    152
GK     44
Name: count, dtype: int64

Next let's look at `Squad`

In [51]:
# Distinct squad names, in order of first appearance
df['Squad'].unique()

array(['Bournemouth', 'Chelsea', 'Brighton', 'Wolves', 'Crystal Palace',
       "Nott'ham Forest", 'Brentford', 'Manchester City', 'Southampton',
       'Everton', 'Liverpool', 'Newcastle Utd', 'Leicester City',
       'West Ham', 'Manchester Utd', 'Fulham', 'Aston Villa', 'Tottenham',
       'Ipswich Town', 'Arsenal'], dtype=object)

The rest of the columns are statistical entries, so next let's check for missing values and determine how to handle them.

In [54]:
# Missing-value count per column
df.isnull().sum()

Player      0
Nation      4
Pos         0
Squad       0
Age         4
Born        4
MP          0
Starts      0
Min         0
90s         0
Gls         0
Ast         0
G+A         0
G-PK        0
PK          0
PKatt       0
CrdY        0
CrdR        0
xG          0
npxG        0
xAG         0
npxG+xAG    0
PrgC        0
PrgP        0
PrgR        0
dtype: int64

In [55]:
# Rows where Nation is missing
df[df['Nation'].isna()]

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR
18,Olabade Aluko,,DF,Leicester City,,,1,0,2,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
178,Jake Evans,,FW,Leicester City,,,4,0,24,0.3,...,0,0,0,0.0,0.0,0.0,0.0,0,0,1
341,Mateus Mane,,MF,Wolves,,,1,0,2,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
374,Jeremy Monga,,FW,Leicester City,,,7,0,112,1.2,...,0,0,0,0.2,0.2,0.0,0.2,6,3,10


Looks like the culprits for the missing values are these four players. I'll look up their information and fill it in.

In [56]:
# Manually fill Nation / Age / Born for the four players with missing values.
# The keys are the index labels shown in the missing-value listing above.

# Olabade Aluko
df.loc[18, 'Nation'] = 'ENG'
df.loc[18, 'Age'] = 18
df.loc[18, 'Born'] = 2006

# Jake Evans
df.loc[178, 'Nation'] = 'ENG'
df.loc[178, 'Age'] = 16
df.loc[178, 'Born'] = 2008

# Mateus Mane
df.loc[341, 'Nation'] = 'ENG'
df.loc[341, 'Age'] = 17
df.loc[341, 'Born'] = 2007

# Jeremy Monga
# Fixed: Age and Born were swapped (Age=2009, Born=15).
df.loc[374, 'Nation'] = 'ENG'
df.loc[374, 'Age'] = 15
df.loc[374, 'Born'] = 2009

In [57]:
# Re-check the missing-value count per column after the manual fill
df.isna().sum(axis=0)

Player      0
Nation      0
Pos         0
Squad       0
Age         0
Born        0
MP          0
Starts      0
Min         0
90s         0
Gls         0
Ast         0
G+A         0
G-PK        0
PK          0
PKatt       0
CrdY        0
CrdR        0
xG          0
npxG        0
xAG         0
npxG+xAG    0
PrgC        0
PrgP        0
PrgR        0
dtype: int64

Check to see that the data types are correct for each column

In [58]:
# Current dtype of every column
df.dtypes

Player      object
Nation      object
Pos         object
Squad       object
Age         object
Born        object
MP          object
Starts      object
Min         object
90s         object
Gls         object
Ast         object
G+A         object
G-PK        object
PK          object
PKatt       object
CrdY        object
CrdR        object
xG          object
npxG        object
xAG         object
npxG+xAG    object
PrgC        object
PrgP        object
PrgR        object
dtype: object

In [59]:
# Cast each column to a dtype matching its content. Columns are grouped by
# target dtype; 'Player' is not listed and therefore stays object.
category_columns = ['Nation', 'Pos', 'Squad']
integer_columns = ['Age', 'Born', 'MP', 'Starts', 'Gls', 'Ast', 'G+A', 'G-PK',
                   'PK', 'PKatt', 'CrdY', 'CrdR']
float_columns = ['Min', '90s', 'xG', 'npxG', 'xAG', 'npxG+xAG',
                 'PrgC', 'PrgP', 'PrgR']

target_dtypes = {
    **dict.fromkeys(category_columns, 'category'),
    **dict.fromkeys(integer_columns, 'int64'),
    **dict.fromkeys(float_columns, 'float64'),
}
df = df.astype(target_dtypes)

df.dtypes

Player        object
Nation      category
Pos         category
Squad       category
Age            int64
Born           int64
MP             int64
Starts         int64
Min          float64
90s          float64
Gls            int64
Ast            int64
G+A            int64
G-PK           int64
PK             int64
PKatt          int64
CrdY           int64
CrdR           int64
xG           float64
npxG         float64
xAG          float64
npxG+xAG     float64
PrgC         float64
PrgP         float64
PrgR         float64
dtype: object

Lastly, let's add back in the per 90 minute statistics

In [62]:
# Re-create the per-90-minute statistics as calculated fields.

# Some players have a '90s' value of 0.0 (only a couple of minutes played).
# Dividing by it would give inf or NaN depending on the numerator, so those
# denominators are set to NaN and the per-90 values are consistently missing.
nineties_played = df['90s'].replace(0, np.nan)


def per_90(season_total):
    """Return `season_total` per 90 minutes played, rounded to 3 decimals."""
    return (season_total / nineties_played).round(3)


df['Gls_90'] = per_90(df['Gls'])
df['Ast_90'] = per_90(df['Ast'])
df['G+A_90'] = per_90(df['G+A'])
df['G-PK_90'] = per_90(df['G-PK'])
df['G+A-PK_90'] = per_90(df['G+A'] - df['PK'])
df['xG_90'] = per_90(df['xG'])
df['xAG_90'] = per_90(df['xAG'])
# Built from the season totals rather than by adding two already-rounded
# per-90 columns, which compounds the rounding error.
df['xG+xAG_90'] = per_90(df['xG'] + df['xAG'])
df['npxG_90'] = per_90(df['npxG'])
# Fixed: this column was a copy of xG+xAG_90; it now uses the non-penalty
# total 'npxG+xAG'. The column name is unchanged so downstream readers of the
# CSV keep working.
df['npXG+xAG_90'] = per_90(df['npxG+xAG'])

In [63]:
# Confirm the dtypes, including the newly added per-90 columns
df.dtypes

Player           object
Nation         category
Pos            category
Squad          category
Age               int64
Born              int64
MP                int64
Starts            int64
Min             float64
90s             float64
Gls               int64
Ast               int64
G+A               int64
G-PK              int64
PK                int64
PKatt             int64
CrdY              int64
CrdR              int64
xG              float64
npxG            float64
xAG             float64
npxG+xAG        float64
PrgC            float64
PrgP            float64
PrgR            float64
Gls_90          float64
Ast_90          float64
G+A_90          float64
G-PK_90         float64
G+A-PK_90       float64
xG_90           float64
xAG_90          float64
xG+xAG_90       float64
npxG_90         float64
npXG+xAG_90     float64
dtype: object

Save the cleaned data set to csv for future use

In [64]:
# Write the cleaned table to disk without the index column
cleaned_csv_path = 'epl_players_24_25_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)