# The Sofifa Scraper

In [31]:
import numpy as np
import pandas as pd
import calendar
import matplotlib.pyplot as plt
import math
from datetime import datetime

In [17]:
pd.set_option('display.max.columns', 30)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max.rows', 1000)
pd.set_option("display.float_format", lambda x: "%.2f" % x )

## Load Dataset

In [459]:
#Load the datset
df = pd.read_csv("./results/player_dataset.csv")

In [460]:
#Preview the original dataset
df.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


### Staging dataset

The staging dataset is use so as not to alter the original dataset till I full done

In [461]:
df1 = df.copy()

In [462]:
df1.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


## Overview

In [463]:
df1.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


In [464]:
df1.shape

(1500, 20)

In [465]:
df1.dtypes

name                 object
fullname             object
playerData           object
playerPos            object
playerCountry        object
overallRating         int64
potential             int64
playerValue          object
playerWage           object
preferedFoot         object
releaseClause        object
clubName             object
league               object
clubKitNumber       float64
contractStart        object
contractEnd          object
nationalTeam         object
countryKitNumber      int64
profileImg           object
playerLink           object
dtype: object

## Data Cleaning

In [490]:
base_url = "https://sofifa.com"

In [492]:
#Cleaning the name and the fullname
df1['Name'] = df1['Name'].str.strip()
df1['Fullname'] = df1['Fullname'].str.strip()

In [467]:
# Cleaning the playerData
df1['playerData'] = df1['playerData'].str.replace("[", "").str.replace("]", "").str.replace("'", "").str.replace("'", "")
df1[['Age', 'BirthMonth', 'BirthDay', 'BirthYear', 'Height CM', 'Height Foot Inches', 'Weight KG', 'Weight Pounds']] = df1['playerData'].str.split(", ", expand = True)
df1.head(1)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink,Age,BirthMonth,BirthDay,BirthYear,Height CM,Height Foot Inches,Weight KG,Weight Pounds
0,K. Mbappé,Kylian Mbappé Lottin,"26, Dec, 20, 1998, 182cm, 6\0"", 81kg, 179lbs",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/,26,Dec,20,1998,182cm,"6\0""",81kg,179lbs


In [423]:
df1.columns

Index(['name', 'fullname', 'playerData', 'playerPos', 'playerCountry',
       'overallRating', 'potential', 'playerValue', 'playerWage',
       'preferedFoot', 'releaseClause', 'clubName', 'league', 'clubKitNumber',
       'contractStart', 'contractEnd', 'nationalTeam', 'countryKitNumber',
       'profileImg', 'playerLink', 'Age', 'BirthMonth', 'BirthDay',
       'BirthYear', 'Height CM', 'Height Foot Inches', 'Weight KG',
       'Weight Pounds'],
      dtype='object')

In [469]:
#Rearranging 
df1 = df1[['name', 'fullname', 'Age', 'BirthMonth', 'BirthDay',
       'BirthYear', 'Height CM', 'Height Foot Inches', 'Weight KG',
       'Weight Pounds', 'playerPos', 
       'overallRating', 'potential', 'playerValue', 'playerWage',
       'releaseClause', 'clubName', 'league', 'clubKitNumber',
       'contractStart', 'contractEnd', 'playerCountry', 'nationalTeam', 'countryKitNumber',
       'profileImg', 'playerLink']]

df1.head(1)

Unnamed: 0,name,fullname,Age,BirthMonth,BirthDay,BirthYear,Height CM,Height Foot Inches,Weight KG,Weight Pounds,playerPos,overallRating,potential,playerValue,playerWage,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,playerCountry,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,26,Dec,20,1998,182cm,"6\0""",81kg,179lbs,ST LW LM,91,94,€173.5M,€610K,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/


In [470]:
df1.dtypes

name                   object
fullname               object
Age                    object
BirthMonth             object
BirthDay               object
BirthYear              object
Height CM              object
Height Foot Inches     object
Weight KG              object
Weight Pounds          object
playerPos              object
overallRating           int64
potential               int64
playerValue            object
playerWage             object
releaseClause          object
clubName               object
league                 object
clubKitNumber         float64
contractStart          object
contractEnd            object
playerCountry          object
nationalTeam           object
countryKitNumber        int64
profileImg             object
playerLink             object
dtype: object

In [471]:
# Merging the BirthMonth, BirthDay, BirthYear columns to one BirthDate Column
df1.loc[:, 'Birthdate'] = df1['BirthDay'] +"/"+ df1['BirthMonth'] +"/"+ df1['BirthYear']
df1.head(3)

Unnamed: 0,name,fullname,Age,BirthMonth,BirthDay,BirthYear,Height CM,Height Foot Inches,Weight KG,Weight Pounds,playerPos,overallRating,potential,playerValue,playerWage,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,playerCountry,nationalTeam,countryKitNumber,profileImg,playerLink,Birthdate
0,K. Mbappé,Kylian Mbappé Lottin,26,Dec,20,1998,182cm,"6\0""",81kg,179lbs,ST LW LM,91,94,€173.5M,€610K,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/,20/Dec/1998
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,33,Jun,15,1992,175cm,"5\9""",72kg,159lbs,RM RW,91,91,€82M,€370K,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Egypt,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/,15/Jun/1992
2,J. Bellingham,Jude Victor William Bellingham,22,Jun,29,2003,186cm,"6\1""",75kg,165lbs,CAM CM,90,94,€174.5M,€320K,€370.8M,Real Madrid,La Liga,5.0,"Jul 1, 2023",2029,England,England,10,https://cdn.sofifa.net/players/252/371/26_360.png,/player/252371/jude-bellingham/260006/,29/Jun/2003


In [472]:
df1.columns

Index(['name', 'fullname', 'Age', 'BirthMonth', 'BirthDay', 'BirthYear',
       'Height CM', 'Height Foot Inches', 'Weight KG', 'Weight Pounds',
       'playerPos', 'overallRating', 'potential', 'playerValue', 'playerWage',
       'releaseClause', 'clubName', 'league', 'clubKitNumber', 'contractStart',
       'contractEnd', 'playerCountry', 'nationalTeam', 'countryKitNumber',
       'profileImg', 'playerLink', 'Birthdate'],
      dtype='object')

In [473]:
#Rearranging Columns
df1 = df1[['name', 'fullname', 'Age', 'Birthdate',
       'Height CM', 'Weight KG', 
       'playerPos', 'overallRating', 'potential', 'playerValue', 'playerWage',
       'releaseClause', 'clubName', 'league', 'clubKitNumber', 'contractStart',
       'contractEnd', 'playerCountry', 'nationalTeam', 'countryKitNumber',
       'profileImg', 'playerLink']]

df1.head(3)

Unnamed: 0,name,fullname,Age,Birthdate,Height CM,Weight KG,playerPos,overallRating,potential,playerValue,playerWage,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,playerCountry,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,26,20/Dec/1998,182cm,81kg,ST LW LM,91,94,€173.5M,€610K,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,33,15/Jun/1992,175cm,72kg,RM RW,91,91,€82M,€370K,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Egypt,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/
2,J. Bellingham,Jude Victor William Bellingham,22,29/Jun/2003,186cm,75kg,CAM CM,90,94,€174.5M,€320K,€370.8M,Real Madrid,La Liga,5.0,"Jul 1, 2023",2029,England,England,10,https://cdn.sofifa.net/players/252/371/26_360.png,/player/252371/jude-bellingham/260006/


In [474]:
#Columns to Title_Snake case
df1.columns = df1.columns.str.title().str.replace(" ", "_")
df1.columns

Index(['Name', 'Fullname', 'Age', 'Birthdate', 'Height_Cm', 'Weight_Kg',
       'Playerpos', 'Overallrating', 'Potential', 'Playervalue', 'Playerwage',
       'Releaseclause', 'Clubname', 'League', 'Clubkitnumber', 'Contractstart',
       'Contractend', 'Playercountry', 'Nationalteam', 'Countrykitnumber',
       'Profileimg', 'Playerlink'],
      dtype='object')

In [479]:
# Renaming the column name to a more standard format
df1 = df1.rename (columns = {
    'Weight_Kg' : 'Weight_KG',
    'Height_Cm' : 'Height_CM',
    'Overallrating' : 'OverallRating',
    'Playerpos': 'Position',
    'Playervalue': 'PlayerValue_EUR',
    'Playerwage': 'PlayerWage_EUR',
    'Releaseclause': 'ReleaseClause_EUR',
    'Clubname': 'ClubName',
    'Clubkitnumber': 'ClubKitNumber',
    'Contractstart': 'ContractStart',
    'Contractend': 'ContractEnd',
    'Playercountry': 'Nationality',
    'Nationalteam': 'NationalTeam',
    'Countrykitnumber': 'NationalTeamKitNumber',
    'Profileimg': 'ProfileImg',
    'Playerlink': 'PlayerLink',
})

df1.head(3)

Unnamed: 0,Name,Fullname,Age,Birthdate,Height_CM,Weight_KG,Position,OverallRating,Potential,PlayerValue_EUR,PlayerWage_EUR,ReleaseClause_EUR,ClubName,League,ClubKitNumber,ContractStart,ContractEnd,Nationality,NationalTeam,NationalTeamKitNumber,ProfileImg,PlayerLink
0,K. Mbappé,Kylian Mbappé Lottin,26,1998-12-20,182,81,ST LW LM,91,94,€173.5M,€610K,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,33,1992-06-15,175,72,RM RW,91,91,€82M,€370K,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Egypt,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/
2,J. Bellingham,Jude Victor William Bellingham,22,2003-06-29,186,75,CAM CM,90,94,€174.5M,€320K,€370.8M,Real Madrid,La Liga,5.0,"Jul 1, 2023",2029,England,England,10,https://cdn.sofifa.net/players/252/371/26_360.png,/player/252371/jude-bellingham/260006/


In [478]:
# Standardizing the Birthdate to datetime dtype
df1['Birthdate'] = pd.to_datetime(df1['Birthdate'])

# The age column to int type
if df1['Age'].dtype == 'O': 
    df1['Age'] = df1['Age'].astype('int64')

# Cleaning the height column
if df1['Height_CM'].dtype == 'O': 
    df1['Height_CM'] = df1['Height_CM'].str.replace("cm", "")
    df1['Height_CM'] = df1['Height_CM'].astype('int64')

# Cleaning the weight column
if df1['Weight_KG'].dtype == 'O': 
    df1['Weight_KG'] = df1['Weight_KG'].str.replace("kg", "")
    df1['Weight_KG'] = df1['Weight_KG'].astype('int64')

# Cleaning the overallRating column
if df1['OverallRating'].dtype == 'O': 
    df1['OverallRating'] = df1['OverallRating'].astype('int64')

# Cleaning the potential column
if df1['Potential'].dtype == 'O': 
    df1['Potential'] = df1['Potential'].astype('int64')

In [482]:
#Function cleanUpAmountCol - to cleaning up the amount columns
#param: colName - column name
def cleanUpAmountCol(dfr, colName):
    if (dfr[colName].dtype == 'O'):
        # Removing the currency symbol
        dfr.loc[:,colName] = dfr[colName].str.replace("€", "")

        #Cleaning the space character
        dfr.loc[:,colName] = dfr[colName].str.replace(" ", "")
        
        #some of the amount columns contained this string 'Acceleration type Controlled lengthy'
        #this was converted to 0
        dfr.loc[dfr[colName].str.contains("Acceleration"),colName] = "0"
        
        #Converting a M * 1000000
        dfr.loc[dfr[colName].str.contains("M"),colName] = dfr.loc[dfr[colName].str.contains("M"),colName].str.replace("M", "").apply(lambda x: str(float(x) * 1000000))

        #Converting a K * 1000
        dfr.loc[dfr[colName].str.contains("K"),colName] = dfr.loc[dfr[colName].str.contains("K"),colName].str.replace("K", "").apply(lambda x: str(float(x) * 1000))

        # convert the amount column to float
        dfr[colName] = dfr[colName].astype(float)

In [483]:
# Cleaning the amount columns - PlayerValue_EUR, PlayerWage_EUR, ReleaseClause_EUR
cleanUpAmountCol(df1, "PlayerValue_EUR")
cleanUpAmountCol(df1, "PlayerWage_EUR")
cleanUpAmountCol(df1, "ReleaseClause_EUR")

In [487]:
# Cleaning the club kit number but leabing it as string
df1['ClubKitNumber'].fillna(0)
df1['ClubKitNumber'] = df1['ClubKitNumber'].apply(lambda x: str(x).split(".")[0])

In [485]:
# ContractStart, ContractEnd are suppose to be date columns
# but have mixed values of string, int represent year and date
#these columns were converted to string
df1['ContractStart'] = df1['ContractStart'].apply(str)
df1['ContractEnd'] = df1['ContractEnd'].apply(str)

In [486]:
# THe national team in case where it is empty or haas a club value was converted to the player nationality
df1.loc[:,'NationalTeam'] = np.where(df1['NationalTeam'] == df1['ClubName'], df1['Nationality'], df1['NationalTeam'])

In [489]:
# Cleaning the nation team kit number
df1['NationalTeamKitNumber'] = df1['NationalTeamKitNumber'].apply(str)

In [493]:
#Add the base url to the PlayerLink
df1.loc[:,'PlayerLink'] = df1['PlayerLink'].apply(lambda x: f"{base_url}{x}")

## Saving result

In [496]:
df1.head(3)

Unnamed: 0,Name,Fullname,Age,Birthdate,Height_CM,Weight_KG,Position,OverallRating,Potential,PlayerValue_EUR,PlayerWage_EUR,ReleaseClause_EUR,ClubName,League,ClubKitNumber,ContractStart,ContractEnd,Nationality,NationalTeam,NationalTeamKitNumber,ProfileImg,PlayerLink
0,K. Mbappé,Kylian Mbappé Lottin,26,1998-12-20,182,81,ST LW LM,91,94,173500000.0,610000.0,368700000.0,Real Madrid,La Liga,10,"Jul 1, 2024",2029,France,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,https://sofifa.comhttps://sofifa.com/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,33,1992-06-15,175,72,RM RW,91,91,82000000.0,370000.0,151700000.0,Liverpool,Premier League,11,"Jul 1, 2017",2027,Egypt,Egypt,11,https://cdn.sofifa.net/players/209/331/26_360.png,https://sofifa.comhttps://sofifa.com/player/209331/mohamed-salah/260006/
2,J. Bellingham,Jude Victor William Bellingham,22,2003-06-29,186,75,CAM CM,90,94,174500000.0,320000.0,370800000.0,Real Madrid,La Liga,5,"Jul 1, 2023",2029,England,England,10,https://cdn.sofifa.net/players/252/371/26_360.png,https://sofifa.comhttps://sofifa.com/player/252371/jude-bellingham/260006/


In [497]:
df1.dtypes

Name                             object
Fullname                         object
Age                               int64
Birthdate                datetime64[ns]
Height_CM                         int64
Weight_KG                         int64
Position                         object
OverallRating                     int64
Potential                         int64
PlayerValue_EUR                 float64
PlayerWage_EUR                  float64
ReleaseClause_EUR               float64
ClubName                         object
League                           object
ClubKitNumber                    object
ContractStart                    object
ContractEnd                      object
Nationality                      object
NationalTeam                     object
NationalTeamKitNumber            object
ProfileImg                       object
PlayerLink                       object
dtype: object

In [498]:
#Save to file
df1.to_csv("./results/player_dataset_clean.csv")