# The Sofifa Scraper

In [7]:
import numpy as np
import pandas as pd
import calendar
import matplotlib.pyplot as plt
import math
from datetime import datetime

In [8]:
# Notebook-wide display settings.
# The exact registered option names are used: abbreviated or pattern-like keys
# (such as 'display.max.columns') only resolve through pandas' pattern matching
# and can become ambiguous if similarly named options are added.
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)
# Render floats with two decimal places
pd.set_option("display.float_format", lambda x: "%.2f" % x )

## Load Dataset

In [9]:
# Load the dataset from the CSV file under ./results (relative path)
df = pd.read_csv("./results/player_dataset.csv")

In [10]:
# Preview the first two rows of the original dataset
df.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


### Staging dataset

The staging dataset is used so that the original dataset is not altered until I am fully done.

In [109]:
# Work on a copy so that the loaded `df` is left untouched by the cleaning steps
df1 = df.copy()

In [110]:
# Preview the first two rows of the staging copy
df1.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


## Overview

In [111]:
# First two rows of the staging frame, as the starting point of the overview
df1.head(2)

Unnamed: 0,name,fullname,playerData,playerPos,playerCountry,overallRating,potential,playerValue,playerWage,preferedFoot,releaseClause,clubName,league,clubKitNumber,contractStart,contractEnd,nationalTeam,countryKitNumber,profileImg,playerLink
0,K. Mbappé,Kylian Mbappé Lottin,"['26', 'Dec', '20', '1998', '182cm', '6\'0""', '81kg', '179lbs']",ST LW LM,France,91,94,€173.5M,€610K,Right,€368.7M,Real Madrid,La Liga,10.0,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,/player/231747/kylian-mbappe/260006/
1,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,"['33', 'Jun', '15', '1992', '175cm', '5\'9""', '72kg', '159lbs']",RM RW,Egypt,91,91,€82M,€370K,Left,€151.7M,Liverpool,Premier League,11.0,"Jul 1, 2017",2027,Liverpool,11,https://cdn.sofifa.net/players/209/331/26_360.png,/player/209331/mohamed-salah/260006/


In [112]:
# (rows, columns) of the staging frame
df1.shape

(1500, 20)

In [113]:
# Column dtypes before any cleaning is applied
df1.dtypes

name                 object
fullname             object
playerData           object
playerPos            object
playerCountry        object
overallRating         int64
potential             int64
playerValue          object
playerWage           object
preferedFoot         object
releaseClause        object
clubName             object
league               object
clubKitNumber       float64
contractStart        object
contractEnd          object
nationalTeam         object
countryKitNumber      int64
profileImg           object
playerLink           object
dtype: object

## Data Cleaning

In [114]:
# Site root; it is prefixed to the relative PlayerLink values further down
base_url = "https://sofifa.com"

In [115]:
# Extracting the playerID from the playerLink.
# After trimming the outer slashes and splitting on "/", the id is the second
# segment (e.g. "/player/231747/kylian-mbappe/260006/" -> "231747").
df1['playerID'] = (
    df1['playerLink']
    .str.strip("/")
    .str.split("/")
    .str[1]
    .astype('int32')
)

In [116]:
# Cleaning the name and the fullname: trim leading/trailing whitespace in both columns
df1[['name', 'fullname']] = df1[['name', 'fullname']].apply(lambda col: col.str.strip())

In [117]:
# Cleaning the playerData
# The scraped value is the text of a list, e.g. "['26', 'Dec', '20', ...]".
# Strip the square brackets and single quotes in a single pass; the pattern is
# an explicit regex character class, so the result does not depend on the
# pandas-version-specific default of `regex`.
df1['playerData'] = df1['playerData'].str.replace(r"[\[\]']", "", regex=True)
# Split the remaining comma-separated text into one column per field
df1[['Age', 'BirthMonth', 'BirthDay', 'BirthYear', 'Height CM', 'Height Foot Inches', 'Weight KG', 'Weight Pounds']] = df1['playerData'].str.split(", ", expand = True)


In [118]:
# Rearranging: keep the listed columns in this order.
# 'playerData' and 'Height Foot Inches' are not listed, so they are dropped here.
df1 = df1.loc[:, [
    'playerID', 'name', 'fullname',
    'Age', 'BirthMonth', 'BirthDay', 'BirthYear',
    'Height CM', 'Weight KG', 'Weight Pounds',
    'preferedFoot', 'playerPos', 'playerCountry',
    'overallRating', 'potential',
    'playerValue', 'playerWage', 'releaseClause',
    'clubName', 'league', 'clubKitNumber',
    'contractStart', 'contractEnd',
    'nationalTeam', 'countryKitNumber',
    'profileImg', 'playerLink',
]]



In [120]:
# Merging the BirthDay, BirthMonth, BirthYear columns into one Birthdate column,
# joined with "/" in day/month/year order
df1['Birthdate'] = df1['BirthDay'].str.cat([df1['BirthMonth'], df1['BirthYear']], sep="/")

In [121]:
# Rearranging columns: Birthdate takes the place of the three separate
# BirthMonth / BirthDay / BirthYear columns, which are not listed and so are dropped.
df1 = df1.loc[:, [
    'playerID', 'name', 'fullname', 'Age', 'Birthdate',
    'Height CM', 'Weight KG', 'Weight Pounds',
    'preferedFoot', 'playerPos', 'playerCountry',
    'overallRating', 'potential',
    'playerValue', 'playerWage', 'releaseClause',
    'clubName', 'league', 'clubKitNumber',
    'contractStart', 'contractEnd',
    'nationalTeam', 'countryKitNumber',
    'profileImg', 'playerLink',
]]

In [122]:
# Columns to Title_Snake case: title-case every label, then turn spaces into underscores
df1.columns = [label.title().replace(" ", "_") for label in df1.columns]

In [123]:
# Renaming the title-cased columns to a more standard format.
# Columns that are not listed here keep their current names.
df1 = df1.rename(
    columns=dict(
        Playerid='PlayerID',
        Weight_Kg='Weight_KG',
        Height_Cm='Height_CM',
        Overallrating='OverallRating',
        Playerpos='Position',
        Playervalue='PlayerValue_EUR',
        Playerwage='PlayerWage_EUR',
        Releaseclause='ReleaseClause_EUR',
        Clubname='ClubName',
        Clubkitnumber='ClubKitNumber',
        Contractstart='ContractStart',
        Contractend='ContractEnd',
        Playercountry='Nationality',
        Nationalteam='NationalTeam',
        Countrykitnumber='NationalTeamKitNumber',
        Profileimg='ProfileImg',
        Playerlink='PlayerLink',
    )
)


In [124]:
# Standardizing the Birthdate to datetime dtype.
# The strings were assembled as day/month-abbreviation/year (e.g. "20/Dec/1998"),
# so the format is stated explicitly instead of being left to inference.
df1['Birthdate'] = pd.to_datetime(df1['Birthdate'], format="%d/%b/%Y")

# The age column to int type (skipped when the column is not object dtype,
# e.g. when this cell is run a second time)
if df1['Age'].dtype == 'O': 
    df1['Age'] = df1['Age'].astype('int64')

# Cleaning the height column: drop the literal "cm" suffix, then cast to int
if df1['Height_CM'].dtype == 'O': 
    df1['Height_CM'] = df1['Height_CM'].str.replace("cm", "", regex=False)
    df1['Height_CM'] = df1['Height_CM'].astype('int64')

# Cleaning the weight column: drop the literal "kg" suffix, then cast to int
if df1['Weight_KG'].dtype == 'O': 
    df1['Weight_KG'] = df1['Weight_KG'].str.replace("kg", "", regex=False)
    df1['Weight_KG'] = df1['Weight_KG'].astype('int64')

# Cleaning the overallRating column (only cast when it is still object dtype)
if df1['OverallRating'].dtype == 'O': 
    df1['OverallRating'] = df1['OverallRating'].astype('int64')

# Cleaning the potential column (only cast when it is still object dtype)
if df1['Potential'].dtype == 'O': 
    df1['Potential'] = df1['Potential'].astype('int64')

In [125]:
#Function cleanUpAmountCol - cleans up one amount column in place
#param: dfr - dataframe holding the column
#param: colName - column name
def cleanUpAmountCol(dfr, colName):
    """Convert a scraped money column (e.g. "€173.5M", "€610K") to float, in place.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Frame that holds the column; the column is replaced on this frame.
    colName : str
        Name of the amount column to clean.

    Returns
    -------
    None
        The frame is modified in place. A column that is neither object nor
        string dtype is left untouched, so calling this twice is harmless.

    Raises
    ------
    ValueError
        If a value is not a number after the currency symbol, spaces and the
        K/M suffix have been removed.
    """
    column = dfr[colName]
    # Only text columns need cleaning; this also covers pandas' dedicated string dtype.
    if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
        return

    multipliers = {"K": 1_000, "M": 1_000_000}

    def _to_amount(raw):
        # Missing values stay missing instead of breaking the conversion.
        if pd.isna(raw):
            return float("nan")
        # Removing the currency symbol and any space characters
        text = str(raw).replace("€", "").replace(" ", "")
        # Some amount cells contain the string 'Acceleration type Controlled lengthy'
        # instead of a number; those are converted to 0.
        if "Acceleration" in text:
            return 0.0
        # A trailing K or M scales the number by 1,000 or 1,000,000.
        factor = multipliers.get(text[-1:], 1)
        if factor != 1:
            text = text[:-1]
        # Rounding removes binary floating-point artefacts from the multiplication.
        return round(float(text) * factor, 2)

    # Convert the amount column to float
    dfr[colName] = column.map(_to_amount).astype(float)

In [126]:
# Cleaning the amount columns, one call per money column
for amount_col in ("PlayerValue_EUR", "PlayerWage_EUR", "ReleaseClause_EUR"):
    cleanUpAmountCol(df1, amount_col)

In [127]:
# Cleaning the club kit number but leaving it as string.
# fillna returns a new Series, so its result must be kept; otherwise missing
# kit numbers end up as the text "nan" instead of "0".
# The split on "." drops the ".0" left over from the float representation.
df1['ClubKitNumber'] = df1['ClubKitNumber'].fillna(0).apply(lambda x: str(x).split(".")[0])

In [128]:
# ContractStart and ContractEnd are supposed to be date columns, but they hold
# a mix of strings, bare year integers and full dates, so every value in both
# columns is converted to its string form instead.
df1[['ContractStart', 'ContractEnd']] = df1[['ContractStart', 'ContractEnd']].apply(lambda col: col.map(str))

In [129]:
# The national team is replaced by the player's nationality when it is empty
# (missing or blank) or when it holds the club name instead of a country.
df1['NationalTeam'] = df1['NationalTeam'].mask(
    df1['NationalTeam'].fillna("").astype(str).str.strip().eq("")
    | df1['NationalTeam'].eq(df1['ClubName']),
    df1['Nationality'],
)

In [130]:
# Cleaning the national team kit number: store every value as its string form
df1['NationalTeamKitNumber'] = df1['NationalTeamKitNumber'].map(str)

In [131]:
# Add the base url to the PlayerLink.
# Links that already start with the base url are left alone, so running this
# cell more than once does not prepend the prefix repeatedly.
df1['PlayerLink'] = df1['PlayerLink'].apply(
    lambda link: link if str(link).startswith(base_url) else f"{base_url}{link}"
)

## Saving result

In [136]:
# Preview the first ten rows of the cleaned frame
df1.head(10)

Unnamed: 0,PlayerID,Name,Fullname,Age,Birthdate,Height_CM,Weight_KG,Weight_Pounds,Preferedfoot,Position,Nationality,OverallRating,Potential,PlayerValue_EUR,PlayerWage_EUR,ReleaseClause_EUR,ClubName,League,ClubKitNumber,ContractStart,ContractEnd,NationalTeam,NationalTeamKitNumber,ProfileImg,PlayerLink
0,231747,K. Mbappé,Kylian Mbappé Lottin,26,1998-12-20,182,81,179lbs,Right,ST LW LM,France,91,94,173500000.0,610000.0,368700000.0,Real Madrid,La Liga,10,"Jul 1, 2024",2029,France,10,https://cdn.sofifa.net/players/231/747/26_360.png,https://sofifa.com/player/231747/kylian-mbappe/260006/
1,209331,M. Salah,Mohamed Salah Hamed Ghalyمحمد صلاح,33,1992-06-15,175,72,159lbs,Left,RM RW,Egypt,91,91,82000000.0,370000.0,151700000.0,Liverpool,Premier League,11,"Jul 1, 2017",2027,Egypt,11,https://cdn.sofifa.net/players/209/331/26_360.png,https://sofifa.com/player/209331/mohamed-salah/260006/
2,252371,J. Bellingham,Jude Victor William Bellingham,22,2003-06-29,186,75,165lbs,Right,CAM CM,England,90,94,174500000.0,320000.0,370800000.0,Real Madrid,La Liga,5,"Jul 1, 2023",2029,England,10,https://cdn.sofifa.net/players/252/371/26_360.png,https://sofifa.com/player/252371/jude-bellingham/260006/
3,239085,E. Haaland,Erling Braut Håland,24,2000-07-21,195,94,207lbs,Left,ST,Norway,90,92,157000000.0,260000.0,302200000.0,Manchester City,Premier League,9,"Jul 1, 2022",2034,Norway,9,https://cdn.sofifa.net/players/239/085/26_360.png,https://sofifa.com/player/239085/erling-haaland/260006/
4,231866,Rodri,Rodrigo Hernández Cascante,29,1996-06-22,190,82,181lbs,Right,CDM CM,Spain,90,90,102000000.0,270000.0,188700000.0,Manchester City,Premier League,16,"Jul 4, 2019",2027,Spain,16,https://cdn.sofifa.net/players/231/866/26_360.png,https://sofifa.com/player/231866/rodrigo-hernandez-cascante/260006/
5,231443,O. Dembélé,Masour Ousmane Dembélé,28,1997-05-15,178,67,148lbs,Left,ST RW CAM,France,90,90,122500000.0,220000.0,226600000.0,Paris Saint-Germain,Ligue 1,10,"Aug 12, 2023",2028,France,7,https://cdn.sofifa.net/players/231/443/26_360.png,https://sofifa.com/player/231443/ousmane-dembele/260006/
6,203376,V. van Dijk,Virgil van Dijk,33,1991-07-08,193,92,203lbs,Right,CB,Netherlands,90,90,57000000.0,230000.0,105500000.0,Liverpool,Premier League,4,"Jan 1, 2018",2027,Netherlands,4,https://cdn.sofifa.net/players/203/376/26_360.png,https://sofifa.com/player/203376/virgil-van-dijk/260006/
7,277643,Lamine Yamal,Lamine Yamal Nasraoui Ebanaلامين يامال نصراوي إبانا,17,2007-07-13,180,72,159lbs,Left,RM RW,Spain,89,95,147000000.0,100000.0,330800000.0,FC Barcelona,La Liga,10,"Jul 1, 2022",2031,Spain,19,https://cdn.sofifa.net/players/277/643/26_360.png,https://sofifa.com/player/277643/lamine-yamal-nasraoui-ebana/260006/
8,256630,F. Wirtz,Florian Richard Wirtz,22,2003-05-03,177,71,157lbs,Right,CAM ST CM,Germany,89,93,150500000.0,190000.0,289700000.0,Liverpool,Premier League,7,"Jun 20, 2025",2030,Germany,17,https://cdn.sofifa.net/players/256/630/26_360.png,https://sofifa.com/player/256630/florian-wirtz/260006/
9,255253,Vitinha,Vítor Machado Ferreira,25,2000-02-13,172,64,141lbs,Right,CM CDM CAM,Portugal,89,91,128500000.0,170000.0,247400000.0,Paris Saint-Germain,Ligue 1,17,"Jun 30, 2022",2029,Portugal,23,https://cdn.sofifa.net/players/255/253/26_360.png,https://sofifa.com/player/255253/vitor-machado-ferreira/260006/


In [133]:
# Column dtypes of the cleaned frame
df1.dtypes

PlayerID                          int32
Name                             object
Fullname                         object
Age                               int64
Birthdate                datetime64[ns]
Height_CM                         int64
Weight_KG                         int64
Weight_Pounds                    object
Preferedfoot                     object
Position                         object
Nationality                      object
OverallRating                     int64
Potential                         int64
PlayerValue_EUR                 float64
PlayerWage_EUR                  float64
ReleaseClause_EUR               float64
ClubName                         object
League                           object
ClubKitNumber                    object
ContractStart                    object
ContractEnd                      object
NationalTeam                     object
NationalTeamKitNumber            object
ProfileImg                       object
PlayerLink                       object


In [135]:
# Save the cleaned frame to CSV; index=False keeps the row index out of the file
df1.to_csv("./results/player_dataset_clean.csv", index = False)