In [1]:
import datetime
import time
import re
import requests as r
import pandas as pd
import numpy as np
import lxml
import sys
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
from selenium.webdriver.common.by import By


In [2]:
date_id = 'id13295'  # first date 31 12 1992
fifa_url = 'https://www.fifa.com/fifa-world-ranking/men?dateId='

#Below code for women
#date_id = 'ranking_20210625'  
#fifa_url = 'https://www.fifa.com/fifa-world-ranking/women?dateId='

Collection of dates when ratings were created

In [3]:
def get_dates_html():
    page_source = r.get(f'{fifa_url}{date_id}')
    dates = re.search(r"dates\"\:\[(.+?)\]\,\"selectedDate", page_source.text).group(1)
    return dates


def create_dates_dataset(html_dates):
    dataset = pd.DataFrame({'date':re.findall(r"text\"\:\"(.+?)\"", html_dates),
                            'date_id':re.findall(r"id\"\:\"(.+?)\"", html_dates)})
    
    # convert 'date' from str to datetime and sorting "old -> new"
    dataset['date'] = pd.to_datetime(dataset['date'], format='%d %b %Y')
    dataset.sort_values('date', ignore_index=True, inplace=True)
    assert dataset.date.min() == dataset.iloc[0].date, \
            "Incorrect dataset sorting"
    
    return dataset

In [4]:
#Check the web output
dates_from_page = get_dates_html()
#dates_from_page

In [5]:
dates_dataset = create_dates_dataset(dates_from_page)
dates_dataset

Unnamed: 0,date,date_id
0,1992-12-31,id1
1,1993-08-08,id2
2,1993-09-23,id3
3,1993-10-22,id4
4,1993-11-19,id5
...,...,...
307,2020-11-26,id13113
308,2020-12-10,id13127
309,2021-02-18,id13197
310,2021-04-07,id13245


In [6]:
print(f'Last date: {dates_dataset.date.max().date()}')

Last date: 2021-05-27


Web Scraping

In [7]:
df_url=dates_dataset
df_url['url']=fifa_url+dates_dataset['date_id']
#check url
df_url.iloc[1,2]

'https://www.fifa.com/fifa-world-ranking/men?dateId=id2'

In [8]:
df_url

Unnamed: 0,date,date_id,url
0,1992-12-31,id1,https://www.fifa.com/fifa-world-ranking/men?da...
1,1993-08-08,id2,https://www.fifa.com/fifa-world-ranking/men?da...
2,1993-09-23,id3,https://www.fifa.com/fifa-world-ranking/men?da...
3,1993-10-22,id4,https://www.fifa.com/fifa-world-ranking/men?da...
4,1993-11-19,id5,https://www.fifa.com/fifa-world-ranking/men?da...
...,...,...,...
307,2020-11-26,id13113,https://www.fifa.com/fifa-world-ranking/men?da...
308,2020-12-10,id13127,https://www.fifa.com/fifa-world-ranking/men?da...
309,2021-02-18,id13197,https://www.fifa.com/fifa-world-ranking/men?da...
310,2021-04-07,id13245,https://www.fifa.com/fifa-world-ranking/men?da...


In [10]:
chromeOptions = webdriver.ChromeOptions() 
chromeOptions.add_argument("--no-sandbox") 
chromeOptions.add_argument("--headless") 
chromeOptions.add_argument("--disable-dev-shm-usage") 
driver = webdriver.Chrome(r'~\chromedriver.exe',options=chromeOptions)

In [13]:
#set new df 
full_df = []

#set progress bar
toolbar_width = 40
sys.stdout.write("[%s]" % (" " * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['

#for some reasons (might be due to the web itself) men ranking not working 
# properly with driver.get on first several urls
# thus start from 60, for women okay to use range(len(df_url))
# both for men and women data need to check if the latest ranking was used at some point, then manually fix
for i in range(60,len(df_url)):
    driver.get(df_url.loc[i,'url'])
    time.sleep(2)
    pg=driver.page_source
    html_df=pd.read_html(pg)
    df=html_df[0][['RK','Team','Total PointsPTS']]
    df['date']=df_url.loc[i,'date']
    #append to new data frame
    full_df.append(df)
    # update the bar
    sys.stdout.write("-")
    sys.stdout.flush()

sys.stdout.write("]\n") # this ends the progress bar

full_df = pd.concat(full_df)
print(full_df)

[                                        -

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------]
    RK                    Team  Total PointsPTS       date
0    1               BrazilBRA              817 1999-05-19
1    2               FranceFRA              785 1999-05-19
2    3              GermanyGER              742 1999-05-19
3    4                ItalyITA              740 1999-05-19
4    5       Czech RepublicCZE              736 1999-05-19
..  ..                     ...              ...        ...
45  46                EgyptEGY             1433 2021-05-27
46  47  Republic of IrelandIRL             1427 2021-05-27
47  48     Northern IrelandNIR             1426 2021-05-27
48  49                GhanaGHA             1425 2021-05-27
49  50           Costa RicaCRC             1423 2021-05-27

[12600 rows x 4 columns]


In [15]:
#Write to CSV

full_df.to_csv(r'~\FIFA\fifa_men.csv', index=False)
#full_df.to_csv(r'~\FIFA\fifa_women.csv', index=False)

In [14]:
full_df

Unnamed: 0,RK,Team,Total PointsPTS,date
0,1,BrazilBRA,817,1999-05-19
1,2,FranceFRA,785,1999-05-19
2,3,GermanyGER,742,1999-05-19
3,4,ItalyITA,740,1999-05-19
4,5,Czech RepublicCZE,736,1999-05-19
...,...,...,...,...
45,46,EgyptEGY,1433,2021-05-27
46,47,Republic of IrelandIRL,1427,2021-05-27
47,48,Northern IrelandNIR,1426,2021-05-27
48,49,GhanaGHA,1425,2021-05-27


Manually fix

In [24]:
#Manually input the urls that were not correctly pulled
driver.get('https://www.fifa.com/fifa-world-ranking/men?dateId=id12406')

In [25]:
test=driver.page_source
test2=pd.read_html(test)


In [26]:
df=test2[0][['RK','Team','Total PointsPTS']]
df

Unnamed: 0,RK,Team,Total PointsPTS
0,1,USAUSA,2114
1,2,GermanyGER,2052
2,3,EnglandENG,2033
3,4,AustraliaAUS,2030
4,5,CanadaCAN,2023
5,6,FranceFRA,2019
6,7,NetherlandsNED,1972
7,8,BrazilBRA,1968
8,9,JapanJPN,1967
9,10,SwedenSWE,1955


In [18]:
#Check content
el = driver.find_element_by_tag_name('table')
el.text

'RK\nTeam\nPTS\n+/-\n1\nBEL\n1727\n0\n2\nFRA\n1726\n0\n3\nBRA\n1676\n0\n4\nCRO\n1634\n0\n5\nENG\n1631\n0\n6\nPOR\n1614\n0\n7\nURU\n1609\n0\n8\nSUI\n1599\n0\n9\nESP\n1591\n0\n10\nDEN\n1589\n0\n11\nARG\n1582\n0\n12\nCOL\n1575\n0\n13\nCHI\n1565\n0\n14\nSWE\n1560\n0\n14\nNED\n1560\n0\n16\nGER\n1558\n0\n17\nMEX\n1540\n0\n18\nITA\n1539\n0\n19\nWAL\n1525\n0\n20\nPOL\n1518\n0\n20\nPER\n1518\n0\n22\nIRN\n1516\n35\n23\nAUT\n1509\n0\n24\nSEN\n1505\n0\n25\nUSA\n1501\n4\n25\nROU\n1501\n0\n27\nJPN\n1495\n81\n28\nTUN\n1493\n0\n29\nSVK\n1483\n0\n30\nUKR\n1482\n0\n31\nSRB\n1481\n0\n32\nVEN\n1478\n0\n33\nPAR\n1476\n0\n34\nIRL\n1474\n0\n35\nBIH\n1472\n0\n36\nNIR\n1465\n0\n37\nCRC\n1461\n-3\n38\nKOR\n1451\n46\n38\nISL\n1451\n-1\n40\nSCO\n1446\n0\n41\nTUR\n1443\n0\n42\nAUS\n1441\n5\n43\nMAR\n1440\n0\n44\nCZE\n1435\n0\n45\nGRE\n1428\n0\n46\nNGA\n1427\n0\n46\nMNE\n1427\n0\n48\nBUL\n1425\n0\n48\nNOR\n1425\n0\n50\nRUS\n1424\n0'