In [2]:
import os

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ipynb.fs.full.xscores_selenium_functions import scrape_xscores_completed

---
### This script is designed to scrape xscores.com for soccer match results from the 2002-2003 season through the most recently completed season, 2021-2022.
---
##### This scrape will produce roughly 50,000 rows of data from over 200 URLs, so it will take a while.
- This script will output files containing snippets of the total data along the way. This is for two reasons: first, the scraping can take hours, so if the script gets interrupted for whatever reason, the data that has already been scraped will not be lost and the user can resume where the script left off; second, I can't guarantee that xscores.com won't change their URL formats, and being able to easily see which seasons were successfully scraped makes debugging easier. This data will be stored in `Data/Data By Season/<country>/<competition>_games_<year>.csv`, where the country, competition and year correspond to the data being scraped.
- If you want to do the scrape in pieces, you just have to edit the for loops to select which years and countries you wish to scrape. 
- Cup and league games must be scraped separately because their data is stored under different URL formats.

In [3]:
#Leagues of interest: one row per league. The country and league_name values
#are the path segments used when building the xscores results URLs.
league_names = pd.DataFrame(
    [
        ('spain', 'primera-division'),
        ('england', 'premier-league'),
        ('germany', 'bundesliga'),
        ('france', 'ligue-1'),
        ('italy', 'serie-a'),
    ],
    columns=['country', 'league_name'],
)
league_names

Unnamed: 0,country,league_name
0,spain,primera-division
1,england,premier-league
2,germany,bundesliga
3,france,ligue-1
4,italy,serie-a


In [4]:
#Cups of interest: one row per cup, written as (country, league_name, rounds).
#rounds holds the stage names of that cup, in playing order; each one is
#appended to the cup's URL path when scraping.
#note: xscores labels each country's major domestic cup as fa-cup
cup_names = pd.DataFrame(
    [
        ('europe-uefa', 'uefa-champions-league',
         ['first-group-stage', 'second-group-stage', 'group-stage', 'round-16',
          'quarter-finals', 'semi-finals', 'final']),
        ('europe-uefa', 'uefa-europa-league',
         ['group-stage', 'round-32', 'round-16', 'quarter-finals', 'semi-finals', 'final']),
        ('spain', 'fa-cup',
         ['first-round', 'second-round', 'third-round', 'fourth-round', 'round-32',
          'round-16', 'quarter-finals', 'semi-finals', 'final']),
        ('england', 'fa-cup',
         ['first-round', 'second-round', 'third-round', 'fourth-round', 'fifth-round',
          'quarter-finals', 'semi-finals', 'final']),
        ('germany', 'fa-cup',
         ['first-round', 'second-round', 'third-round', 'round-16', 'quarter-finals',
          'semi-finals', 'final']),
        ('france', 'fa-cup',
         ['eighth-round', 'round-64', 'round-32', 'round-16', 'quarter-finals',
          'semi-finals', 'final']),
        ('italy', 'fa-cup',
         ['first-round', 'second-round', 'third-round', 'fourth-round', 'round-16',
          'quarter-finals', 'semi-finals', 'final']),
    ],
    columns=['country', 'league_name', 'rounds'],
)
cup_names

Unnamed: 0,country,league_name,rounds
0,europe-uefa,uefa-champions-league,"[first-group-stage, second-group-stage, group-..."
1,europe-uefa,uefa-europa-league,"[group-stage, round-32, round-16, quarter-fina..."
2,spain,fa-cup,"[first-round, second-round, third-round, fourt..."
3,england,fa-cup,"[first-round, second-round, third-round, fourt..."
4,germany,fa-cup,"[first-round, second-round, third-round, round..."
5,france,fa-cup,"[eighth-round, round-64, round-32, round-16, q..."
6,italy,fa-cup,"[first-round, second-round, third-round, fourt..."


In [None]:
#path to Data-Pipeline-To-Tableau folder, to be used throughout script.
#Set the DATA_PIPELINE_DIR environment variable to point at your own copy of
#the project; when it is unset (or empty) the original location is used.
#os.path.join(..., '') guarantees a trailing separator, because the cells
#below build file paths by plain string concatenation (folder_loc + label).
folder_loc = os.path.join(
    os.environ.get('DATA_PIPELINE_DIR')
    or '/Users/willfitzhugh/Desktop/Coding/Data-Pipeline-To-Tableau/',
    '',
)

## Scraping Leagues

In [18]:
#example of url to be scraped: row 1 of league_names, 2021-2022 season

i = 1
country, league_name = league_names.iloc[i, 0], league_names.iloc[i, 1]
year = 2021
print(f'https://www.xscores.com/soccer/{country}/{league_name}/results/{year}-{year + 1}')

https://www.xscores.com/soccer/england/premier-league/results/2021-2022


In [5]:
#scrape and export leagues
#Writes one csv per league and season to
#  <folder_loc>Data/Data By Season/<country>/<league_name>_games_<year>.csv

years_list = range(2002,2022)   #first calendar year of each season, 2002 through 2021

for i in range(len(league_names)):     #cycle through each country, edit this loop to do the scrape in pieces
    country = league_names.iloc[i,0]
    league_name = league_names.iloc[i,1]

    for year in years_list:            #cycle through each year

        label = 'Data/Data By Season/' + country + '/' + league_name + '_games_' + str(year) + '.csv'

        #DataFrame.to_csv does not create missing folders; make sure the target
        #folder exists before spending time on the scrape
        os.makedirs(os.path.dirname(folder_loc + label), exist_ok=True)

        df = scrape_xscores_completed(year, country, league_name)
        df['season'] = year

        df.to_csv(folder_loc + label, index=False)

- Scraping 2020 & 2021 German league results separately because their URLs are formatted differently.

In [6]:
def _scrape_bundesliga(season):
    """Scrape one Bundesliga season via the 'bundesliga/1.-bundesliga' path,
    tag it with the season, write it to its per-season csv and return it."""
    games = scrape_xscores_completed(season, 'germany', 'bundesliga/1.-bundesliga')
    games['season'] = season
    games.to_csv(f'{folder_loc}Data/Data By Season/germany/bundesliga_games_{season}.csv', index=False)
    return games


germ20 = _scrape_bundesliga(2020)
germ21 = _scrape_bundesliga(2021)

---

## Scraping Cups

In [36]:
#example of url to be scraped: last round listed for row 6 of cup_names, 2002-2003 season

i = 6
country = cup_names.iloc[i, 0]
cup_name = '/'.join([cup_names.iloc[i, 1], cup_names['rounds'].iloc[i][-1]])
year = 2002
print(f'https://www.xscores.com/soccer/{country}/{cup_name}/results/{year}-{year + 1}')

https://www.xscores.com/soccer/italy/fa-cup/final/results/2002-2003


In [24]:
#scrape and export cups
#Writes one csv per cup and season to
#  <folder_loc>Data/Data By Season/<country>/<league_name>_games_<year>.csv
#with every stage of that season stacked and labelled in the 'Round' column.

years_list = range(2002,2021)   #the 2021 season is scraped in separate cells below

#column layout of the cup files; an empty frame with these columns is put first
#in each concat so that they lead the output and exist even if no stage has them
cup_columns = ['Round','Date', 'Time', 'Home_Team', 'Home_Score', 'Away_Score', 'Away_Team',
               'Home_Score_AET', 'Away_Score_AET', 'Home_Penalties', 'Away_Penalties',
               'Home_Points', 'Away_Points']

for i in range(len(cup_names)):     #cycle through each country, edit this loop to do scrape in pieces
    country = cup_names.iloc[i,0]
    cup = cup_names.iloc[i,1]
    rounds = cup_names.iloc[i,:]['rounds']

    for year in years_list:         #cycle through each year

        label = 'Data/Data By Season/' + country + '/' + cup + '_games_' + str(year) + '.csv'

        #DataFrame.to_csv does not create missing folders; make sure the target
        #folder exists before spending time on the scrape
        os.makedirs(os.path.dirname(folder_loc + label), exist_ok=True)

        #collect the per-stage frames and concatenate once, instead of
        #re-copying the growing frame on every stage
        stage_frames = [pd.DataFrame(columns=cup_columns)]

        for stage in rounds:   #cycle through each stage of the cup
            cup_stage = cup + '/' + stage

            df = scrape_xscores_completed(year, country, cup_stage)
            df['Round'] = stage
            stage_frames.append(df)

        data = pd.concat(stage_frames)
        data['season'] = year

        data.to_csv(folder_loc + label, index=False)


- Scraping 2021-2022 cup seasons separately because the xscores URLs are formatted differently than in other years.

In [25]:
#european cups
#For 2021 the two UEFA competitions (rows 0 and 1 of cup_names) are scraped
#from two pages each: 'group-stage' and 'final-stage'.

for i in range(0,2):

    #empty frame first so these columns lead the output, matching the files
    #written for the other seasons
    stage_frames = [pd.DataFrame(columns=['Round', 'Date', 'Time', 'Home_Team', 'Home_Score', 'Away_Score', 'Away_Team',
                                          'Home_Score_AET', 'Away_Score_AET', 'Home_Penalties', 'Away_Penalties',
                                          'Home_Points', 'Away_Points'])]

    for stage in ['group-stage','final-stage']:

        df = scrape_xscores_completed(2021, cup_names.iloc[i,0], (cup_names.iloc[i,1]+'/'+stage) )

        #every other cup season labels its rows with the stage they came from;
        #without this the 2021 rows have an empty 'Round'.
        #NOTE(review): guarded in case the scraper already returns a 'Round'
        #column for these pages - confirm against scrape_xscores_completed
        if 'Round' not in df.columns:
            df['Round'] = stage

        stage_frames.append(df)

    #stages are stacked in the order scraped, as in the main cup loop
    data = pd.concat(stage_frames)
    data['season'] = 2021

    label = 'Data/Data By Season/'+cup_names.iloc[i,0] + '/' + cup_names.iloc[i,1] + '_games_2021' + '.csv'

    data.to_csv( folder_loc+label, index=False)

In [None]:
#domestic cups
#Rows 2-6 of cup_names; for 2021 the cup is scraped without a per-round suffix.

for i in range(2,7):
    country, cup_slug = cup_names.iloc[i,0], cup_names.iloc[i,1]

    cup21 = scrape_xscores_completed(2021, country, cup_slug)
    cup21['season'] = 2021

    cup21.to_csv(f'{folder_loc}Data/Data By Season/{country}/fa-cup_games_2021.csv', index=False)

---
## Compile data for each year and competition into a single file.

In [5]:
#combine league and cup labels
#stack the (country, league_name) pairs of the leagues and of the cups;
#the index is not reset, so it restarts at 0 for the cup rows
data_labels = pd.concat(
    objs=[league_names, cup_names.loc[:, ['country', 'league_name']]],
    axis=0,
)
data_labels

Unnamed: 0,country,league_name
0,spain,primera-division
1,england,premier-league
2,germany,bundesliga
3,france,ligue-1
4,italy,serie-a
0,europe-uefa,uefa-champions-league
1,europe-uefa,uefa-europa-league
2,spain,fa-cup
3,england,fa-cup
4,germany,fa-cup


In [6]:
#Read every per-season csv listed in data_labels, tag it with its country and
#competition, and stack everything into Data/Past_20years_Data.csv.

#empty frame first so these columns lead the compiled file
season_frames = [pd.DataFrame(columns=['Round', 'Date', 'Time', 'Home_Team', 'Home_Score', 'Away_Score', 'Away_Team',
                                       'Home_Score_AET', 'Away_Score_AET', 'Home_Penalties', 'Away_Penalties',
                                       'Home_Points', 'Away_Points','season'])]
skipped_files = []

for i in range(len(data_labels)):
    country = data_labels.iloc[i,0]
    competition = data_labels.iloc[i,1]

    for year in range(2002, 2022):

        label = 'Data/Data By Season/'+country+'/'+competition+'_games_'+str(year)+'.csv'

        #some competitions don't have data for much earlier seasons (like the
        #europa league prior to 2009), so a missing or empty file is expected and
        #skipped; any other error is a real problem and is allowed to surface
        try:
            df_to_add = pd.read_csv(folder_loc+label)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            skipped_files.append(label)
            continue

        df_to_add['Country'] = country
        df_to_add['Competition'] = competition

        season_frames.append(df_to_add)

#concatenate once rather than re-copying the growing frame on every file
data = pd.concat(season_frames)

if skipped_files:
    print(str(len(skipped_files)) + ' season files were missing or empty and were skipped:')
    print('\n'.join(skipped_files))

data.to_csv( folder_loc+'Data/Past_20years_Data.csv', index = False)