In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

---
### This function scrapes an xscores results page for the completed games played in a selected season for a selected country and league, and returns them as a dataframe.
---

In [1]:
# Path to the chromedriver executable. Change it to wherever chromedriver is saved on your
# machine (the value below points at a copy on the author's desktop).
# scrape_xscores_completed reads this when it starts the Chrome webdriver.
CD_path = '/Users/willfitzhugh/Desktop/chromedriver'

In [1]:
# Column order shared by the populated result frame and the empty "page did not load" frame.
_RESULT_COLUMNS = [
    'Round', 'Date', 'Time', 'Home_Team', 'Home_Score', 'Away_Score', 'Away_Team',
    'Home_Score_AET', 'Away_Score_AET', 'Home_Penalties', 'Away_Penalties',
    'Home_Points', 'Away_Points',
]


def _element_texts(driver, by, selector, skip=0):
    """Return the text of every element matching `selector`, dropping the first `skip` matches."""
    return [element.text for element in driver.find_elements(by, selector)][skip:]


def _repeat_headers(mixed_texts, header_texts):
    """Return, for each non-header entry of `mixed_texts`, the header that most recently preceded it.

    `mixed_texts` interleaves header rows (dates or round names) with one entry per game.
    Entries that appear before the first header are dropped. An input without any header
    yields an empty list.
    """
    headers = set(header_texts)
    current = None
    repeated = []
    for text in mixed_texts:
        if text in headers:
            current = text
        elif current is not None:
            repeated.append(current)
    return repeated


def _add_points(df):
    """Add 'Home_Points' and 'Away_Points' columns to `df` in place and return it.

    Games whose full-time scores are both numeric start at 1 point each. A home win
    (full time, extra time or penalties) then sets 3/0, and an away win sets 0/3 last.
    Games without numeric full-time scores keep NaN points.
    """
    def as_number(column):
        # Scores are scraped as text. Compare them as numbers so that '10' beats '9'.
        # Non-numeric text (empty cells, placeholders) becomes NaN, which compares False.
        return pd.to_numeric(df[column].astype(object), errors='coerce')

    home_ft, away_ft = as_number('Home_Score'), as_number('Away_Score')
    home_et, away_et = as_number('Home_Score_AET'), as_number('Away_Score_AET')
    home_pn, away_pn = as_number('Home_Penalties'), as_number('Away_Penalties')

    played = home_ft.notna() & away_ft.notna()
    home_win = (home_ft > away_ft) | (home_et > away_et) | (home_pn > away_pn)
    away_win = (home_ft < away_ft) | (home_et < away_et) | (home_pn < away_pn)

    df['Home_Points'] = float('nan')
    df['Away_Points'] = float('nan')
    # Later rules override earlier ones: default draw, then home win, then away win.
    for mask, home_points, away_points in (
        (played, 1, 1),
        (home_win, 3, 0),
        (away_win, 0, 3),
    ):
        df.loc[mask, 'Home_Points'] = home_points
        df.loc[mask, 'Away_Points'] = away_points
    return df


def scrape_xscores_completed(year, country, league_name):
    """Scrape the xscores results page of one league season into a DataFrame.

    Parameters
    ----------
    year : int
        Starting year of the season; the page for `year`-`year + 1` is requested.
    country : str
        Country segment of the xscores URL.
    league_name : str
        League segment of the xscores URL.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns listed in `_RESULT_COLUMNS`. Scores are kept as
        the scraped text. If no score element appears within 15 seconds, an empty frame
        with the same columns is returned.

    Raises
    ------
    Exception
        Errors raised while opening the page or while scraping/assembling the data
        propagate to the caller (for example, pandas raises ValueError when the scraped
        columns have different lengths). The browser is closed in every case.
    """
    # create xscores url string
    url = ('https://www.xscores.com/soccer/' + country + '/' + league_name
           + '/results/' + str(year) + '-' + str(year + 1))

    # initialize webdriver
    # NOTE(review): a positional driver path and `chrome_options=` are the older Selenium
    # calling convention; recent Selenium 4 releases expect `service=`/`options=` instead.
    # Confirm against the installed Selenium version.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(CD_path, chrome_options=chrome_options)

    try:
        driver.get(url)

        # wait up to 15 seconds for the score elements to appear
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.match_ft__home')))
        except Exception:
            # page did not load: nothing is scraped, an empty frame is returned
            return pd.DataFrame(columns=_RESULT_COLUMNS)

        # Home and away values alternate within each of these lists.
        scores = _element_texts(driver, By.CSS_SELECTOR, 'div.match_ft__home')
        penalties = _element_texts(driver, By.CSS_SELECTOR, 'div.match_pn__home')
        extra_time = _element_texts(driver, By.CSS_SELECTOR, 'div.match_et__home')

        # The first match of each of these selectors is skipped, as in the original code
        # (presumably a table header row; verify against the page markup).
        home_teams = _element_texts(driver, By.CSS_SELECTOR, 'div.match_home_txt', skip=1)
        away_teams = _element_texts(driver, By.CSS_SELECTOR, 'div.match_away_txt', skip=1)
        times = _element_texts(driver, By.CSS_SELECTOR, 'div.match_ko', skip=1)

        # Dates and round names appear once per group of games, so each one is repeated
        # for every kick-off time that follows it in document order.
        dates = _element_texts(driver, By.CSS_SELECTOR, 'div.round_date')
        dates_and_times = _element_texts(
            driver, By.XPATH,
            "//div[contains(@class, 'round_date') or contains(@class, 'match_ko')]",
            skip=1)
        rounds = _element_texts(driver, By.CSS_SELECTOR, 'div.round_name')
        rounds_and_times = _element_texts(
            driver, By.XPATH,
            "//div[contains(@class, 'match_ko') or contains(@class, 'round_name')]",
            skip=1)

        # create df
        df = pd.DataFrame({
            'Round': _repeat_headers(rounds_and_times, rounds),
            'Date': _repeat_headers(dates_and_times, dates),
            'Time': times,
            'Home_Team': home_teams,
            'Home_Score': scores[::2],
            'Away_Score': scores[1::2],
            'Away_Team': away_teams,
            'Home_Score_AET': extra_time[::2],
            'Away_Score_AET': extra_time[1::2],
            'Home_Penalties': penalties[::2],
            'Away_Penalties': penalties[1::2],
        })

        # add home/away points columns for future aggregation
        return _add_points(df)
    finally:
        # Always shut the browser down so headless Chrome processes do not pile up.
        driver.quit()