In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [13]:
# Date string assembled from its year, month and day parts (concatenated with no separator).
scrape_date = ''.join(['2002', '08', '01'])

In [10]:
# Competitions kept by default when filtering the scraped fixtures.
TOP_COMPETITIONS = (
    'English Premier League',
    'Spanish LaLiga',
    'German Bundesliga',
    'Italian Serie A',
    'French Ligue 1',
    'English FA Cup',
    'English Carabao Cup',
    'Spanish Copa Del Rey',
    'Italian Coppa Italia',
    'German DFB Pokal',
    'French Coupe De France',
    'UEFA Champions League',
    'UEFA Europa League',
)

# Column layout shared by populated and empty results, so frames from
# different dates can always be concatenated.
_FIXTURE_COLUMNS = [
    'League', 'Home Team', 'Away Team',
    'Home Score', 'Away Score',
    'Home Points', 'Away Points',
    'Date',
]


def _split_score(score_text):
    """Split a scraped score label into ``(home, away)`` strings.

    A label of the form ``'<digits> - <digits>'`` yields both numbers in full,
    so multi-digit scores survive. Any other label (for example the ``'v'``
    placeholder) falls back to its first and last character; an empty label
    yields two empty strings instead of raising ``IndexError``.
    """
    halves = score_text.split('-')
    if len(halves) == 2:
        home, away = halves[0].strip(), halves[1].strip()
        if home.isdecimal() and away.isdecimal():
            return home, away
    return score_text[:1], score_text[-1:]


def _match_points(home_score, away_score):
    """Return ``(home_points, away_points)`` using 3 for a win, 1 for a draw, 0 for a loss.

    Scores are compared as integers. If either score is not numeric, both
    values are NaN.
    """
    if not (home_score.isdecimal() and away_score.isdecimal()):
        return np.nan, np.nan
    home_goals, away_goals = int(home_score), int(away_score)
    if home_goals > away_goals:
        return 3.0, 0.0
    if home_goals < away_goals:
        return 0.0, 3.0
    return 1.0, 1.0


def _clean_team_name(label):
    """Drop the last four characters of a team label and trim spaces.

    Assumes every label ends with a space plus a 3-letter abbreviation,
    as the original code did -- TODO confirm this holds for all teams.
    """
    return label[:-4].strip(' ')


def _group_fixtures(labels, league_names):
    """Turn the mixed league/team label sequence into ``(league, home, away)`` tuples.

    Team labels are paired in order *within* each league. Labels that appear
    before the first league header are ignored, and a trailing unpaired team
    in a league is dropped so it cannot shift the pairing of later leagues.
    """
    fixtures = []
    current_league = None
    pending_home = None
    for label in labels:
        if label in league_names:
            current_league = label
            pending_home = None
        elif current_league is not None:
            if pending_home is None:
                pending_home = label
            else:
                fixtures.append((current_league, pending_home, label))
                pending_home = None
    return fixtures


def scrape_ESPN(scrape_date, competitions=None, timeout=30):
    """Scrape the ESPN soccer fixtures page for one date into a DataFrame.

    Parameters
    ----------
    scrape_date : str
        Date string appended to the fixtures URL (the notebook passes values
        such as ``'20020505'``).
    competitions : iterable of str, optional
        League names to keep. Defaults to ``TOP_COMPETITIONS``.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns ``League, Home Team, Away Team, Home Score, Away Score,
        Home Points, Away Points, Date``. Scores are strings. Points are
        floats (3 / 1 / 0) and NaN when the score is not numeric. The index
        holds each fixture's position on the page before filtering. When no
        fixtures are found, an empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        If the number of scraped score labels differs from the number of
        fixtures, since scores are matched to fixtures purely by position.
    """
    wanted = TOP_COMPETITIONS if competitions is None else competitions

    # Fetch the page. The timeout keeps a stalled connection from blocking forever.
    # NOTE(review): the HTTP status is not checked; an error page is parsed
    # like any other response.
    url = 'https://www.espn.com/soccer/fixtures/_/date/' + scrape_date
    html_content = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html_content, "html.parser")

    # League headers on their own, used to tell headers from teams below.
    league_names = set()
    for header in soup.find_all('h2', attrs={'class': 'table-caption'}):
        league_names.add(header.get_text())

    # League headers and team names together, in page order.
    labels = []
    for element in soup.find_all(['h2', 'a', 'span'],
                                 attrs={'class': ['table-caption', 'team-name']}):
        labels.append(element.get_text())

    fixtures = _group_fixtures(labels, league_names)
    if not fixtures:
        return pd.DataFrame(columns=_FIXTURE_COLUMNS)

    # One score label per fixture is expected, in the same order.
    score_labels = []
    for element in soup.find_all('span', attrs='record'):
        score_labels.append(element.get_text())

    if len(score_labels) != len(fixtures):
        raise ValueError(
            'Scraped %d score labels for %d fixtures on %s; cannot align them.'
            % (len(score_labels), len(fixtures), scrape_date)
        )

    records = []
    for (league, home_label, away_label), score_label in zip(fixtures, score_labels):
        home_score, away_score = _split_score(score_label)
        home_points, away_points = _match_points(home_score, away_score)
        records.append({
            'League': league,
            'Home Team': _clean_team_name(home_label),
            'Away Team': _clean_team_name(away_label),
            'Home Score': home_score,
            'Away Score': away_score,
            'Home Points': home_points,
            'Away Points': away_points,
            'Date': scrape_date,
        })

    all_fixtures = pd.DataFrame(records, columns=_FIXTURE_COLUMNS)

    # .copy() makes the filtered result an independent frame rather than a
    # slice of all_fixtures, so callers can modify it without warnings.
    return all_fixtures[all_fixtures['League'].isin(list(wanted))].copy()

In [11]:
# Example run for the date string '20020505'; as the cell's last expression,
# the returned DataFrame is shown as the cell output.
scrape_ESPN(scrape_date='20020505')

Unnamed: 0,League,Home Team,Away Team,Home Score,Away Score,Home Points,Away Points,Date
0,English Premier League,Liverpool,Blackburn Rovers,v,v,,,20020505
3,Spanish LaLiga,Alavés,Rayo Vallecano,0,1,0.0,3.0,20020505
4,Spanish LaLiga,Athletic Club,Osasuna,1,1,1.0,1.0,20020505
5,Spanish LaLiga,Barcelona,Espanyol,2,0,3.0,0.0,20020505
6,Spanish LaLiga,Celta Vigo,Sevilla,1,2,0.0,3.0,20020505
7,Spanish LaLiga,Las Palmas,Tenerife,0,1,0.0,3.0,20020505
8,Spanish LaLiga,Málaga,Valencia,0,2,0.0,3.0,20020505
9,Spanish LaLiga,Real Betis,Deportivo La Coruña,0,3,0.0,3.0,20020505
10,Spanish LaLiga,Real Madrid,Mallorca,0,0,1.0,1.0,20020505
11,Spanish LaLiga,Real Valladolid,Real Sociedad,1,3,0.0,3.0,20020505
