## Scraping all data from the UEFA Champions League website
The data has been obtained using the match information table on the right-hand side of all match pages. Unfortunately, there is no information available (at least not to my knowledge) for the 2006 - 2007 season.

The pages that are linked by the function `makeMatchLinks` are similar to [this one](https://www.uefa.com/uefachampionsleague/history/season=2002/matches/round=1633/match=69700/events/index.html). That is why the array for each season has the following structure.

The seasons have the following structure:
- season[0] = the value for season in the match-overview link; this is the value of the year in which the final is played
- season[1] = the first value for day in the match-overview link; this stands for the first qualifying round
- season[2] = the final value for day in the match-overview link; this stands for the final
- season[3] = the label of the season, used as the name of the stored files and as the prefix of the match-ids

In [None]:
import requests
import copy
import os
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Folder in which the match links of every season are stored as "<season label>.csv"
link_location = "Match links"
# Folder in which the goals of every season are stored as "<season label>.json"
goal_location = "Goal information"

In [None]:
# One list per season, laid out as
#   [season value in the match-overview link,
#    first value for day in the match-overview link,
#    final value for day in the match-overview link,
#    label used for the stored files]
S1516 = [2016, -8, 13, "S1516"]
S1415 = [2015, -8, 13, "S1415"]
S1314 = [2014, -8, 13, "S1314"]
S1213 = [2013, -8, 13, "S1213"]
S1112 = [2012, -8, 13, "S1112"]
S1011 = [2011, -8, 13, "S1011"]
S0910 = [2010, -8, 13, "S0910"]
S0809 = [2009, -6, 13, "S0809"]
S0708 = [2008, -8, 13, "S0708"]
# The missing 2007 is due to a change in UEFA naming.
# NOTE: S0607 is defined here but is not part of `seasons` below.
S0607 = [2006, -13, 13, "S0607"]
S0506 = [2005, -12, 13, "S0506"]
S0405 = [2004, -10, 13, "S0405"]
S0304 = [2003, -8, 26, "S0304"]
S0203 = [2002, -6, 17, "S0203"]
S0102 = [2001, -6, 17, "S0102"]
S0001 = [2000, -6, 17, "S0001"]
S9900 = [1999, -6, 17, "S9900"]
S9899 = [1998, -4, 13, "S9899"]
S9798 = [1997, -4, 12, "S9798"]
S9697 = [1996, -2, 11, "S9697"]
S9596 = [1995, -2, 11, "S9596"]
S9495 = [1994, -2, 11, "S9495"]
S9394 = [1993, -6, 9, "S9394"]
S9293 = [1992, -6, 12, "S9293"]

# The seasons that are scraped, newest first. Earlier years are left out
# because the competition was not called Champions League then.
seasons = [
    S1516,
    S1415,
    S1314,
    S1213,
    S1112,
    S1011,
    S0910,
    S0809,
    S0708,
    S0506,
    S0405,
    S0304,
    S0203,
    S0102,
    S0001,
    S9900,
    S9899,
    S9798,
    S9697,
    S9596,
    S9495,
    S9394,
    S9293,
]

### The function to obtain all match links
This function is used to scrape all links to individual matches. The only required input is a season as defined in the cell above. A pandas DataFrame with two columns is returned: the column `ID` holds the match-id, which I defined to be the season (S1011 for example) followed by a three-digit integer counting up from 000 to the final (for example `S1011 - 000`). The column `Match link` holds the link to the actual match.

In [None]:
def makeMatchLinks(season):
    """Collect the links to the event pages of all matches of one season.

    Parameters
    ----------
    season : list
        ``[year, first_day, last_day, label]``: the ``season=`` value of the
        match-overview link, the first and the final ``day=`` value to visit
        (both are visited) and the label used as prefix of the match IDs.

    Returns
    -------
    pandas.DataFrame
        One row per match link found, with the columns ``"ID"``
        (``"<label> - NNN"``, numbered from 000 in the order the links are
        found) and ``"Match link"`` (the link to the events page of the match).
    """
    year, first_day, last_day, label = season[0], season[1], season[2], season[3]

    # Visit every integer day exactly once. (np.linspace with
    # `last - first` points yields fractional steps; truncating those to int
    # skips some days and repeats others.)
    # The two sessions are such that all matches in the week (both days) are
    # checked.
    matchDays = [
        "http://www.uefa.com/uefachampionsleague/season=" + str(year)
        + "/matches/library/fixtures/day=" + str(day)
        + "/session=" + str(session)
        + "/_matchesbydate.html?_=1"
        for day in range(int(first_day), int(last_day) + 1)
        for session in (1, 2)
    ]

    matches = {"ID": [],
               "Match link": []}

    number = 0

    for day in matchDays:
        # Keep retrying until the page is fetched. Only request errors are
        # caught so that Ctrl-C / kernel interrupt still stops the loop, and
        # the timeout prevents a stalled connection from hanging forever.
        while True:
            try:
                response = requests.get(day, timeout=30)
            except requests.exceptions.RequestException:
                print(day)  # shows which overview page keeps failing
                continue
            break

        overview = BeautifulSoup(response.text, 'html.parser')

        for anchor in overview.findAll('a', attrs={'class': 'lbl'}):
            href = anchor.get('href')
            if not href:
                continue  # an anchor without target cannot point to a match

            # Point the link at the events page of the match, below the
            # "history" section of the site.
            events_path = href.replace("index.html", "events/index.html")
            events_path = events_path.replace("uefachampionsleague",
                                              "uefachampionsleague/history")

            matches["ID"].append("{} - {:03d}".format(label, number))
            matches["Match link"].append("http://www.uefa.com" + events_path)
            number += 1

    return pd.DataFrame.from_dict(matches)

In [None]:
def goalsPerMatch(link):
    """Scrape the goals of a single match from its events page.

    Parameters
    ----------
    link : str
        Link to the events page of the match.

    Returns
    -------
    dict
        ``{"Minute": [...], "Team": [...], "Period": [...]}`` with one entry
        per goal in each list. ``"Team"`` is ``"Home"`` or ``"Away"``; all
        home goals are listed before the away goals. Minute and period are
        the values returned by ``determine_period``.
    """
    # Keep retrying until the page is fetched. Only request errors are caught
    # so that Ctrl-C / kernel interrupt still stops the loop, and the timeout
    # prevents a stalled connection from hanging forever.
    while True:
        try:
            response = requests.get(link, timeout=30)
        except requests.exceptions.RequestException:
            print(link)  # check while the code is running if the link is broken
            continue
        break

    match_events = BeautifulSoup(response.text, 'html.parser')

    goals = {"Minute": [],
             "Team": [],
             "Period": []}

    event_classes = (("Home", 'match-event match-event-home'),
                     ("Away", 'match-event match-event-away'))

    for team, event_class in event_classes:
        for event in match_events.findAll('li', attrs={'class': event_class}):
            # The kind of event is read from the source of its icon; events
            # without an icon cannot be goals.
            icon = event.find('img')
            if icon is None or "code_GOAL" not in icon.get("src", ""):
                continue

            # [:-1] drops the last character of the label
            # (presumably the minute mark, e.g. "45+2'" -> "45+2").
            label = event.find('span', attrs={'class': 'minute'}).text[:-1]
            minute, period = determine_period(label)

            goals["Team"].append(team)
            goals["Minute"].append(minute)
            goals["Period"].append(period)

    return goals

In [None]:
def determine_period(goal):
    """Translate the minute label of a goal into a minute and a period.

    Parameters
    ----------
    goal : str
        The minute label, either a plain minute (``"37"``) or the final
        minute of a period plus additional time (``"45+2"``, ``"90+10"``).

    Returns
    -------
    tuple of (int, str)
        The minute (for additional time: final minute of the period plus the
        additional minutes) and the name of the period.

    Raises
    ------
    ValueError
        If the label is not a number (optionally followed by ``+number``),
        lies beyond minute 120, or has additional time attached to a minute
        that does not end a period.
    """
    regular_text, plus, added_text = goal.strip().partition("+")

    try:
        regular = int(regular_text)
        # The whole number after the "+" counts, not only its last digit.
        added = int(added_text) if plus else 0
    except ValueError:
        raise ValueError("unrecognised goal minute: {!r}".format(goal))

    # Final minute of every period, in playing order.
    periods = ((45, "First half"),
               (90, "Second half"),
               (105, "Extra time - First half"),
               (120, "Extra time - Second half"))

    if plus:
        # Additional time only exists after the final minute of a period.
        for final_minute, period in periods:
            if regular == final_minute:
                return regular + added, period + " - Additional time"
    else:
        for final_minute, period in periods:
            if regular <= final_minute:
                return regular, period

    raise ValueError("unrecognised goal minute: {!r}".format(goal))

In [None]:
def goalsPerSeason(season):
    """Scrape the goals of every match of one season.

    The match links of the season are also written to
    ``<link_location>/<label>.csv``.

    Parameters
    ----------
    season : list
        A season as accepted by ``makeMatchLinks``; ``season[3]`` is the
        label used for the file name.

    Returns
    -------
    dict
        Maps every match ID to the goals dict returned by ``goalsPerMatch``.
    """
    links = makeMatchLinks(season)

    # to_csv does not create the folder it writes into.
    os.makedirs(link_location, exist_ok=True)
    links.to_csv(os.path.join(link_location, season[3] + ".csv"))

    # The links are used straight from memory: reading the file back with
    # pd.DataFrame.from_csv no longer works (removed in pandas 1.0) and
    # would return the same IDs and links anyway.
    return {
        match_id: goalsPerMatch(match_link)
        for match_id, match_link in zip(links["ID"], links["Match link"])
    }

### The loop to extract and save all information

In [None]:
# Scrape every season and store its goals as "<goal_location>/<label>.json".
# open() does not create the folder, so make sure it exists first.
os.makedirs(goal_location, exist_ok=True)

for season in seasons:
    goals = goalsPerSeason(season)

    with open(os.path.join(goal_location, season[3] + ".json"), "w") as outfile:
        json.dump(goals, outfile, indent=4, sort_keys=True)