In [1]:
import codecs
import json
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import json_normalize

In [2]:
# EPL seasons to scrape; each value is interpolated into the Understat league URL by get_season_html
seasons = [2023]

In [3]:
def get_season_html(season, timeout=30):
    """
    Download the Understat EPL league page for one season.

    Parameters:
    - season (int or str): Season identifier interpolated into the URL path.
    - timeout (float): Seconds to wait for the server before giving up.

    Returns:
    - bytes: Raw body of the HTTP response (the page's HTML).

    Raises:
    - requests.HTTPError: If the server answers with a 4xx/5xx status.
    - requests.RequestException: On connection failures or when the timeout expires.
    """
    # Construct the URL based on the league (EPL) and season
    url = f"https://understat.com/league/EPL/{season}"

    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, timeout=timeout)

    # Fail here rather than handing an error page to the parser
    response.raise_for_status()

    return response.content


In [4]:
def parse_html_content(html_content, variable_name="teamsData"):
    """
    Extract the JSON object embedded in the page as ``<variable_name> = JSON.parse('...')``.

    The payload is located by the JavaScript variable name instead of by a
    fixed script position and fixed character offsets, so extra <script> tags
    or whitespace changes in the page do not break the extraction.

    Parameters:
    - html_content (bytes or str): HTML of the league page.
    - variable_name (str): Name of the JavaScript variable whose JSON.parse
      argument should be decoded. Defaults to "teamsData".

    Returns:
    - The decoded JSON value (for "teamsData" the caller iterates it as a dict).

    Raises:
    - ValueError: If no matching ``JSON.parse('...')`` assignment is found, or
      if the payload is not valid JSON (json.JSONDecodeError is a ValueError).
    """
    # response.content is bytes; accept already-decoded text as well
    if isinstance(html_content, (bytes, bytearray)):
        html_text = bytes(html_content).decode("utf-8", errors="replace")
    else:
        html_text = str(html_content)

    # Capture only the single-quoted argument of JSON.parse for the requested variable
    pattern = re.compile(
        r"\b" + re.escape(variable_name) + r"\s*=\s*JSON\.parse\(\s*'(.*?)'\s*\)",
        re.DOTALL,
    )
    match = pattern.search(html_text)
    if match is None:
        raise ValueError(
            f"Could not find \"{variable_name} = JSON.parse('...')\" in the HTML content"
        )

    # The payload is written with backslash escapes (e.g. \x7B); turn them into real characters
    decoded_payload = codecs.decode(match.group(1), 'unicode_escape')

    return json.loads(decoded_payload)


In [5]:
def normalized_dataframe(teams_data):
    """
    Build one flattened DataFrame per team.

    For every value in ``teams_data`` a DataFrame is created, its 'history'
    column (one dict per row) is expanded into separate columns with
    json_normalize, and the expanded columns are placed to the right of the
    remaining team columns.

    Parameters:
    - teams_data (dict): Mapping of team id to that team's data, which must
      include a 'history' entry.

    Returns:
    - list: One DataFrame per team, in the mapping's iteration order.
    """
    def _flatten_team(team_record):
        # One row per 'history' entry; scalar team fields are repeated on every row
        base_frame = pd.DataFrame(team_record)
        history_columns = json_normalize(base_frame['history'])
        team_columns = base_frame.drop(columns=['history'])
        return pd.concat([team_columns, history_columns], axis=1)

    return [_flatten_team(team_record) for team_record in teams_data.values()]


In [6]:
# Collect the per-team DataFrames of every requested season in one flat list
normalized_dfs = []

# Iterate through each season
for position, season in enumerate(seasons):
    # Wait 5 seconds between consecutive requests; no delay is needed before the
    # first request or after the last one
    if position > 0:
        time.sleep(5)

    # Fetch HTML content for the current season
    season_html_content = get_season_html(season)

    # Parse HTML content to obtain data
    season_parsed_data = parse_html_content(season_html_content)

    # Build the list of per-team DataFrames for the current season
    season_normalized_df = normalized_dataframe(season_parsed_data)

    # Extend the list with the normalized DataFrames for the current season
    normalized_dfs.extend(season_normalized_df)

# The 'normalized_dfs' list now contains all the normalized DataFrames for each season


In [7]:
# Stack all per-team DataFrames into one table; ignore_index=True discards the
# per-team row labels and assigns a fresh 0..n-1 index
final_df = pd.concat(normalized_dfs, ignore_index=True)

In [8]:
# Inspect the (rows, columns) dimensions of the combined DataFrame
final_df.shape

(410, 23)

In [9]:
# Preview the first rows of the combined DataFrame
final_df.head()

Unnamed: 0,id,title,h_a,xG,xGA,npxG,npxGA,deep,deep_allowed,scored,...,date,wins,draws,loses,pts,npxGD,ppda.att,ppda.def,ppda_allowed.att,ppda_allowed.def
0,71,Aston Villa,a,1.486,4.32208,1.486,4.32208,6,9,1,...,2023-08-12 16:30:00,0,0,1,0,-2.83608,324,28,255,17
1,71,Aston Villa,h,3.24336,0.721465,2.48219,0.721465,13,2,4,...,2023-08-20 13:00:00,1,0,0,3,1.760725,204,12,385,23
2,71,Aston Villa,a,2.83691,0.630605,2.83691,0.630605,9,8,3,...,2023-08-27 13:00:00,1,0,0,3,2.206305,365,15,254,31
3,71,Aston Villa,a,0.615503,2.73138,0.615503,2.73138,3,11,0,...,2023-09-03 13:00:00,0,0,1,0,-2.115877,543,8,248,29
4,71,Aston Villa,h,2.31518,1.11696,1.55401,1.11696,14,7,3,...,2023-09-16 14:00:00,1,0,0,3,0.43705,141,31,286,30


In [10]:
# Read the Footystats API key from the environment so the credential is never
# stored in the notebook (and therefore never committed or shared with it).
# Set it before starting Jupyter, e.g. `export FOOTYSTATS_API_KEY=...`
api_key = os.environ.get("FOOTYSTATS_API_KEY")
if not api_key:
    raise RuntimeError(
        "FOOTYSTATS_API_KEY is not set; export it before running this notebook."
    )

In [11]:
# League-list endpoint of the Football Data API
url = "https://api.football-data-api.com/league-list"

# Pass the key as a query parameter (requests builds "?key=...") so the secret is
# not embedded in the `url` variable; use a timeout so the cell cannot hang forever
response = requests.get(url, params={"key": api_key}, timeout=30)

# Stop here on a 4xx/5xx answer instead of parsing an error body in later cells
response.raise_for_status()


In [12]:
# Parse the JSON body of the league-list response into Python objects
data = response.json()


In [13]:
# List the top-level keys of the parsed response to inspect its structure
data.keys()


dict_keys(['success', 'pager', 'metadata', 'data', 'message'])

In [14]:
# Collect the 'country' field of every league entry under data['data']
# (None is stored for an entry that has no 'country' key)
countries = [league.get('country') for league in data['data']]

# Show the first five countries as a quick sanity check
print(countries[:5])


['USA', 'Scotland', 'Germany', 'Europe', 'Malaysia']


In [15]:
# Position of the first entry whose country is "England";
# list.index raises ValueError if no such entry exists
england_index = countries.index("England")

england_index

5

In [16]:
# Read the season list of the entry located via `england_index` rather than a
# hard-coded position, so a change in the API's ordering cannot silently select
# a different league
seasons_info = data["data"][england_index]["season"]

# Display the available seasons
seasons_info


[{'id': 9, 'year': 20162017},
 {'id': 10, 'year': 20152016},
 {'id': 11, 'year': 20142015},
 {'id': 12, 'year': 20132014},
 {'id': 161, 'year': 20172018},
 {'id': 246, 'year': 20122013},
 {'id': 1625, 'year': 20182019},
 {'id': 2012, 'year': 20192020},
 {'id': 3119, 'year': 20112012},
 {'id': 3121, 'year': 20102011},
 {'id': 3125, 'year': 20092010},
 {'id': 3131, 'year': 20082009},
 {'id': 3137, 'year': 20072008},
 {'id': 4759, 'year': 20202021},
 {'id': 6135, 'year': 20212022},
 {'id': 7704, 'year': 20222023},
 {'id': 9660, 'year': 20232024}]

In [17]:
# Pull the 'id' of every season entry (None when an entry has no 'id' key)
season_ids = [season_entry.get("id") for season_entry in seasons_info]

# Show the collected season IDs
print(season_ids)


[9, 10, 11, 12, 161, 246, 1625, 2012, 3119, 3121, 3125, 3131, 3137, 4759, 6135, 7704, 9660]


In [18]:
# Season IDs to download match data for (9660 is listed above with year 20232024)
season_ids_of_interest = [9660]

In [19]:
def get_league_matches(api_key, season_id, timeout=30):
    """
    Retrieve league matches data for a specific season.

    Parameters:
    - api_key (str): API key for accessing the Football Data API.
    - season_id (int): ID of the specific season.
    - timeout (float): Seconds to wait for the server before giving up.

    Returns:
    - response: HTTP response object containing the retrieved data.

    Raises:
    - requests.HTTPError: If the server answers with a 4xx/5xx status.
    - requests.RequestException: On connection failures or when the timeout expires.
    """
    url = "https://api.football-data-api.com/league-matches"

    # Let requests build and URL-encode the query string
    query = {"key": api_key, "season_id": season_id}

    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, params=query, timeout=timeout)

    # Surface HTTP errors here instead of returning an error body to the caller
    response.raise_for_status()

    return response


In [20]:
import pandas as pd

def create_dataframe(api_key, season_ids):
    """
    Create a Pandas DataFrame by fetching and concatenating league matches data for multiple seasons.

    Parameters:
    - api_key (str): API key for accessing the Football Data API.
    - season_ids (list): List of season IDs for which data will be fetched.

    Returns:
    - concatenated_df: Pandas DataFrame containing concatenated league matches
      data for the specified seasons. An empty DataFrame is returned when
      `season_ids` is empty.

    Raises:
    - RuntimeError: If fetching or decoding the data of any season fails. The
      original exception is chained as the cause.
    """
    # Initialize an empty list to store individual DataFrames for each season
    list_of_dfs = []

    # Iterate through each season ID in the provided list
    for season_id in season_ids:
        try:
            # Fetch league matches data for the current season
            response = get_league_matches(api_key, season_id)
            payload = response.json()

            # The match records are stored under the "data" key of the payload
            season_df = pd.DataFrame(payload["data"])
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            # Report which season failed and keep the underlying cause; calling
            # exit() here would take down the notebook kernel and hide the error
            raise RuntimeError(
                f"Failed to load league matches for season_id={season_id}"
            ) from exc

        list_of_dfs.append(season_df)

    # pd.concat raises ValueError on an empty list, so handle that case explicitly
    if not list_of_dfs:
        return pd.DataFrame()

    # Concatenate the DataFrames in the list to create a single DataFrame
    concatenated_df = pd.concat(list_of_dfs, ignore_index=True)

    return concatenated_df


In [21]:
# Download and combine the league matches of every season in season_ids_of_interest
matches_df = create_dataframe(api_key, season_ids_of_interest)

# Display the first few rows of the DataFrame to inspect the data
matches_df.head()


Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,...,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts,homeGoals_timings,awayGoals_timings
0,6688951,145,93,2023/2024,complete,100543,1,-1,[],"[4, 36, 75]",...,20,True,True,True,False,False,False,False,[],"[4, 36, 75]"
1,6688952,59,211,2023/2024,complete,100543,1,-1,"[26, 32]",[82],...,21,True,True,True,False,False,False,True,"[26, 32]",[82]
2,6688953,148,153,2023/2024,complete,100543,1,-1,[82],[51],...,19,True,True,False,False,False,False,True,[82],[51]
3,6688954,209,271,2023/2024,complete,100543,1,-1,"[36, 71, 85, 90+5]",[81],...,20,True,True,True,True,True,False,True,"[36, 71, 85, 90+5]",[81]
4,6688955,144,162,2023/2024,complete,100543,1,-1,[],[73],...,21,True,False,False,False,False,False,False,[],[73]


In [22]:
# Summarise the matches DataFrame: index range, column count, dtypes and memory usage
matches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Columns: 215 entries, id to awayGoals_timings
dtypes: bool(7), float64(78), int64(115), object(15)
memory usage: 620.2+ KB


In [23]:
# Inspect the (rows, columns) dimensions of the matches DataFrame
matches_df.shape

(380, 215)

In [24]:
# Make sure the output directory exists; to_csv does not create missing parent folders
os.makedirs('./data', exist_ok=True)

# Save the final dataframe to a CSV file for scraped testing data
final_df.to_csv('./data/scraped_testing_data.csv', index=False)

# Save the matches dataframe to a CSV file for API testing data
matches_df.to_csv('./data/api_testing_data.csv', index=False)
