# Data collection

This notebook collects football data (leagues, teams, players and player statistics) from the API-Football v3 API and saves it to CSV files.

API docs - https://www.api-football.com/documentation-v3

# Libraries

In [2]:
import requests
import pandas as pd
from pandas.io.json import json_normalize
from pandasql import sqldf

# Functions

In [3]:
def api(param, timeout=30):
    """Call an API endpoint and return the decoded JSON body.

    Parameters
    ----------
    param : str
        Endpoint path (including any query string) appended to the base
        url, e.g. ``'leagues'``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the collection loop forever.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """

    # Define api url
    url = 'https://api-football-v1.p.rapidapi.com/v3/' + param

    # Authentication (``key`` is the module-level api key read at initialization)
    headers = {
        "X-RapidAPI-Key": key,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail here with a clear HTTP error instead of later with a KeyError
    # when the caller looks up keys in an error payload
    response.raise_for_status()

    return response.json()

# Initialization

In [4]:
# Read api key
# Strip surrounding whitespace: a trailing newline in the key file would
# otherwise become part of the X-RapidAPI-Key header value
with open('api-key', 'r') as f:
    key = f.read().strip()

# List of countries used to filter the leagues
lstCount = ['France', 'Belgium', 'England', 'Germany', 'Italy', 'Netherlands', 'Portugal', 'Spain', 'Ukraine',
            'Turkey']

# List of league names used to filter the leagues
lstLeag = ['Premier League', 'La Liga', 'Serie A', 'Bundesliga', 'Ligue 1', 'Primeira Liga', 'Eredivisie',
           'Süper Lig', 'Jupiler Pro League']

# Load data

## Seasons

In [5]:
# First and last season to collect
season_start_id, season_end_id = 2012, 2021

## Leagues

Run api

In [34]:
# Request the 'leagues' endpoint and keep the decoded JSON payload
jLeagues = api('leagues')

Transform into dataframe

In [None]:
# Normalize from json
dfLeagues = pd.json_normalize(jLeagues['response'])

# Rename columns
# regex=False makes the dot a literal character regardless of the pandas
# version's default for ``regex``
dfLeagues.columns = dfLeagues.columns.str.replace('.', '_', regex=False)

# Filter leagues: keep rows whose league name AND country are both listed
dfLeagues = dfLeagues[dfLeagues['league_name'].isin(lstLeag) & dfLeagues['country_name'].isin(lstCount)]

# Save into file (without the 'seasons' column)
# NOTE(review): later cells read 'data/leagues.csv' and 'raw_data/leagues.csv',
# not 'leagues.csv' - confirm where this file is expected to live
dfLeagues.drop(columns='seasons').to_csv('leagues.csv', index=False)

# Get league id
# league_id = dfLeagues[(dfLeagues['country_name'] == 'England')\
#                       & (dfLeagues['league_name'] == 'Premier League')]['league_id'].values[0]

## Teams

In [None]:
# Get list of leagues
# NOTE(review): the leagues cell saves to 'leagues.csv' while this cell reads
# 'data/leagues.csv' - confirm the expected location
dfLeagues = pd.read_csv('data/leagues.csv')
listLeagues = dfLeagues['league_id'].to_list()

# Collect one data frame per (season, league) and concatenate once at the end;
# concatenating inside the loop copies all previous rows on every iteration
lstTeams = []

# Clear log file
open('log/teams.log', 'w').close()

# Get teams for all seasons (season_end_id inclusive) and all leagues
for season_id in range(season_start_id, season_end_id + 1):
    for league_id in listLeagues:

        # Run api
        jTeams = api(f'teams?league={league_id}&season={season_id}')

        # Normalize from json
        dfTeamsTemp = pd.json_normalize(jTeams['response'])

        # An empty response contributes no rows; skip it so the column
        # rename below is never applied to a frame without columns
        if not dfTeamsTemp.empty:

            # Rename columns (regex=False: the dot is a literal character)
            dfTeamsTemp.columns = dfTeamsTemp.columns.str.replace('.', '_', regex=False)

            # Add league and season
            dfTeamsTemp['league_id'] = league_id
            dfTeamsTemp['season_id'] = season_id

            # Keep temp data for the final concatenation
            lstTeams.append(dfTeamsTemp)

        # Log
        with open('log/teams.log', 'a') as f:
            f.write(f'season_id = {season_id}, league_id = {league_id}\n')

# Merge all collected data (pd.concat raises on an empty list)
dfTeams = pd.concat(lstTeams) if lstTeams else pd.DataFrame()

# Save into file
dfTeams.to_csv('teams.csv', index=False)

## Players statistics

In [None]:
# Season start and end (both inclusive)
season_start_id = 2012
season_end_id = 2021

# Clear log file
open('log/players.log', 'w').close()

# Get list of leagues
# NOTE(review): the leagues cell saves to 'leagues.csv' while this cell reads
# 'raw_data/leagues.csv' - confirm the expected location
dfLeagues = pd.read_csv('raw_data/leagues.csv')
listLeagues = dfLeagues['league_id'].to_list()

# Get players and their statistics for all seasons and all leagues;
# one pair of output files is written per season
for season_id in range(season_start_id, season_end_id + 1):

    # Collect per-page / per-player frames and concatenate once per season;
    # concatenating inside the loops copies all previous rows every time
    lstPlayers = []
    lstPlayerStats = []

    for league_id in listLeagues:

        # First request: gives the total number of pages. It is also reused
        # as page 1 below instead of downloading that page a second time
        # (assumes the API serves page 1 when no page is given - TODO confirm)
        jPlayers = api(f'players?league={league_id}&season={season_id}')

        # Get total number of pages
        pages = jPlayers['paging']['total']

        # Get all pages (1-based page numbers)
        for page_id in range(1, pages + 1):

            # Get current page; page 1 is already in jPlayers
            if page_id > 1:
                jPlayers = api(f'players?league={league_id}&season={season_id}&page={page_id}')

            # Nothing to do for a page without records
            if not jPlayers['response']:
                continue

            # Normalize the page once; the 'statistics' column still holds
            # each player's raw list of statistics records
            dfPage = pd.json_normalize(jPlayers['response'])

            # Players of the current page, tagged with league and season
            dfPlayersTemp = dfPage.drop('statistics', axis=1)
            dfPlayersTemp['league_id'] = league_id
            dfPlayersTemp['season_id'] = season_id
            lstPlayers.append(dfPlayersTemp)

            with open('log/players.log', 'a') as f:
                for player_id, stats in zip(dfPage['player.id'], dfPage['statistics']):

                    # Statistics of the current player, tagged with player and season
                    dfPlayerStatsTemp = pd.json_normalize(stats)
                    dfPlayerStatsTemp['player_id'] = player_id
                    dfPlayerStatsTemp['season_id'] = season_id
                    lstPlayerStats.append(dfPlayerStatsTemp)

                    # Log
                    f.write(f'season_id = {season_id}, league_id = {league_id}, player_id = {player_id}, page_id = {page_id}, pages = {pages}\n')

    # Merge everything collected for the season (pd.concat raises on an empty list)
    dfPlayers = pd.concat(lstPlayers) if lstPlayers else pd.DataFrame()
    dfPlayerStats = pd.concat(lstPlayerStats) if lstPlayerStats else pd.DataFrame()

    # Rename columns (regex=False: the dot is a literal character);
    # skipped for a frame without columns, which has nothing to rename
    if len(dfPlayers.columns):
        dfPlayers.columns = dfPlayers.columns.str.replace('.', '_', regex=False)
    if len(dfPlayerStats.columns):
        dfPlayerStats.columns = dfPlayerStats.columns.str.replace('.', '_', regex=False)

    # Save into file
    dfPlayers.to_csv('players_' + str(season_id) + '.csv', index=False)
    dfPlayerStats.to_csv('player_stats_' + str(season_id) + '.csv', index=False)