#Path

In [None]:
# Public standings pages of the leagues. Only ligue_1_url and bundesliga_url
# are requested by the scraping cells visible in this part of the notebook.
ligue_1_url = 'https://www.ligue1.com/ranking'
la_liga_url = 'https://www.laliga.com/en-GB/laliga-easports/standing'
bundesliga_url = 'https://www.bundesliga.com/en/bundesliga/table'
serie_a_url = 'https://www.legaseriea.it/en/serie-a/classifica'
# JSON endpoint queried by the Serie A scraping cell (instead of the HTML page).
serie_a_api = 'https://www.legaseriea.it/api/stats/Classificacompleta'
premier_league_url = 'https://www.premierleague.com/tables'
# JSON endpoint queried by the Premier League scraping cell.
premier_league_api = 'https://footballapi.pulselive.com/football/standings'
# Output folder for the scraped CSVs ("brut" is French for "raw"); each scraper
# writes into a per-league sub-folder of it. Despite the "_url" suffix these
# two values are filesystem paths.
dataset_brut_url = '/content/drive/MyDrive/4MAC/datasets/brut'
# NOTE(review): presumably the folder for cleaned data; it is not referenced
# in the cells visible here.
dataset_processed_url = '/content/drive/MyDrive/4MAC/datasets/processed'

In [2]:
#Import

In [None]:
from bs4 import BeautifulSoup as BS
import os, os.path as path
import requests as request
from matplotlib import pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [3]:
#Constant

In [None]:
# Column names of every scraped standings table, in the same order as the
# values that append_team_data() puts into a row.
COLUMNS = ['id', 'team', 'point', 'played', 'won', 'drawn', 'lost', 'goal_scored', 'goal_conceded', 'goal_diff']

# Seasons to scrape. The key is the long label (used as the Ligue 1 season id,
# the Bundesliga URL suffix and the CSV file name); the value is the short
# label sent to the Serie A API.
SEASONS = {
    '2022-2023': '2022-23',
    '2021-2022': '2021-22',
    '2020-2021': '2020-21',
    '2019-2020': '2019-20',
    '2018-2019': '2018-19',
}

# Browser-like request headers sent with the Ligue 1, Bundesliga and Serie A
# requests. The User-Agent is assembled from adjacent string literals: a
# backslash continuation inside a single literal would pull the next line's
# indentation into the header value.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    ),
}

In [None]:
#Function

In [4]:
def append_team_data(team_list, id, name, point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff):
    """Append one standings row to ``team_list`` in place.

    The row is a new list holding the given values in parameter order
    (id, name, point, played, won, drawn, lost, goal_scored, goal_conceded,
    goal_diff). The values are stored as passed, without conversion.
    Nothing is returned.
    """
    row = [id, name, point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff]
    team_list.append(row)

In [5]:
#Scraping

In [6]:
##Ligue 1

In [None]:
# Scrape the Ligue 1 standings page once per season and save each table as
# <dataset_brut_url>/ligue1/<season>.csv.
params = {
    'StatsActiveTab': 0
}

for season in SEASONS:
  team_list = []
  params['seasonId'] = season
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = request.get(ligue_1_url, params=params, headers=HEADERS, timeout=30)
  if not response.ok:
    # Skip the season rather than overwrite an earlier CSV with an empty table.
    print(f'Ligue 1 {season}: HTTP {response.status_code}, season skipped')
    continue

  soup = BS(response.content, 'lxml')
  table_body = soup.find('div', class_='classement-table-body')
  if table_body is None:
    # find() returns None when the page layout does not contain the table.
    print(f'Ligue 1 {season}: standings table not found, season skipped')
    continue

  # On the 2019-2020 page the numeric cells start one <div> later (index 3
  # instead of 2); the eight statistics are otherwise in the same order:
  # point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff.
  first_stat = 3 if season == '2019-2020' else 2

  for li in table_body.find_all('li'):
    div_list = li.find_all('div')
    position = int(div_list[0].string)  # not named "id": that would shadow the builtin
    name = str.upper(div_list[1].span.string)
    stats = [int(div.string) for div in div_list[first_stat:first_stat + 8]]

    append_team_data(team_list, position, name, *stats)

  df = pd.DataFrame(team_list, columns=COLUMNS)
  df.to_csv(path.join(dataset_brut_url+'/ligue1', season+'.csv'), header=True, index=False)


In [8]:
##Bundesliga

In [None]:
# Scrape the Bundesliga table page once per season and save each table as
# <dataset_brut_url>/bundesliga/<season>.csv.
for season in SEASONS:
  team_list = []
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = request.get(f"{bundesliga_url}/{season}", headers=HEADERS, timeout=30)
  if not response.ok:
    # Skip the season rather than overwrite an earlier CSV with an empty table.
    print(f'Bundesliga {season}: HTTP {response.status_code}, season skipped')
    continue

  soup = BS(response.content, 'lxml')
  if soup.tbody is None:
    # No <tbody> in the returned page, so there is nothing to parse.
    print(f'Bundesliga {season}: standings table not found, season skipped')
    continue

  for tr in soup.tbody.find_all('tr'):
    td_list = tr.find_all('td')
    position = int(td_list[1].span.string)  # not named "id": that would shadow the builtin
    name = str.upper(td_list[3].div['title'])
    point = int(td_list[11].span.string)
    played = int(td_list[5].span.string)
    won = int(td_list[6].span.string)
    drawn = int(td_list[7].span.string)
    lost = int(td_list[8].span.string)
    # Goals come in a single "scored:conceded" cell.
    gs, gc = td_list[9].string.split(':')
    goal_scored = int(gs)
    goal_conceded = int(gc)
    goal_diff = int(td_list[10].span.string)

    append_team_data(team_list, position, name, point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff)

  df = pd.DataFrame(team_list, columns=COLUMNS)
  df.to_csv(path.join(dataset_brut_url+'/bundesliga', season+'.csv'), header=True, index=False)



In [9]:
##Serie A

In [None]:
# Query the Serie A standings API once per season and save each table as
# <dataset_brut_url>/seriea/<season>.csv.
params = {
    'CAMPIONATO': 'A',
    'TURNO': 'UNICO',
    'GIRONE': 'UNI'
}

for season, param in SEASONS.items():
  team_list = []
  # The API takes the short season label (e.g. '2022-23').
  params['STAGIONE'] = param
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = request.get(serie_a_api, params=params, headers=HEADERS, timeout=30)
  if not response.ok:
    # Skip the season rather than overwrite an earlier CSV with an empty table.
    print(f'Serie A {season}: HTTP {response.status_code}, season skipped')
    continue

  data = json.loads(response.text)['data']
  for team in data:
    position = team['PosCls']  # not named "id": that would shadow the builtin
    # Upper-cased like the team names written by the other league scrapers.
    name = str.upper(team['Nome'])
    point = team['PuntiCls']
    played = team['Giocate']
    won = team['Vinte']
    drawn = team['Pareggiate']
    lost = team['Perse']
    goal_scored = team['RETIFATTE']
    goal_conceded = team['RETISUBITE']
    # The goal difference is derived here from the two goal counts.
    goal_diff = goal_scored - goal_conceded

    append_team_data(team_list, position, name, point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff)

  df = pd.DataFrame(team_list, columns=COLUMNS)
  df.to_csv(path.join(dataset_brut_url+'/seriea', season+'.csv'), header=True, index=False)


In [10]:
##Premier League

In [None]:
# Query the Premier League standings API once per season and save each table
# as <dataset_brut_url>/premierleague/<season>.csv.
params = {
    'altIds': True,
    'detail': 2,
    'FOOTBALL_COMPETITION': 1
}
# Season label -> value sent as the API's 'compSeasons' parameter.
seasons = {
    '2022-2023':489,
    '2021-2022':418,
    '2020-2021':363,
    '2019-2020':274,
    '2018-2019':210
}
header = {
    'origin': 'https://www.premierleague.com'
}

for season, param in seasons.items():
  team_list = []
  params['compSeasons'] = param
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = request.get(premier_league_api, params=params, headers=header, timeout=30)
  if not response.ok:
    # Skip the season rather than overwrite an earlier CSV with an empty table.
    print(f'Premier League {season}: HTTP {response.status_code}, season skipped')
    continue

  entries = json.loads(response.text)['tables'][0]['entries']
  for entry in entries:
    position = entry['position']  # not named "id": that would shadow the builtin
    name = str.upper(entry['team']['name'])
    # The statistics are read from the entry's 'overall' sub-dict; a separate
    # name is used so the loop variable is not rebound mid-iteration.
    overall = entry['overall']
    point = overall['points']
    played = overall['played']
    won = overall['won']
    drawn = overall['drawn']
    lost = overall['lost']
    goal_scored = overall['goalsFor']
    goal_conceded = overall['goalsAgainst']
    goal_diff = overall['goalsDifference']

    append_team_data(team_list, position, name, point, played, won, drawn, lost, goal_scored, goal_conceded, goal_diff)

  df = pd.DataFrame(team_list, columns=COLUMNS)
  df.to_csv(path.join(dataset_brut_url+'/premierleague', season+'.csv'), header=True, index=False)
