# Análise e Predição de Química entre Jogadores de Futebol

Grupo: Gabriel Castelo, Matheus Vaz, Victor Kenji e Vinicius Gomes

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import socceraction.spadl as spd
from bs4 import BeautifulSoup
import requests

pd.set_option('future.no_silent_downcasting', True)

In [11]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd

# URL da tabela
url = 'https://fbref.com/pt/comps/9/2017-2018/defense/2017-2018-Premier-League-estatisticas'

# Headers para simular um navegador
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Faz requisição
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# A tabela está dentro de um comentário HTML
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

# Busca pela tabela dentro dos comentários
table = None
for comment in comments:
    comment_soup = BeautifulSoup(comment, 'html.parser')
    table = comment_soup.find('table', id='stats_defense')
    if table:
        break

# Se tabela foi encontrada, extrair os dados
if not table:
    raise ValueError('Tabela stats_defense não encontrada!')

# Extrair cabeçalhos
headers = [th.text.strip() for th in table.find('thead').find_all('th')]
headers = headers[1:]  # ignora a coluna do checkbox

# Extrair dados do corpo da tabela
data = []
for row in table.find('tbody').find_all('tr'):
    # Ignora linhas de subtítulos
    if row.get('class') and 'thead' in row.get('class'):
        continue
    cells = [td.text.strip() for td in row.find_all('td')]
    if cells:
        data.append(dict(zip(headers, cells)))

# Converter para DataFrame
df = pd.DataFrame(data)

df.head()

Unnamed: 0,Botes defensivos,Desafios,Bloqueios,Unnamed: 4,Class.,Jogador,Nação,Pos.,Equipe,Idade,...,90s,Div,TklW,Terço Def,Terço Central,Terço de Ataque,Tent,Tkl%,Perdido,TC
0,Patrick van Aanholt,nl NED,2,26,1990,24.3,47,32,29,15,...,16,19,471,18,24,5,47,94,64,Partidas
1,Rolando Aarons,eng ENG,0,21,1995,1.5,4,4,3,1,...,4,3,667,2,3,0,1,5,0,Partidas
2,Tammy Abraham,eng ENG,0,19,1997,19.2,10,8,1,7,...,1,7,77,12,8,1,1,11,11,Partidas
3,Charlie Adam,sct SCO,2,31,1985,4.6,9,5,2,5,...,5,2,294,12,2,0,9,18,11,Partidas
4,Adrián,es ESP,1,30,1987,19.0,2,1,2,0,...,1,1,250,3,1,0,0,2,15,Partidas


In [7]:
def load_matches(path):
    matches = pd.read_json(path_or_buf=path)
    # as informações dos times de cada partida estão em um dicionário dentro da coluna 'teamsData', então vamos separar essas informações
    team_matches = []
    for i in range(len(matches)):
        match = pd.DataFrame(matches.loc[i, 'teamsData']).T
        match['matchId'] = matches.loc[i, 'wyId']
        team_matches.append(match)
    team_matches = pd.concat(team_matches).reset_index(drop=True)
    return team_matches

def load_players(path):
    players = pd.read_json(path_or_buf=path)
    players['player_name'] = players['firstName'] + ' ' + players['lastName']
    players = players[['wyId', 'player_name']].rename(columns={'wyId': 'player_id'})
    return players

def load_events(path):
    events = pd.read_json(path_or_buf=path)
    # pré processamento em colunas da tabela de eventos para facilitar a conversão p/ SPADL
    events = events.rename(columns={
        'id': 'event_id',
        'eventId': 'type_id',
        'subEventId': 'subtype_id',
        'teamId': 'team_id',
        'playerId': 'player_id',
        'matchId': 'game_id'
    })
    events['milliseconds'] = events['eventSec'] * 1000
    events['period_id'] = events['matchPeriod'].replace({'1H': 1, '2H': 2})
    return events

# def load_minutes_played_per_game(path):
#     minutes = pd.read_json(path_or_buf=path)
#     minutes = minutes.rename(columns={
#         'playerId': 'player_id',
#         'matchId': 'game_id',
#         'teamId': 'team_id',
#         'minutesPlayed': 'minutes_played'
#     })
#     minutes = minutes.drop(['shortName', 'teamName', 'red_card'], axis=1)
#     return minutes

In [None]:
leagues = ['England', 'Spain', 'Germany', 'Italy']
events = {}
matches = {}
# minutes = {}
for league in tqdm(leagues):
    path = r'data/matches/matches_{}.json'.format(league)
    matches[league] = load_matches(path)
    path = r'data/events/events_{}.json'.format(league)
    events[league] = load_events(path)
    # path = r'data/minutes_played_per_game_{}.json'.format(league)
    # minutes[league] = load_minutes_played_per_game(path)

path = r'data/players.json'
players = load_players(path)
players['player_name'] = players['player_name'].str.decode('unicode-escape')

## Mapeamento para SPADL

In [None]:
def spadl_transform(events, matches):
    spadl = []
    game_ids = events.game_id.unique().tolist()
    for g in tqdm(game_ids):
        match_events = events.loc[events.game_id == g]
        match_home_id = matches.loc[(matches.matchId == g) & (matches.side == 'home'), 'teamId'].values[0]
        match_actions = spd.wyscout.convert_to_actions(events=match_events, home_team_id=match_home_id)
        match_actions = spd.play_left_to_right(actions=match_actions, home_team_id=match_home_id)
        match_actions = spd.add_names(match_actions)
        spadl.append(match_actions)
    spadl = pd.concat(spadl).reset_index(drop=True)
    return spadl

In [None]:
spadl = {}
for league in leagues:
    spadl[league] = spadl_transform(events=events[league], matches=matches[league])
    # Adicionando o nome dos players
    spadl[league] = spadl[league].merge(players[['player_id', 'player_name']], on='player_id', how='left')

## Análise exploratória