# Análise e Predição de Química entre Jogadores de Futebol

No futebol moderno, a performance coletiva vai além das métricas individuais como gols, passes certos ou desarmes. 
A interação entre os jogadores - a chamada "química" - é um fator crucial, mas frequentemente ignorado, especialmente 
em decisões de recrutamento. Casos como a contratação simultânea de Andy Carroll e Luis Suárez pelo Liverpool em 2011 
ilustram como a falta de entrosamento pode comprometer o desempenho, mesmo com atletas tecnicamente qualificados.

Este trabalho busca investigar a qualidade do entrosamento entre duplas de jogadores que atuaram juntas, utilizando 
dados de desempenho individual e coletivo. O objetivo é classificar o nível de química entre essas duplas e treinar 
um modelo capaz de prever a compatibilidade entre jogadores que nunca atuaram lado a lado.

Grupo: Gabriel Castelo, Matheus Vaz, Victor Kenji e Vinicius Gomes

Repositório no GitHub: [https://github.com/vinisilvag/cdaf-projeto](https://github.com/vinisilvag/cdaf-projeto)

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from functools import reduce
from collections import defaultdict

from thefuzz import fuzz
from thefuzz import process

import socceraction.spadl as spd

from bs4 import BeautifulSoup, Comment
import requests

pd.set_option('future.no_silent_downcasting', True)

### Scraping de dados da FBref para enriquecer os dados que temos dos jogadores

A metodologia do trabalho de base usa dados proprietários. Assim, pretendemos contornar esse problema fazendo um scraping de dados da FBref para enriquecer as informações dos jogadores e usar, na nossa metodologia, atributos semelhantes aos que eles utilizaram para o cálculo das métricas e o treinamento do modelo.

OBS: Scraping finalizado! Não é necessário rodar as células a seguir, pois os dados coletados já estão salvos localmente (a não ser que a gente queira adicionar mais dados).

In [2]:
# FBref competition id and URL slug of every league that is scraped.
_FBREF_COMPETITIONS = {
    "England": (9, "Premier-League"),
    "Spain": (12, "La-Liga"),
    "Germany": (20, "Bundesliga"),
    "Italy": (11, "Serie-A"),
    "France": (13, "Ligue-1"),
}

# Stat pages fetched for each league: URL path segment -> id of the HTML
# table to extract from that page.
_FBREF_STAT_TABLES = {
    "passing": "stats_passing",
    "defense": "stats_defense",
    "passing_types": "stats_passing_types",
    "misc": "stats_misc",
}

_FBREF_SEASON = "2017-2018"

# league name -> list of (page URL, table id) pairs, in the order above.
player_data = {
    country: [
        (
            f"https://fbref.com/en/comps/{comp_id}/{_FBREF_SEASON}/{page}/"
            f"{_FBREF_SEASON}-{slug}-Stats",
            table_id,
        )
        for page, table_id in _FBREF_STAT_TABLES.items()
    ]
    for country, (comp_id, slug) in _FBREF_COMPETITIONS.items()
}

def _find_commented_table(html, table_id):
    """Return the ``<table id=table_id>`` hidden in an HTML comment, or None."""
    soup = BeautifulSoup(html, 'html.parser')
    # The table is shipped inside an HTML comment, so every comment is parsed
    # as its own document until one of them contains the table.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        table = BeautifulSoup(comment, 'html.parser').find('table', id=table_id)
        if table:
            return table
    return None


def _disambiguate_headers(table_id, columns):
    """Rename labels that repeat inside a table or across the stat tables.

    The per-table frames are later merged into one wide table, so a label
    shared by several tables gets a table-specific suffix, and a label that
    occurs several times inside one table gets a positional suffix.
    Unknown table ids are returned unchanged.
    """
    def tag(names, suffix):
        return [col + suffix if col in names else col for col in columns]

    if table_id == "stats_passing":
        labels = ['_Total', '_Short', '_Medium', '_Long']
        seen = defaultdict(int)
        renamed = []
        for col in columns:
            if col in ('Cmp', 'Att', 'Cmp%'):
                suffix = labels[seen[col]]
                seen[col] += 1
                # 'Att' and 'Cmp' also appear in other tables, so they carry
                # the table tag on top of the positional suffix.
                if col == 'Cmp%':
                    renamed.append(col + suffix)
                else:
                    renamed.append(col + suffix + "_Passing")
            else:
                renamed.append(col)
        return renamed

    if table_id == "stats_defense":
        labels = ['_Tackles', '_Challenges']
        seen = defaultdict(int)
        renamed = []
        for col in columns:
            if col == 'Tkl':
                renamed.append(col + labels[seen[col]])
                seen[col] += 1
            # `elif`, not `if`: a renamed 'Tkl' must not fall through and be
            # appended a second time, which would shift every later column.
            elif col in ("Att", "Blocks", "Int", "Lost", "TklW"):
                renamed.append(col + "_Defense")
            else:
                renamed.append(col)
        return renamed

    if table_id == "stats_passing_types":
        return tag(("Att", "Blocks", "Cmp", "Crs", "Off"), "_PassingTypes")

    if table_id == "stats_misc":
        return tag(("Crs", "Int", "Lost", "Off", "TklW"), "_Misc")

    return list(columns)


def get_data_by_league(league_data):
    """Scrape the FBref stat tables of one league.

    Args:
        league_data: Iterable of ``(url, table_id)`` pairs, i.e. the page to
            download and the id of the table to extract from it.

    Returns:
        A list with one DataFrame per pair, in input order. All values are
        the stripped cell texts. Every frame but the first has the
        descriptive columns ("Squad", "Pos", "Nation", "90s", "Age", "Born")
        removed so they appear only once after the frames are merged.

    Raises:
        requests.HTTPError: If a page answers with an error status.
        ValueError: If the requested table is not present in a page.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}

    scrapped_data = []
    for index, (url, table_id) in enumerate(league_data):
        response = requests.get(url, headers=request_headers, timeout=30)
        # Fail on the real cause (e.g. rate limiting) instead of reporting a
        # misleading "table not found" for an error page.
        response.raise_for_status()

        table = _find_commented_table(response.text, table_id)
        if table is None:
            raise ValueError(f'Table {table_id} not found!')

        # Column labels come from the header rows, skipping the grouping row.
        column_names = []
        for tr in table.find('thead').find_all('tr'):
            if 'over_header' in tr.get('class', []):
                continue
            column_names.extend(th.text.strip() for th in tr.find_all('th'))

        # Drop the first and last header cells. Body rows are read from <td>
        # cells only, so presumably the leading header belongs to a <th> rank
        # cell and the trailing one to a link-only column - TODO confirm
        # against the FBref markup.
        column_names = column_names[1:len(column_names) - 1]
        column_names = _disambiguate_headers(table_id, column_names)

        data = []
        for row in table.find('tbody').find_all('tr'):
            # Skip the header rows repeated inside the table body.
            if row.get('class') and 'thead' in row.get('class'):
                continue
            cells = [td.text.strip() for td in row.find_all('td')]
            if cells:
                data.append(dict(zip(column_names, cells)))

        df = pd.DataFrame(data)

        if index != 0:
            df.drop(["Squad", "Pos", "Nation", "90s", "Age", "Born"], axis=1, inplace=True)

        scrapped_data.append(df)
    return scrapped_data

In [3]:
england_players_scrapped_data = get_data_by_league(player_data["England"])

In [4]:
spain_players_scrapped_data = get_data_by_league(player_data["Spain"])

In [12]:
germany_players_scrapped_data = get_data_by_league(player_data["Germany"])

In [13]:
italy_players_scrapped_data = get_data_by_league(player_data["Italy"])

In [14]:
france_players_scrapped_data = get_data_by_league(player_data["France"])

In [16]:
def _inner_join_on_player(left, right):
    """Inner-join two scraped stat tables on the player's name."""
    return pd.merge(left, right, on='Player', how='inner')


# Fold the stat tables of each league into one wide table per league.
# NOTE(review): the join key is the name only; a name listed more than once in
# a table would be multiplied by every join - confirm this cannot happen.
england_players = reduce(_inner_join_on_player, england_players_scrapped_data)
spain_players = reduce(_inner_join_on_player, spain_players_scrapped_data)
germany_players = reduce(_inner_join_on_player, germany_players_scrapped_data)
italy_players = reduce(_inner_join_on_player, italy_players_scrapped_data)
france_players = reduce(_inner_join_on_player, france_players_scrapped_data)

# Stack the five leagues and persist the result for the following steps.
enhanced_players = pd.concat(
    [england_players, spain_players, germany_players, italy_players, france_players],
    ignore_index=True,
)
enhanced_players.to_csv('data/players_scrapped.csv', index=False)

In [38]:
def get_best_match(name, choices, threshold=90):
    """Return the entry of ``choices`` that best matches ``name``.

    Candidates are scored with ``fuzz.token_sort_ratio``.

    Args:
        name: Name to look up.
        choices: Candidate names.
        threshold: Minimum score the best candidate must reach.

    Returns:
        The best candidate, or None when its score is below ``threshold`` or
        when no candidate could be scored at all.
    """
    best = process.extractOne(name, choices, scorer=fuzz.token_sort_ratio)
    # extractOne gives None instead of a tuple when it has nothing to return
    # (e.g. no candidates); unpacking that would raise TypeError.
    if best is None:
        return None
    # Index instead of unpacking: the result may carry a third element (the
    # key) when `choices` is a mapping.
    match, score = best[0], best[1]
    return match if score >= threshold else None

def save_players_merged(path):
    """Join the players file with the scraped stats through fuzzy name matching.

    Reads ``players.json`` and ``players_scrapped.csv`` from ``path``. Despite
    the name, nothing is written here; the merged table is returned.

    Args:
        path: Directory prefix (including the trailing separator) that is
            prepended to both file names.

    Returns:
        DataFrame with one row per (player, matching scraped row) pair; the
        ``wyId`` column is renamed to ``player_id``. Players without a
        sufficiently similar scraped name are left out.
    """
    source_players = pd.read_json(path_or_buf=path + "players.json")
    scraped_players = pd.read_csv(path + "players_scrapped.csv")

    # Names contain literal unicode escape sequences; turn them into the
    # characters they stand for.
    full_names = source_players['firstName'] + ' ' + source_players['lastName']
    source_players['player_name'] = full_names.apply(
        lambda raw: raw.encode('utf-8').decode('unicode_escape')
    )

    candidates = scraped_players["Player"].tolist()
    source_players["matched_name"] = source_players["player_name"].apply(
        lambda full_name: get_best_match(full_name, candidates)
    )

    # Keep only the players for which a scraped name was found.
    matched_players = source_players.dropna(subset=["matched_name"])

    merged_players = pd.merge(
        matched_players,
        scraped_players,
        left_on="matched_name",
        right_on="Player",
        how="inner",
    )
    return merged_players.rename(columns={'wyId': 'player_id'})

In [39]:
players_merged = save_players_merged("data/")
players_merged.to_csv('data/players_merged.csv', index=False)

### Funções auxiliares

In [2]:
def load_matches(path):
    """Load a matches file as one row per (match, team).

    Args:
        path: Location of the matches JSON file.

    Returns:
        DataFrame holding, for every team entry found in a match's
        ``teamsData`` dictionary, that entry's fields plus a ``matchId``
        column with the match's ``wyId``. The index is renumbered from 0.
    """
    matches = pd.read_json(path_or_buf=path)

    # Each match stores its teams as a dict nested in the 'teamsData' column;
    # expand it to one row per team and remember which match it came from.
    per_match_teams = []
    for match_row in matches.itertuples(index=False):
        teams = pd.DataFrame(match_row.teamsData).T
        teams['matchId'] = match_row.wyId
        per_match_teams.append(teams)

    return pd.concat(per_match_teams).reset_index(drop=True)

def load_players(path):
    """Read ``players_merged.csv`` from the directory prefix ``path``.

    Args:
        path: Directory prefix, including the trailing separator.

    Returns:
        The CSV contents as a DataFrame.
    """
    return pd.read_csv(path + "players_merged.csv")

def load_events(path):
    """Load an events file and prepare its columns for the SPADL conversion.

    Args:
        path: Location of the events JSON file.

    Returns:
        DataFrame with the identifier columns renamed (see the mapping
        below) and two columns appended: ``milliseconds`` (``eventSec``
        times 1000) and ``period_id`` (``matchPeriod`` with '1H' replaced by
        1 and '2H' by 2; other values are kept as they are).
    """
    column_names = {
        'id': 'event_id',
        'eventId': 'type_id',
        'subEventId': 'subtype_id',
        'teamId': 'team_id',
        'playerId': 'player_id',
        'matchId': 'game_id',
    }
    events = pd.read_json(path_or_buf=path).rename(columns=column_names)

    period_numbers = events['matchPeriod'].replace({'1H': 1, '2H': 2})
    return events.assign(
        milliseconds=events['eventSec'] * 1000,
        period_id=period_numbers,
    )

# def load_minutes_played_per_game(path):
#     minutes = pd.read_json(path_or_buf=path)
#     minutes = minutes.rename(columns={
#         'playerId': 'player_id',
#         'matchId': 'game_id',
#         'teamId': 'team_id',
#         'minutesPlayed': 'minutes_played'
#     })
#     minutes = minutes.drop(['shortName', 'teamName', 'red_card'], axis=1)
#     return minutes

In [3]:
# Leagues to process; switch to the commented list to run all five.
leagues = ['England']
# leagues = ['England', 'Spain', 'Germany', 'Italy', 'France']
events = {}
matches = {}
for league in tqdm(leagues):
    path = r'data/matches/matches_{}.json'.format(league)
    matches[league] = load_matches(path)
    path = r'data/events/events_{}.json'.format(league)
    events[league] = load_events(path)

path = r'data/'
players = load_players(path)
# No unicode-escape decoding here: 'player_name' was already decoded by
# save_players_merged before players_merged.csv was written. Decoding the
# already-decoded text a second time would garble every non-ASCII character.

100%|███████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.20s/it]


## Mapeamento para SPADL

In [4]:
def spadl_transform(events, matches):
    """Convert the events of every game to SPADL actions.

    Args:
        events: Events table as returned by ``load_events``.
        matches: Per-team match table as returned by ``load_matches``; it
            provides the home team of each game.

    Returns:
        DataFrame with the actions of all games, in the order the games
        first appear in ``events``, with action names added and a fresh
        0-based index.
    """
    per_game_actions = []
    for game_id in tqdm(events.game_id.unique().tolist()):
        is_home_row = (matches.matchId == game_id) & (matches.side == 'home')
        home_team_id = matches.loc[is_home_row, 'teamId'].values[0]

        game_events = events.loc[events.game_id == game_id]
        actions = spd.wyscout.convert_to_actions(events=game_events, home_team_id=home_team_id)
        actions = spd.play_left_to_right(actions=actions, home_team_id=home_team_id)
        per_game_actions.append(spd.add_names(actions))

    return pd.concat(per_game_actions).reset_index(drop=True)

In [5]:
# One name per player_id. `players` comes from an inner join on fuzzy-matched
# names, so it may hold several rows for the same player_id; merging those
# into the actions would duplicate action rows.
player_names = players[['player_id', 'player_name']].drop_duplicates(subset='player_id')

spadl = {}
for league in leagues:
    spadl[league] = spadl_transform(events=events[league], matches=matches[league])
    # Attach the player names; actions of unmatched players keep a missing name.
    spadl[league] = spadl[league].merge(player_names, on='player_id', how='left')

100%|███████████████████████████████████████████████| 380/380 [02:41<00:00,  2.35it/s]


## Análise exploratória

In [None]:
spadl['England']

In [None]:
spadl['Spain']

In [None]:
spadl['Germany']

In [None]:
spadl['Italy']

In [None]:
spadl['France']

### Players

In [None]:
players_england = spadl['England']['player_name'].unique()
print("Número de jogadores na Premier League:", len(players_england))
players_england

In [None]:
players_spain = spadl['Spain']['player_name'].unique()
print("Número de jogadores na La Liga:", len(players_spain))
players_spain

In [None]:
players_germany = spadl['Germany']['player_name'].unique()
print("Número de jogadores na Bundesliga:", len(players_germany))
players_germany

In [None]:
players_italy = spadl['Italy']['player_name'].unique()
print("Número de jogadores na Serie A:", len(players_italy))
players_italy

In [None]:
players_france = spadl['France']['player_name'].unique()
print("Número de jogadores na Ligue 1:", len(players_france))
players_france

### Número médio de ações por jogo em cada liga

Observações
- Mais ou menos o mesmo número médio de ações por jogo.

In [None]:
spadl['England'].groupby('game_id').size().mean()

In [None]:
spadl['Spain'].groupby('game_id').size().mean()

In [None]:
spadl['Germany'].groupby('game_id').size().mean()

In [None]:
spadl['Italy'].groupby('game_id').size().mean()

In [None]:
spadl['France'].groupby('game_id').size().mean()

### Distribuição dos Tipos de Ação

Histograma com a frequência de cada tipo de ação existente nos dados analisados.

Observações:
- Passe é a ação mais comum;
- Alternância, a depender da liga, entre interceptações e bolas carregadas como segundo tipo de ação mais realizada;
- Ocorrências menos frequentes de lançamentos, cruzamentos, "chutões" e desarmes.

In [None]:
def plot_action_counts(action_counts):
    """Show a bar chart of how often each action type occurs.

    Args:
        action_counts: Series of counts indexed by action type (the notebook
            passes ``value_counts()`` of the ``type_name`` column).
    """
    plt.figure(figsize=(10, 6))
    axes = action_counts.plot.bar(color='skyblue')
    axes.set_title('Distribuição dos Tipos de Ação')
    axes.set_xlabel('Tipo de Ação')
    axes.set_ylabel('Frequência')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
plot_action_counts(spadl['England']['type_name'].value_counts())

In [None]:
plot_action_counts(spadl['Spain']['type_name'].value_counts())

In [None]:
plot_action_counts(spadl['Germany']['type_name'].value_counts())

In [None]:
plot_action_counts(spadl['Italy']['type_name'].value_counts())

In [None]:
plot_action_counts(spadl['France']['type_name'].value_counts())

### Ações por Jogador

Top 10 jogadores com mais ações registradas na liga.

Observações:
- Os jogadores mais participativos são, em geral, meio campistas ou jogadores muito importantes do time;
- O número de ações parece flutuar pouco.

In [None]:
def plot_top_active_players(top_players):
    """Show a bar chart with the number of actions of the given players.

    Args:
        top_players: Series of action counts indexed by player name (the
            notebook passes the ten largest ``value_counts()`` entries).
    """
    _, ax = plt.subplots(figsize=(10, 6))
    top_players.plot(kind='bar', color='lightgreen', ax=ax)
    ax.set_title('Top 10 Jogadores por Número de Ações')
    ax.set_xlabel('Jogador')
    ax.set_ylabel('Número de Ações')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
plot_top_active_players(spadl['England']['player_name'].value_counts().head(10))

In [None]:
plot_top_active_players(spadl['Spain']['player_name'].value_counts().head(10))

In [None]:
plot_top_active_players(spadl['Germany']['player_name'].value_counts().head(10))

In [None]:
plot_top_active_players(spadl['Italy']['player_name'].value_counts().head(10))

In [None]:
plot_top_active_players(spadl['France']['player_name'].value_counts().head(10))

### Mapa de Calor dos Passes

Mapa de calor dos passes em função das áreas do campo.

Observações:
- A troca de passes nas 5 ligas segue mais ou menos a mesma distribuição pelo campo;
- Pouca troca de passe na área adversária;
- Parece comum a existência de um jogador no meio do campo que distribui e conecta a bola entre a defesa e o ataque.

In [None]:
def plot_pass_heatmap(passes):
    """Show a 24 x 16 grid heat map of where passes start.

    Args:
        passes: Actions table with ``start_x`` and ``start_y`` columns; the
            grid spans 0-105 along x and 0-68 along y.
    """
    cells_along_length, cells_along_width = 24, 16
    length_edges = np.linspace(0, 105, cells_along_length + 1)
    width_edges = np.linspace(0, 68, cells_along_width + 1)

    # Number of passes starting in each grid cell (rows follow y, columns x).
    pass_counts = np.histogram2d(
        passes['start_y'], passes['start_x'], bins=[width_edges, length_edges]
    )[0]

    fig, ax = plt.subplots(figsize=(12, 7))
    image = ax.imshow(pass_counts, cmap='Blues', origin='lower',
                      extent=[0, 105, 0, 68], aspect='auto')

    # Grid lines on the cell borders.
    for edge in length_edges:
        ax.axvline(edge, color='gray', linewidth=0.5)
    for edge in width_edges:
        ax.axhline(edge, color='gray', linewidth=0.5)

    ax.set_title('Mapa de Calor dos Passes')
    ax.set_xlabel('Comprimento do Campo')
    ax.set_ylabel('Largura do Campo')
    fig.colorbar(image, ax=ax, label='Número de Passes')
    plt.show()

In [None]:
england_passes = spadl['England'][spadl['England']['type_name'] == 'pass']
plot_pass_heatmap(england_passes)

In [None]:
spain_passes = spadl['Spain'][spadl['Spain']['type_name'] == 'pass']
plot_pass_heatmap(spain_passes)

In [None]:
germany_passes = spadl['Germany'][spadl['Germany']['type_name'] == 'pass']
plot_pass_heatmap(germany_passes)

In [None]:
italy_passes = spadl['Italy'][spadl['Italy']['type_name'] == 'pass']
plot_pass_heatmap(italy_passes)

In [None]:
france_passes = spadl['France'][spadl['France']['type_name'] == 'pass']
plot_pass_heatmap(france_passes)

### Sequência: Ações que Levaram a Gol

Histograma com a frequência de cada tipo de ação executada que precedeu um gol.

Observações:
- Passes e cruzamentos lideram como ações que precedem gols;
- Bolas carregadas não resultam em tantos gols;
- Na La Liga, rebotes tendem a gerar mais gols do que escanteios.

In [None]:
def plot_action_sequences(action_sequences):
    """Show a bar chart of the action types recorded right before a goal.

    Args:
        action_sequences: Series of counts indexed by action type.
    """
    plt.figure(figsize=(10, 6))
    ax = action_sequences.plot(kind='bar', color='orange')
    ax.set(
        title='Tipo de Ação Imediatamente Antes do Gol (feito a partir de um chute)',
        xlabel='Tipo de Ação',
        ylabel='Frequência',
    )
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
goal_indices = spadl['England'][(spadl['England']['type_name'] == 'shot') & (spadl['England']['result_name'] == 'success')].index
pre_goal_actions = spadl['England'].loc[goal_indices - 1]
action_sequences = pre_goal_actions['type_name'].value_counts()
plot_action_sequences(action_sequences)

In [None]:
goal_indices = spadl['Spain'][(spadl['Spain']['type_name'] == 'shot') & (spadl['Spain']['result_name'] == 'success')].index
pre_goal_actions = spadl['Spain'].loc[goal_indices - 1]
action_sequences = pre_goal_actions['type_name'].value_counts()
plot_action_sequences(action_sequences)

In [None]:
goal_indices = spadl['Germany'][(spadl['Germany']['type_name'] == 'shot') & (spadl['Germany']['result_name'] == 'success')].index
pre_goal_actions = spadl['Germany'].loc[goal_indices - 1]
action_sequences = pre_goal_actions['type_name'].value_counts()
plot_action_sequences(action_sequences)

In [None]:
goal_indices = spadl['Italy'][(spadl['Italy']['type_name'] == 'shot') & (spadl['Italy']['result_name'] == 'success')].index
pre_goal_actions = spadl['Italy'].loc[goal_indices - 1]
action_sequences = pre_goal_actions['type_name'].value_counts()
plot_action_sequences(action_sequences)

In [None]:
goal_indices = spadl['France'][(spadl['France']['type_name'] == 'shot') & (spadl['France']['result_name'] == 'success')].index
pre_goal_actions = spadl['France'].loc[goal_indices - 1]
action_sequences = pre_goal_actions['type_name'].value_counts()
plot_action_sequences(action_sequences)

### Mapa de Calor de Finalizações

Mapa de calor com as finalizações realizadas na liga.

Observações:
- As 5 ligas seguem o mesmo padrão de finalizações.

In [None]:
def plot_shot_heatmap(shots):
    """Show a 24 x 16 grid heat map of where shots are taken from.

    Args:
        shots: Actions table with ``start_x`` and ``start_y`` columns; the
            grid spans 0-105 along x and 0-68 along y.
    """
    cells_along_length, cells_along_width = 24, 16
    length_edges = np.linspace(0, 105, cells_along_length + 1)
    width_edges = np.linspace(0, 68, cells_along_width + 1)

    # Number of shots starting in each grid cell (rows follow y, columns x).
    shot_counts, _, _ = np.histogram2d(
        shots['start_y'], shots['start_x'], bins=[width_edges, length_edges]
    )

    fig, ax = plt.subplots(figsize=(12, 7))
    image = ax.imshow(shot_counts, cmap='Reds', origin='lower',
                      extent=[0, 105, 0, 68], aspect='auto')

    # Grid lines on the cell borders.
    for edge in length_edges:
        ax.axvline(edge, color='gray', linewidth=0.5)
    for edge in width_edges:
        ax.axhline(edge, color='gray', linewidth=0.5)

    ax.set_title('Mapa de Calor de Finalizações')
    ax.set_xlabel('Comprimento do Campo')
    ax.set_ylabel('Largura do Campo')
    fig.colorbar(image, ax=ax, label='Número de Chutes')
    plt.show()

In [None]:
plot_shot_heatmap(spadl['England'][spadl['England']['type_name'] == 'shot'])

In [None]:
plot_shot_heatmap(spadl['Spain'][spadl['Spain']['type_name'] == 'shot'])

In [None]:
plot_shot_heatmap(spadl['Germany'][spadl['Germany']['type_name'] == 'shot'])

In [None]:
plot_shot_heatmap(spadl['Italy'][spadl['Italy']['type_name'] == 'shot'])

In [None]:
plot_shot_heatmap(spadl['France'][spadl['France']['type_name'] == 'shot'])

### Mapa de Assistências

Mapa com as setas que indicam passes que antecedem um chute.

Observações:
- Apenas uma amostra das assistências foi plotada, para evitar poluição visual;
- As assistências precedem chutes contra o gol adversário, majoritariamente;
- Os chutes em sequência partem principalmente da grande área do goleiro adversário.

In [None]:
def plot_assists_heatmap(assists):
    """Draw every given pass as an arrow from its start to its end point.

    Despite the name, this is an arrow plot rather than a heat map.

    Args:
        assists: Actions table with ``start_x``, ``start_y``, ``end_x`` and
            ``end_y`` columns.
    """
    plt.figure(figsize=(12, 7))

    # Arrow displacement = end point minus start point.
    delta_x = assists['end_x'] - assists['start_x']
    delta_y = assists['end_y'] - assists['start_y']
    for x0, y0, dx, dy in zip(assists['start_x'], assists['start_y'], delta_x, delta_y):
        plt.arrow(x0, y0, dx, dy,
                  head_width=1, head_length=1, color='green', alpha=0.5)

    plt.title('Passes que Antecederam Chutes')
    plt.xlim(0, 105)
    plt.ylim(0, 68)
    plt.gca().set_aspect('equal')
    plt.show()

In [None]:
shot_idx = spadl['England'][spadl['England']['type_name'] == 'shot'].index
assist_idx = shot_idx - 1
assists = spadl['England'].loc[assist_idx][spadl['England'].loc[assist_idx]['type_name'] == 'pass']
plot_assists_heatmap(assists.head(300))

In [None]:
shot_idx = spadl['Spain'][spadl['Spain']['type_name'] == 'shot'].index
assist_idx = shot_idx - 1
assists = spadl['Spain'].loc[assist_idx][spadl['Spain'].loc[assist_idx]['type_name'] == 'pass']
plot_assists_heatmap(assists.head(300))

In [None]:
shot_idx = spadl['Germany'][spadl['Germany']['type_name'] == 'shot'].index
assist_idx = shot_idx - 1
assists = spadl['Germany'].loc[assist_idx][spadl['Germany'].loc[assist_idx]['type_name'] == 'pass']
plot_assists_heatmap(assists.head(300))

In [None]:
shot_idx = spadl['Italy'][spadl['Italy']['type_name'] == 'shot'].index
assist_idx = shot_idx - 1
assists = spadl['Italy'].loc[assist_idx][spadl['Italy'].loc[assist_idx]['type_name'] == 'pass']
plot_assists_heatmap(assists.head(300))

In [None]:
shot_idx = spadl['France'][spadl['France']['type_name'] == 'shot'].index
assist_idx = shot_idx - 1
assists = spadl['France'].loc[assist_idx][spadl['France'].loc[assist_idx]['type_name'] == 'pass']
plot_assists_heatmap(assists.head(300))

In [None]:
!pip install matplotsoccer

In [None]:
# 1) Imports e seleção aleatória de uma partida por liga
import random
import scipy.ndimage
import matplotlib.pyplot as plt
import matplotsoccer as mps

def select_random_games(spadl_dict):
    """Pick one random game per league.

    Args:
        spadl_dict: Mapping of league name to its actions table, which must
            have a ``game_id`` column.

    Returns:
        Dict ``{league: game_id}`` with one randomly chosen game of each
        league.
    """
    chosen = {}
    for league, actions in spadl_dict.items():
        game_ids = actions['game_id'].unique().tolist()
        chosen[league] = random.choice(game_ids)
    return chosen

In [None]:
def plot_buildup_last_events(spadl_dict, games_dict, last_n=10):
    """Plot the actions leading up to the first goal (or shot) of each game.

    For every ``(league, game_id)`` pair, the first goal of the game is
    chosen, or the first shot when nobody scored. The ``last_n`` actions that
    precede it in the same period are drawn on a pitch, joined by lines, and
    the shot itself is drawn in red.

    Args:
        spadl_dict: Mapping of league name to its SPADL actions table.
        games_dict: Mapping of league name to the game to plot.
        last_n: How many preceding actions to draw.
    """
    for liga, gid in games_dict.items():
        df = spadl_dict[liga]
        # SPADL's time_seconds is measured from the start of each period, so
        # ordering by it alone would interleave both halves.
        sub = df[df['game_id'] == gid].sort_values(['period_id', 'time_seconds'])

        # 1) choose the final event
        shots = sub[sub['type_name'].str.lower() == 'shot']
        if shots.empty:
            # Nothing to anchor the sequence on; skip instead of crashing.
            print(f"{liga} • Partida {gid}: nenhuma finalização encontrada")
            continue
        goals = shots[shots['result_name'].str.lower() == 'success']
        evt = goals.iloc[0] if not goals.empty else shots.iloc[0]

        # 2) keep only the earlier events of the same period, capped at last_n
        same_period = sub['period_id'] == evt['period_id']
        before = sub[same_period & (sub['time_seconds'] < evt['time_seconds'])]
        last_events = before.tail(last_n)
        events_to_plot = pd.concat([last_events, pd.DataFrame([evt])], ignore_index=True)
        events_to_plot = events_to_plot.dropna(subset=['start_x', 'start_y'])

        # 3) draw the pitch and get its axes
        ax = mps.field('green', figsize=8, show=False)

        # 4) extract the coordinates
        xs = events_to_plot['start_x'].tolist()
        ys = events_to_plot['start_y'].tolist()

        # 5) plot the preceding events (circles) and join them with lines
        ax.scatter(xs[:-1], ys[:-1], s=80, c='blue', zorder=3)
        for i in range(len(xs) - 1):
            ax.plot([xs[i], xs[i + 1]], [ys[i], ys[i + 1]],
                    color='blue', linewidth=2, alpha=0.7, zorder=2)

        # 6) plot the shot: an arrow to its end point when one is available
        x0, y0 = evt['start_x'], evt['start_y']
        x1, y1 = evt.get('end_x', None), evt.get('end_y', None)
        if pd.notna(x1) and pd.notna(y1):
            ax.annotate('', xy=(x1, y1), xytext=(x0, y0),
                        arrowprops=dict(color='red', width=2,
                                        headwidth=8, headlength=8),
                        zorder=4)
            ax.scatter([x1], [y1], s=150, marker='X',
                       c='red', zorder=5, label='Finalização')
        else:
            ax.scatter([x0], [y0], s=150, marker='X',
                       c='red', zorder=5, label='Shot')

        ax.set_title(
            f"{liga} • Partida {gid} • Últimos {last_n} eventos + "
            f"{'Gol' if evt['result_name'].lower()=='success' else 'Shot'}"
        )
        ax.legend(loc='upper left', fontsize='small')
        plt.show()


In [None]:
def plot_attack_heatmap(spadl_dict, games_dict, bins=25):
    """Plot a smoothed heat map of one team's action locations per game.

    For every ``(league, game_id)`` pair, the team that took the first goal
    of the game (or the first shot when nobody scored) is selected, and the
    start positions of all of that team's actions are counted on a
    ``bins`` x ``bins`` grid, smoothed and drawn over a pitch. A label with
    the league and the game is placed in the top-left corner.

    Args:
        spadl_dict: Mapping of league name to its SPADL actions table.
        games_dict: Mapping of league name to the game to plot.
        bins: Number of grid cells along each axis.
    """
    import scipy.ndimage

    for liga, gid in games_dict.items():
        league_actions = spadl_dict[liga]
        game_actions = league_actions[league_actions['game_id'] == gid]

        # Reference event: first goal, falling back to the first shot.
        shots = game_actions[game_actions['type_name'].str.lower() == 'shot']
        scored = shots[shots['result_name'].str.lower() == 'success']
        if scored.empty:
            evt = shots.iloc[0]
        else:
            evt = scored.iloc[0]

        team_id = evt['team_id']
        team_act = game_actions[game_actions['team_id'] == team_id]

        # Draw the pitch and get its axes.
        ax = mps.field('green', figsize=8, show=False)

        # Count the actions per cell, then smooth the grid.
        raw_counts = mps.count(team_act['start_x'], team_act['start_y'], n=bins, m=bins)
        smoothed = scipy.ndimage.gaussian_filter(raw_counts, sigma=1)

        # Draw the heat map on top of the pitch.
        mps.heatmap(smoothed, cmap='Reds', linecolor='white', cbar=True, ax=ax)

        # Label positioned in axes-relative coordinates (0 to 1).
        ax.text(
            0.02, 0.98,
            f"Liga: {liga}   Partida: {gid}",
            transform=ax.transAxes,
            ha='left', va='top',
            color='white',
            fontsize=12,
            backgroundcolor='black',
            alpha=0.6
        )

        ax.set_title(f"Heatmap de ataques (time {team_id})", pad=20)
        plt.show()



In [None]:
games = select_random_games(spadl)
print("Partidas selecionadas:", games)

plot_buildup_last_events(spadl, games, last_n=10)
plot_attack_heatmap(spadl, games, bins=30)

## Paper - Treinamento do VAEP e cálculo do JOI e JDI

In [51]:
from socceraction.vaep import features as ft
import socceraction.vaep.labels as lab
import socceraction.vaep.formula as fm

from sklearn.model_selection import train_test_split
import xgboost as xgb
import sklearn.metrics as mt

In [52]:
def features_transform(spadl):
    """Build the VAEP feature table for every action.

    NOTE: ``spadl`` is modified in place before the features are computed:
    rows with ``result_id`` 2 or 3 get ``result_id`` 0, and rows whose
    ``result_name`` is 'offside' or 'owngoal' get 'fail'. Anything that uses
    the same table afterwards sees the collapsed results.

    Args:
        spadl: SPADL actions table of one or more games.

    Returns:
        DataFrame with the feature columns of all games (games in ascending
        ``game_id`` order) and a fresh 0-based index.
    """
    spadl.loc[spadl.result_id.isin([2, 3]), ['result_id']] = 0
    spadl.loc[spadl.result_name.isin(['offside', 'owngoal']), ['result_name']] = 'fail'

    feature_builders = (
        ft.actiontype_onehot,
        ft.bodypart_onehot,
        ft.result_onehot,
        ft.goalscore,
        ft.startlocation,
        ft.endlocation,
        ft.team,
        ft.time,
        ft.time_delta,
    )

    per_game_features = []
    for game_id in tqdm(np.unique(spadl.game_id).tolist()):
        game_actions = spadl.loc[spadl.game_id == game_id].reset_index(drop=True)
        game_states = ft.gamestates(actions=game_actions)
        per_game_features.append(
            pd.concat([build(game_states) for build in feature_builders], axis=1)
        )

    return pd.concat(per_game_features).reset_index(drop=True)

def labels_transform(spadl):
    """Build the VAEP labels ('scores' and 'concedes') for every action.

    Args:
        spadl: SPADL actions table of one or more games.

    Returns:
        DataFrame with the label columns of all games (games in ascending
        ``game_id`` order) and a fresh 0-based index.
    """
    label_builders = (lab.scores, lab.concedes)

    per_game_labels = []
    for game_id in tqdm(np.unique(spadl.game_id).tolist()):
        game_actions = spadl.loc[spadl.game_id == game_id].reset_index(drop=True)
        game_labels = pd.concat(
            [build(actions=game_actions) for build in label_builders], axis=1
        )
        per_game_labels.append(game_labels)

    return pd.concat(per_game_labels).reset_index(drop=True)

def train_vaep(X_train, y_train, X_test, y_test):
    """Train the 'scores' and 'concedes' classifiers and report their fit.

    For each target, an XGBoost classifier is fitted on the training data and
    its normalised Brier score (NBS) is printed for the training and the test
    data. The NBS is the model's Brier score divided by the Brier score of
    always predicting the positive rate of the evaluated set.

    Args:
        X_train: Training features.
        y_train: Training labels with 'scores' and 'concedes' columns.
        X_test: Test features.
        y_test: Test labels with 'scores' and 'concedes' columns.

    Returns:
        Dict mapping 'scores' and 'concedes' to the fitted classifier.
    """
    def normalized_brier(model, X, y):
        # Baseline: predict the set's own positive rate for every row.
        positive_rate = sum(y) / len(y)
        baseline = [positive_rate] * len(y)
        predicted = model.predict_proba(X)[:, 1]
        return mt.brier_score_loss(y, predicted) / mt.brier_score_loss(y, baseline)

    models = {}
    for m in ['scores', 'concedes']:
        models[m] = xgb.XGBClassifier(random_state=0, n_estimators=50, max_depth=3)

        print('training ' + m + ' model')
        models[m].fit(X_train, y_train[m])

        train_brier = normalized_brier(models[m], X_train, y_train[m])
        print(m + ' Train NBS: ' + str(train_brier))
        print()

        test_brier = normalized_brier(models[m], X_test, y_test[m])
        print(m + ' Test NBS: ' + str(test_brier))
        print()

        print('----------------------------------------')

    return models

def generate_predictions(features, models):
    """Predict the scoring and conceding probability of every action.

    Args:
        features: Feature table to predict on.
        models: Mapping with a fitted classifier under 'scores' and
            'concedes'; each must provide ``predict_proba``.

    Returns:
        DataFrame with 'scores' and 'concedes' columns holding the
        probability of the positive class (second ``predict_proba`` column).
    """
    return pd.DataFrame({
        target: models[target].predict_proba(features)[:, 1]
        for target in ('scores', 'concedes')
    })

def calculate_action_values(spadl, predictions):
    """Attach model probabilities and action values to a slice of ``spadl``.

    ``fm.value`` is called with the actions and the ``'scores'`` /
    ``'concedes'`` columns of ``predictions``. Its output is concatenated
    column-wise after a fixed subset of ``spadl`` columns and after
    ``predictions`` with its columns renamed to ``Pscores`` / ``Pconcedes``.

    NOTE(review): the column-wise concat lines rows up by index, so ``spadl``
    and ``predictions`` presumably need to share the same index - confirm
    against the caller.
    """
    values = fm.value(actions=spadl, Pscores=predictions['scores'], Pconcedes=predictions['concedes'])

    kept_columns = [
        'original_event_id', 'action_id', 'game_id',
        'start_x', 'start_y', 'end_x', 'end_y',
        'type_name', 'result_name', 'player_id',
    ]
    probabilities = predictions.rename(columns={'scores': 'Pscores', 'concedes': 'Pconcedes'})

    return pd.concat([spadl[kept_columns], probabilities, values], axis=1)

In [53]:
# Build the feature and label tables for each league.
features, labels = {}, {}
for league in leagues:
    # The period_id columns are cast to int right after the transform.
    features[league] = features_transform(spadl[league]).astype({
        "period_id_a0": int,
        "period_id_a1": int,
        "period_id_a2": int,
    })
    labels[league] = labels_transform(spadl[league])

100%|███████████████████████████████████████████████| 380/380 [00:16<00:00, 23.09it/s]
100%|███████████████████████████████████████████████| 380/380 [00:18<00:00, 20.07it/s]


In [66]:
# Pool the per-league tables into one feature frame and one label frame.
features_concat = pd.concat(list(features.values()), ignore_index=True)
labels_concat = pd.concat(list(labels.values()), ignore_index=True)

# Hold out 20% of the rows with a fixed seed, then fit the two classifiers.
# NOTE(review): the split is over individual rows, so actions from the same
# game can end up in both train and test - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features_concat,
    labels_concat,
    test_size=0.2,
    random_state=42,
)
models = train_vaep(X_train, y_train, X_test, y_test)

training scores model
scores Train NBS: 0.8070185437562029

scores Test NBS: 0.8066906690512291

----------------------------------------
training concedes model
concedes Train NBS: 0.9591881632001684

concedes Test NBS: 0.966907347830372

----------------------------------------


In [67]:
# Score every league's feature table with the trained models.
preds = {
    league: generate_predictions(features=features[league], models=models)
    for league in leagues
}

In [68]:
# Combine each league's actions with its predictions into an action-value table.
action_values = {
    league: calculate_action_values(spadl=spadl[league], predictions=preds[league])
    for league in leagues
}

In [69]:
# Show the action-value table stored under the "England" key.
action_values["England"]

Unnamed: 0,original_event_id,action_id,game_id,start_x,start_y,end_x,end_y,type_name,result_name,player_id,Pscores,Pconcedes,offensive_value,defensive_value,vaep_value
0,177959171,0,2499719,51.45,34.68,32.55,14.96,pass,success,25413,0.000675,0.000127,0.000000,-0.000000,0.000000
1,177959172,1,2499719,32.55,14.96,53.55,17.00,pass,success,370224,0.003925,0.000683,0.003250,-0.000556,0.002693
2,177959173,2,2499719,53.55,17.00,36.75,19.72,pass,success,3319,0.002881,0.000905,-0.001044,-0.000222,-0.001265
3,177959174,3,2499719,36.75,19.72,43.05,3.40,pass,success,120339,0.003034,0.000719,0.000153,0.000186,0.000340
4,177959175,4,2499719,43.05,3.40,75.60,8.16,pass,success,167145,0.005474,0.000593,0.002440,0.000126,0.002565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610378,251596226,1136,2500098,55.65,7.48,103.95,19.04,pass,success,20620,0.081724,0.001691,0.064737,-0.000010,0.064726
610379,251596229,1137,2500098,103.95,19.04,103.95,19.04,cross,fail,14703,0.032199,0.004769,-0.049525,-0.003078,-0.052603
610380,251596408,1138,2500098,2.10,46.92,0.00,46.24,interception,success,8239,0.004789,0.036650,0.000020,-0.004450,-0.004430
610381,251596232,1139,2500098,105.00,0.00,92.40,36.04,corner_crossed,success,70965,0.059718,0.003889,0.013218,-0.003889,0.009330


In [11]:
# Action types kept when looking for interactions between two players.
# SPADL spells the take-on type 'take_on' (underscore); the earlier 'take-on'
# spelling never matched a row, so take-ons were silently excluded.
desired_actions = ['pass', 'cross', 'dribble', 'take_on', 'shot']


def get_interactions(actions, game_id, player_before, player_after):
    """Return consecutive action pairs in which one player directly precedes another.

    The actions of ``game_id`` whose ``type_name`` is in ``desired_actions``
    are ordered by ``period_id`` and ``time_seconds``. Every adjacent pair in
    that ordering where the first action belongs to ``player_before`` and the
    second to ``player_after`` counts as one interaction.

    Parameters:
        actions: DataFrame with at least the columns ``game_id``,
            ``type_name``, ``period_id``, ``time_seconds`` and ``player_id``.
        game_id: value of ``game_id`` selecting the match.
        player_before: ``player_id`` of the player performing the first action.
        player_after: ``player_id`` of the player performing the second action.

    Returns:
        A list of ``(current_action, next_action)`` tuples, each element being
        a row (``pd.Series``) of the ordered frame. The list is empty when the
        game has no matching pair, including when the game has no rows.

    NOTE(review): adjacency is checked on the whole ordered game, so the last
    kept action of one period and the first of the next can form a pair -
    confirm whether that is desired.
    """
    game_actions = actions[actions['game_id'] == game_id]
    relevant = game_actions[game_actions['type_name'].isin(desired_actions)]
    ordered = relevant.sort_values(by=['period_id', 'time_seconds']).reset_index(drop=True)

    # Scan plain player ids first and build row objects only for real matches;
    # calling .iloc on every row is by far the slowest part of the scan.
    players = ordered['player_id'].tolist()
    return [
        (ordered.iloc[position], ordered.iloc[position + 1])
        for position, (first, second) in enumerate(zip(players, players[1:]))
        if first == player_before and second == player_after
    ]

def extended_vaep(interaction):
    """Return the summed ``vaep_value`` of the two actions in ``interaction``.

    ``interaction`` must unpack into exactly two items, each of which can be
    indexed with ``"vaep_value"``.
    """
    first_action, second_action = interaction
    first_value = first_action["vaep_value"]
    second_value = second_action["vaep_value"]
    return first_value + second_value

In [None]:
def joint_offensive_impact(actions, game_id, p, q):
    """Sum ``extended_vaep`` over the interactions between ``p`` and ``q`` in a game.

    Interactions are collected with ``get_interactions`` in both directions
    (``p`` followed by ``q``, and ``q`` followed by ``p``). Each direction is
    totalled separately and the two totals are added. The result is ``0``
    when neither direction has any interaction.
    """
    both_directions = [
        get_interactions(actions, game_id, before, after)
        for before, after in ((p, q), (q, p))
    ]

    totals = []
    for pairs in both_directions:
        subtotal = 0
        for pair in pairs:
            subtotal += extended_vaep(pair)
        totals.append(subtotal)

    forward_total, backward_total = totals
    return forward_total + backward_total

### Teste JOI

In [None]:
def offensive_impact():
    """Placeholder with no implementation yet; takes no arguments and returns ``None``."""
    return None

In [None]:
def expected_offensive_impact():
    """Placeholder with no implementation yet; takes no arguments and returns ``None``."""
    return None

In [None]:
def responsability():
    """Placeholder with no implementation yet; takes no arguments and returns ``None``."""
    return None

In [None]:
def joint_defensive_impact():
    """Placeholder with no implementation yet; takes no arguments and returns ``None``."""
    return None