# Análise e Predição de Química entre Jogadores de Futebol

Grupo: Gabriel Castelo, Matheus Vaz, Victor Kenji e Vinicius Gomes

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import socceraction.spadl as spd
pd.set_option('future.no_silent_downcasting', True)

In [None]:
def load_matches(path):
    """Load a matches JSON file and return one row per team per match.

    Each record's 'teamsData' value is a dictionary holding the information
    of the teams that played the match. It is expanded with
    ``pd.DataFrame(...).T``, so every key of that dictionary becomes a row
    and the per-team fields become columns. The record's 'wyId' is attached
    to those rows as 'matchId'.

    Args:
        path: Anything ``pd.read_json`` accepts as ``path_or_buf``.

    Returns:
        A DataFrame with the team rows of every match stacked together and
        a fresh 0..n-1 index.
    """
    matches = pd.read_json(path_or_buf=path)
    team_matches = []
    # Walk both columns in lockstep instead of indexing by position, so the
    # loop does not depend on the frame having a default integer index.
    for teams_data, match_id in zip(matches['teamsData'], matches['wyId']):
        match = pd.DataFrame(teams_data).T
        match['matchId'] = match_id
        team_matches.append(match)
    return pd.concat(team_matches).reset_index(drop=True)

def load_players(path):
    """Load the players JSON file as a two-column id/name lookup table.

    Args:
        path: Anything ``pd.read_json`` accepts as ``path_or_buf``.

    Returns:
        A DataFrame with columns 'player_id' (the file's 'wyId') and
        'player_name' ('firstName' and 'lastName' joined by a space).
    """
    raw = pd.read_json(path_or_buf=path)
    full_names = raw['firstName'] + ' ' + raw['lastName']
    named = raw.assign(player_name=full_names)
    # Keep only the identifier and the display name, then expose the
    # identifier under the column name used by the rest of the notebook.
    lookup = named[['wyId', 'player_name']]
    return lookup.rename(columns={'wyId': 'player_id'})

def load_events(path):
    """Load an events JSON file and pre-process it for the SPADL conversion.

    The identifier columns are renamed to snake_case names, and two columns
    are added: 'milliseconds' ('eventSec' times 1000) and 'period_id'
    ('matchPeriod' with '1H' replaced by 1 and '2H' by 2; any other value
    is left as it is).

    Args:
        path: Anything ``pd.read_json`` accepts as ``path_or_buf``.

    Returns:
        The renamed events DataFrame with the two extra columns appended.
    """
    column_names = {
        'id': 'event_id',
        'eventId': 'type_id',
        'subEventId': 'subtype_id',
        'teamId': 'team_id',
        'playerId': 'player_id',
        'matchId': 'game_id'
    }
    period_codes = {'1H': 1, '2H': 2}

    renamed = pd.read_json(path_or_buf=path).rename(columns=column_names)
    return renamed.assign(
        milliseconds=renamed['eventSec'] * 1000,
        period_id=renamed['matchPeriod'].replace(period_codes),
    )

# def load_minutes_played_per_game(path):
#     minutes = pd.read_json(path_or_buf=path)
#     minutes = minutes.rename(columns={
#         'playerId': 'player_id',
#         'matchId': 'game_id',
#         'teamId': 'team_id',
#         'minutesPlayed': 'minutes_played'
#     })
#     minutes = minutes.drop(['shortName', 'teamName', 'red_card'], axis=1)
#     return minutes

In [None]:
# Leagues whose match and event files are loaded below.
leagues = ['England', 'Spain', 'Germany', 'Italy']
matches = {}
events = {}
# minutes = {}
for league in tqdm(leagues):
    # One matches file and one events file per league, stored under the league name.
    matches[league] = load_matches(r'data/matches/matches_{}.json'.format(league))
    events[league] = load_events(r'data/events/events_{}.json'.format(league))
    # minutes[league] = load_minutes_played_per_game(r'data/minutes_played_per_game_{}.json'.format(league))

path = r'data/players.json'
players = load_players(path)
# NOTE(review): presumably the names in the file carry literal escape
# sequences (e.g. \u00e9) that need decoding; confirm against the data.
decoded_names = players['player_name'].str.decode('unicode-escape')
players = players.assign(player_name=decoded_names)

## Mapeamento para SPADL

In [None]:
def spadl_transform(events, matches):
    """Convert an events table to SPADL actions, one game at a time.

    The events of every distinct ``game_id`` are passed through
    ``spd.wyscout.convert_to_actions``, ``spd.play_left_to_right`` and
    ``spd.add_names``, using the team whose ``side`` is 'home' in
    ``matches`` as the home team of that game.

    Args:
        events: Events DataFrame with a ``game_id`` column (the output of
            ``load_events``).
        matches: Per-team match DataFrame with ``matchId``, ``side`` and
            ``teamId`` columns (the output of ``load_matches``).

    Returns:
        The actions of all games concatenated in order of first appearance
        of each game in ``events``, with a fresh 0..n-1 index.

    Raises:
        IndexError: If a game present in ``events`` has no 'home' row in
            ``matches``.
    """
    # Resolve the home team of every game once, rather than re-filtering
    # `matches` inside the loop. The first 'home' row wins if a game
    # appears more than once.
    home_rows = matches.loc[matches.side == 'home'].drop_duplicates(subset='matchId')
    home_team_ids = dict(zip(home_rows.matchId, home_rows.teamId))

    spadl = []
    # groupby splits the events in a single pass; masking the full table
    # once per game would scan every event for every game. sort=False keeps
    # the games in order of first appearance.
    for g, match_events in tqdm(events.groupby('game_id', sort=False)):
        if g not in home_team_ids:
            raise IndexError('no home team found in matches for game_id {}'.format(g))
        match_home_id = home_team_ids[g]
        match_actions = spd.wyscout.convert_to_actions(events=match_events, home_team_id=match_home_id)
        match_actions = spd.play_left_to_right(actions=match_actions, home_team_id=match_home_id)
        match_actions = spd.add_names(match_actions)
        spadl.append(match_actions)
    return pd.concat(spadl).reset_index(drop=True)

In [None]:
spadl = {}
# Only the id/name pair is needed to label each action with its player.
player_names = players[['player_id', 'player_name']]
for league in leagues:
    league_actions = spadl_transform(events=events[league], matches=matches[league])
    # Left join keeps every action, including those whose player_id has no match in `players`.
    spadl[league] = league_actions.merge(player_names, on='player_id', how='left')

## Análise exploratória