<a href="https://colab.research.google.com/github/tiagosilveiraa/portifoliods/blob/main/Recomenda%C3%A7%C3%A3o_de_Jogadores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import requests
import asyncio
from datetime import date
from ipywidgets import widgets, interact
from IPython.display import display
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
from google.colab import data_table
from scipy import stats

In [3]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)
rcParams['figure.figsize'] = 20,9.27
rcParams['axes.titlesize'] = 22
rcParams['axes.labelsize'] = 16
data_table.enable_dataframe_formatter()
np.random.seed(777)

#1 - Parametrização



In [None]:
#@title
#Capturo as principais ligas exibidas na tela principal
all_leagues = requests.get('https://api.sofascore.com/api/v1/config/unique-tournaments/BR/football').json()['uniqueTournaments']
#Variaveis que terão os dados tratados para o dropdown
dp_leagues_data = [] 
dp_seasons_data = []
#Montagem das opções dos dropdowns
for league in all_leagues:  
  dp_leagues_data.append((league['name'], league['id']))
  league_seasons = requests.get(f'https://api.sofascore.com/api/v1/unique-tournament/{league["id"]}/seasons').json()['seasons'] 
  for season in league_seasons:
    dp_seasons_data.append((season['year'], season['id'], league['id']))      

No menu abaixo selecione as ligas e temporadas que deseja obter os dados.

In [5]:
#@title
#UI
add_button = widgets.Button(description="Adicionar")
clear_button = widgets.Button(description="Limpar Seleções")
dp_leagues = widgets.Dropdown(options=dp_leagues_data, value=None)
dp_seasons = widgets.Dropdown(value=None)
UI_INPUTS = widgets.HBox([dp_leagues, dp_seasons])
UI_BUTTONS = widgets.HBox([add_button, clear_button])
output = widgets.Output()
selected_leagues = []
selected_seasons = []

def on_change(event):
  dp_seasons_league = []
  if event['name'] == 'value':
    for ds in dp_seasons_data:
      #Busco apenas pelas seasons da liga selecionada pela posição na tupla
      if ds[2] == event['new']: 
        dp_seasons_league.append(ds[0:2])
    dp_seasons.options = dp_seasons_league

def add_league(b):
  if dp_leagues.value == None or dp_seasons.value == None:
    output.clear_output()
    print('Informações não preenchidas!')
  elif dp_leagues.value in selected_leagues and dp_seasons.value in selected_seasons:
    with output:      
      output.clear_output()
      print(f"Erro: Liga e temporada já adicionadas")       
  else:
    selected_leagues.append(dp_leagues.value)
    selected_seasons.append(dp_seasons.value) 
    output.clear_output()   
    with output:      
      print('Liga adicionada com sucesso! :D')

def clear_selection(b):
  selected_leagues.clear()
  selected_seasons.clear()
  with output:      
      print('Parâmetros reiniciados.')
      
dp_leagues.observe(on_change)
add_button.on_click(add_league)
clear_button.on_click(clear_selection)
display(UI_INPUTS, UI_BUTTONS, output)

HBox(children=(Dropdown(options=(('European Championship', 1), ('Copa América', 133), ('World Cup', 16), ('UEF…

HBox(children=(Button(description='Adicionar', style=ButtonStyle()), Button(description='Limpar Seleções', sty…

Output()

Agora selecione as posições dos jogadores que terão seus dados extraídos.

*Para selecionar mais de uma posição use as teclas CRTL ou SHIFT*

In [6]:
#@title
dp_positions = widgets.SelectMultiple(
    options=[('Goleiros', 'G'), ('Defensores', 'D'), ('Meio-Campistas', 'M'), ('Atacantes','F')],    
    value=['G', 'D', 'M', 'F'],
    #rows=10,
    description='Posições: ',
    disabled=False
)
display(dp_positions)

SelectMultiple(description='Posições: ', index=(0, 1, 2, 3), options=(('Goleiros', 'G'), ('Defensores', 'D'), …

#2 - Obtenção das estatisticas
Nesta etapa iremos utilizar os parâmetros definidos acima para capturar os dados no SofaScore

In [7]:
#@title
def make_url_stats(league_id, season_id, position, offset):
  return f"https://api.sofascore.com/api/v1/unique-tournament/{league_id}/season/{season_id}/statistics?offset={offset}&order=-rating&accumulation=total&filters=position.in.{position}&fields=%2CbigChancesMissed%2CsuccessfulDribbles%2CsuccessfulDribblesPercentage%2CtotalShots%2CshotsOnTarget%2CshotsOffTarget%2CblockedShots%2CgoalConversionPercentage%2CpenaltiesTaken%2CpenaltyGoals%2CpenaltyWon%2CshotFromSetPiece%2CfreeKickGoal%2CgoalsFromInsideTheBox%2CgoalsFromOutsideTheBox%2CheadedGoals%2CleftFootGoals%2CrightFootGoal%2CshitWoodwork%2Coffsides%2CpenaltyConversion%2CsetPieceConversion%2Crating%2Cinterceptions%2CpenaltyConceded%2Cclearances%2CerrorLeadToGoal%2CerrorLeadToShot%2CownGoals%2CdribbledPast%2CcleanSheet%2C rating %2CbigChancesCreated%2Cassists %2CaccuratePasses%2CinaccuratePasses%2CtotalPasses%2CaccuratePassesPercentage %2CaccurateOwnHalfPasses%2CaccurateOppositionHalfPasses%2CaccurateFinalThirdPasses%2CkeyPasses%2CaccurateCrosses%2CaccurateCrossesPercentage%2CaccurateLongBalls%2CaccurateLongBallsPercentage%2CpassToAssist%2Crating %2CcleanSheet%2CpenaltyFaced%2CpenaltySave%2CsavedShotsFromInsideTheBox%2CsavedShotsFromOutsideTheBox%2CgoalsConcededInsideTheBox%2CgoalsConcededOutsideTheBox%2Cpunches%2CsuccessfulRunsOut%2ChighClaims%2CcrossesNotClaimed%2Crating %2CyellowCards%2CredCards%2CgroundDuelsWon%2CgroundDuelsWonPercentage%2CaerialDuelsWon%2CaerialDuelsWonPercentage%2CtotalDuelsWon%2CtotalDuelsWonPercentage%2CminutesPlayed%2CwasFouled%2Cfouls%2Cdispossessed%2CpossessionLost%2Cappearances%2CmatchesStarted%2Crating &limit=20"

In [8]:
#@title
def get_stats(leagues_ids, seasons_ids, positions_codes):
  players_data = list()
  for idx, league_id in enumerate(leagues_ids):       
    for position in positions_codes:      
      offset = 0
      while True:                
        #Efetuo a requisição a API, como as seasons estão ordenadas, passo apenas o indice atual
        response = requests.get(make_url_stats(league_id, seasons_ids[idx], position, offset))
        #Já percoreu todos os players então paro
        if response.json()['results'] == []:        
           break;        
        #Insiro o ID e nome da Liga no Dicionário     
        response = response.json()['results']
        for player in response:
          player['leagueId'] = league_id
          player['leagueName'] = [lgn['name'] for lgn in all_leagues if lgn['id'] == league_id][0]                       
        #Concateno os novos dados com a lista já existente e incremento o offset para a próxima requisição
        players_data.extend(response) 
        offset = offset + 20 
  return players_data

In [9]:
#@title
json_data = get_stats(selected_leagues, selected_seasons, dp_positions.value)
df_players = pd.json_normalize(json_data, max_level=6)       

#3 - Enriquecimento dos dados
Na etapa anterior obtivemos todas as estatisticas disponíveis para os jogadores, agora vamos fazer o enriquecimento desses dados com outras informações.

Buscando por posições alternativas para cada player, bem como algumas caracteristicas, como altura, valor de mercado e etc...

In [10]:
async def parallel_requests(coroutines):
  await asyncio.gather(*coroutines)

In [11]:
players_atributtes = []
players_alternative_positions = []

async def get_player_attributes(player_id):
  response_data = requests.get(f'https://api.sofascore.com/api/v1/player/{player_id}')
  response = response_data.json()['player']
  response['player.idd'] = player_id #Para fazer o merge depois
  players_atributtes.append(response)

async def get_alternative_positions(player_id):
  response_data = requests.get(f'https://api.sofascore.com/api/v1/player/{player_id}/characteristics')
  response = response_data.json()
  if len(response['positions']) >=1:
    response = {key: response[key] for key in response.keys() if key == 'positions'}
    response['player.id'] = player_id #Para fazer o merge depois
    players_alternative_positions.append(response)

In [12]:
req_attributes = [get_player_attributes(p) for p in df_players['player.id']]
req_alternative_positions = [get_alternative_positions(p) for p in df_players['player.id']]

In [13]:
await parallel_requests(req_alternative_positions)

In [None]:
await parallel_requests(req_attributes)

In [None]:
df_players_attributes = pd.json_normalize(players_atributtes, max_level=2)
df_players_alternative_positions = pd.json_normalize(players_alternative_positions, max_level=2)

In [None]:
df_unified_extras = df_players_alternative_positions.merge(df_players_attributes, on='player.id')

In [None]:
#Como vieram muitas informações redundantes iremos dropá-las
df_unified_extras = df_unified_extras.drop(columns=['name', 'firstName', 'lastName', 'slug', 'shortName','proposedMarketValue', 'jerseyNumber','team.slug', 'team.shortName', 'team.gender', 'team.sport.name',
       'team.sport.slug', 'team.sport.id', 'team.tournament.name',
       'team.tournament.slug', 'team.tournament.category',
       'team.tournament.uniqueTournament', 'team.tournament.priority',
       'team.tournament.id', 'team.primaryUniqueTournament.name',
       'team.primaryUniqueTournament.slug',
       'team.primaryUniqueTournament.category',
       'team.primaryUniqueTournament.userCount',
       'team.primaryUniqueTournament.id',
       'team.primaryUniqueTournament.displayInverseHomeAwayTeams',
       'team.userCount', 'team.nameCode', 'team.disabled', 'team.national',
       'team.type', 'team.id', 'team.country.alpha2', 'team.country.name',
       'team.teamColors.primary', 'team.teamColors.secondary',
       'team.teamColors.text', 'userCount', 'id', 'team.name'])

In [None]:
df_players = df_unified_extras.merge(df_players, on='player.id')

## 3.2 - Engenharia reversa em features percentuais

Para alguns fundamentos é exibida a quantidade e o percentual de acertos do jogador, mas não há a infomação de quantos quandos lances ele errou. 

Para o nosso modelo é mais interessante termos os dados dos acertos e dos erros do que os percentuais, por isso usei a regra de 3 para criar novas features demonstrando os erros cometidos em cada fundamento

In [None]:
def reverse_feature(frame, number, percentage, method = np.ceil):
 return ((frame[number] / (frame[percentage]/100)) - frame[number]).fillna(0).apply(lambda x: int(method(x)))

In [None]:
df_players['aerialDuelsLost'] = reverse_feature(df_players, 'aerialDuelsWon', 
                                                'aerialDuelsWonPercentage')

In [None]:
df_players['groundDuelsLost'] = reverse_feature(df_players, 'groundDuelsWon', 
                                              'groundDuelsWonPercentage')

In [None]:
df_players['unsuccessfulDribbles'] = reverse_feature(df_players, 'successfulDribbles', 
                                                     'successfulDribblesPercentage', np.ceil)

In [None]:
df_players['unnacurateLongBalls'] = reverse_feature(df_players, 'accurateLongBalls',
                                                    'accurateLongBallsPercentage')

In [None]:
df_players['unaccurateCrosses'] = reverse_feature(df_players, 'accurateCrosses', 
                                                            'accurateCrossesPercentage')

In [None]:
df_players['successfulShotFromSetPiece'] = (df_players['shotFromSetPiece'] * (df_players['setPieceConversion']/100)).apply(lambda x: int(np.ceil(x)))

In [None]:
df_players['unsuccessfulShotFromSetPiece'] = reverse_feature(df_players, 'successfulShotFromSetPiece',
                                                             'setPieceConversion', np.floor)

#4 - Tratamento dos dados

In [None]:
df_players.rename(columns={'country.alpha2':'countryCode', 
                'country.name':'countryName', 
                'proposedMarketValueRaw.value':'proposedMarketValue', 
                'proposedMarketValueRaw.currency': 'proposedMarketCurrency',
                'player.id': 'playerId', 
                'player.name': 'playerName', 
                'player.slug': 'playerSlug',
                'player.userCount': 'playerUserCount',
                'team.id':'teamId',
                'team.name':'teamName',
                'team.slug':'teamSlug', 
                'team.teamColors.primary': 'teamPrimaryColor',
                'team.teamColors.secondary': 'teamSecondaryColor',
                'team.userCount': 'teamUserCount', 
                'position': 'playerMainPosition', 
                'positions': 'playerAlternativePositions'
                }, inplace=True)

In [None]:
df_players.drop(['team.teamColors.text', 
'team.type',
'team.sport.id',
'team.sport.name',
'team.sport.slug',
'team.shortName'], axis=1, inplace=True)

In [None]:
dp_players = df_players[['playerName','playerId']].sort_values(by=['playerName']).values.tolist()

##4.1 - Tratando valores nulos

In [None]:
df_players.columns[df_players.isna().any()].tolist()

['retired',
 'shirtNumber',
 'contractUntilTimestamp',
 'proposedMarketValue',
 'proposedMarketCurrency']

Para o prazo de término dos contratos dos jogadores iremos assumir que quem não possuir esta informação preenchida irá ter o seu contrato encerrado no fim da próxima temporada.

In [None]:
from datetime import date
import time
next_year = date.today().year + 1
next_season_end = date(next_year, 12,31)
next_season_end = time.mktime(next_season_end.timetuple())

In [None]:
df_players.fillna({'height': np.ceil(df_players['height'].mean()),
                   'preferredFoot': df_players['preferredFoot'].mode().values[0], 
                   'shirtNumber': 0,
                   'retired': False,
                   'proposedMarketCurrency': 'EUR',
                   'contractUntilTimestamp': next_season_end
                   }, inplace=True)

In [None]:
df_players['proposedMarketValue'].dropna(inplace=True)

In [None]:
df_players = df_players[df_players.columns.sort_values(ascending=True)]

In [None]:
#Só quero players que estiveram em alguma partida na temporada
df_players = df_players[df_players['proposedMarketValue']>0]

##4.2 - Tratamento de transferências
Como é possível capturar dados de ligas diferentes, pode ser que o jogador que estava em uma Liga A tenha se transferido para uma Liga B, com isso o seu registro é exibido duas vezes o que irá causar um problema na execução do algoritmo KNN já que em breve iremos indexar o dataframe pelo ID do Jogador. Portanto, optei por eliminar os jogadores duplicados já que não faz tanto sentido comparar ele com ele mesmo.

In [None]:
df_players = df_players.drop_duplicates(subset=['playerId'], keep='last', ignore_index=True)

## 4.3 - Tratamento de Outliers [*TODO*]

In [None]:
df_players.describe()



Unnamed: 0,accurateCrosses,accurateCrossesPercentage,accurateFinalThirdPasses,accurateLongBalls,accurateLongBallsPercentage,accurateOppositionHalfPasses,accurateOwnHalfPasses,accuratePasses,aerialDuelsLost,aerialDuelsWon,aerialDuelsWonPercentage,appearances,bigChancesCreated,bigChancesMissed,blockedShots,cleanSheet,clearances,contractUntilTimestamp,crossesNotClaimed,dateOfBirthTimestamp,dispossessed,dribbledPast,errorLeadToGoal,errorLeadToShot,fouls,freeKickGoal,goalConversionPercentage,goalsConcededInsideTheBox,goalsConcededOutsideTheBox,goalsFromInsideTheBox,goalsFromOutsideTheBox,groundDuelsLost,groundDuelsWon,groundDuelsWonPercentage,headedGoals,height,highClaims,inaccuratePasses,interceptions,keyPasses,leagueId,leftFootGoals,matchesStarted,minutesPlayed,offsides,ownGoals,passToAssist,penaltiesTaken,penaltyConceded,penaltyConversion,penaltyFaced,penaltyGoals,penaltySave,penaltyWon,playerId,playerUserCount,possessionLost,proposedMarketValue,punches,rating,redCards,savedShotsFromInsideTheBox,savedShotsFromOutsideTheBox,setPieceConversion,shirtNumber,shotFromSetPiece,shotsOffTarget,shotsOnTarget,successfulDribbles,successfulDribblesPercentage,successfulRunsOut,successfulShotFromSetPiece,teamId,teamUserCount,totalDuelsWon,totalDuelsWonPercentage,totalPasses,totalShots,unaccurateCrosses,unnacurateLongBalls,unsuccessfulDribbles,unsuccessfulShotFromSetPiece,wasFouled,yellowCards
count,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0
mean,6.129771,16.564084,125.118321,27.099237,55.927176,261.045802,175.209924,430.125954,14.675573,11.316794,40.771069,18.687023,1.652672,0.950382,4.717557,1.801527,9.5,1710851000.0,0.0,813566800.0,13.232824,16.442748,0.053435,0.087786,19.992366,0.038168,5.994427,11.255725,2.118321,1.083969,0.293893,59.740458,57.881679,47.585611,0.194656,176.70229,0.0,82.522901,12.793893,15.015267,325.0,0.400763,12.507634,1111.324427,1.28626,0.01145,0.049618,0.179389,0.103053,5.916031,0.0,0.129771,0.0,0.137405,722885.0,641.709924,155.591603,2424374.0,0.003817,6.77084,0.148855,0.0,0.0,1.484847,21.099237,0.912214,7.450382,5.71374,15.969466,58.846718,0.0,0.041985,5487.572519,0.0,69.198473,46.605802,512.648855,17.881679,21.114504,16.587786,11.183206,0.240458,20.538168,2.706107
std,9.237519,15.525129,103.444088,28.961291,21.196535,211.088635,170.433653,362.623407,13.764498,12.454777,21.960205,10.355064,2.116703,1.583385,5.053393,2.672178,11.600221,39600910.0,0.0,137119100.0,12.151155,14.048989,0.241736,0.333224,16.194458,0.191968,10.739543,8.205658,2.050448,1.897925,0.679109,46.923446,47.628404,11.430143,0.46762,5.128531,0.0,63.303361,12.184858,14.769485,0.0,1.179194,9.951893,827.142465,2.072933,0.106596,0.217571,0.696375,0.328806,22.929087,0.0,0.617707,0.0,0.467542,316350.7,1367.06769,124.254671,3303408.0,0.06178,0.207398,0.397283,0.0,0.0,9.991248,18.12826,2.2102,7.522853,6.391904,17.10322,23.247556,0.0,0.219178,9794.510367,0.0,55.265354,10.964027,418.898271,17.886545,29.218726,15.919274,14.170717,2.001772,19.2562,2.449779
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1668038000.0,0.0,364348800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,158.0,0.0,0.0,0.0,0.0,325.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4811.0,15.0,0.0,49000.0,0.0,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,34.0,6.0,48.165,78.0,47.5,131.25,5.0,2.0,27.09,10.0,0.0,0.0,1.0,0.0,2.0,1672445000.0,0.0,722498400.0,4.0,5.0,0.0,0.0,6.25,0.0,0.0,4.0,0.0,0.0,0.0,18.25,18.25,43.2575,0.0,173.0,0.0,28.0,3.0,3.0,325.0,0.0,3.0,374.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,380496.0,101.75,44.25,547500.0,0.0,6.6525,0.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,3.25,50.0,0.0,0.0,1963.0,0.0,23.0,42.445,157.75,4.0,0.0,5.0,2.0,0.0,5.25,1.0
50%,2.0,17.61,105.0,17.0,60.0,228.0,130.0,338.0,11.0,7.5,40.315,19.0,1.0,0.0,3.0,0.0,6.0,1703981000.0,0.0,835099200.0,10.0,14.0,0.0,0.0,17.0,0.0,2.045,10.0,2.0,0.0,0.0,53.0,47.0,49.1,0.0,177.0,0.0,74.0,10.0,11.0,325.0,0.0,11.0,998.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,866664.0,226.0,139.0,1200000.0,0.0,6.745,0.0,0.0,0.0,0.0,17.0,0.0,5.0,4.0,10.0,61.415,0.0,0.0,1978.5,0.0,60.0,47.64,434.0,13.0,8.5,12.0,6.0,0.0,14.0,2.0
75%,8.0,25.2475,203.75,42.75,67.965,419.0,245.25,661.5,22.0,16.0,52.3275,28.0,3.0,1.0,7.0,3.0,13.0,1735603000.0,0.0,922492800.0,20.0,25.0,0.0,0.0,29.0,0.0,9.09,17.0,3.0,1.0,0.0,92.0,88.0,53.31,0.0,181.0,0.0,126.0,18.75,24.0,325.0,0.0,21.0,1789.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,931712.0,543.5,238.75,2800000.0,0.0,6.9075,0.0,0.0,0.0,0.0,27.0,1.0,10.75,8.0,23.0,71.1475,0.0,0.0,5926.0,0.0,106.75,52.365,796.5,25.0,31.0,24.0,14.0,0.0,31.0,4.0
max,55.0,100.0,410.0,142.0,100.0,907.0,907.0,1780.0,103.0,72.0,100.0,37.0,11.0,11.0,22.0,12.0,76.0,1845936000.0,0.0,1058314000.0,60.0,80.0,2.0,3.0,77.0,1.0,100.0,33.0,9.0,14.0,5.0,213.0,218.0,100.0,3.0,191.0,0.0,291.0,64.0,73.0,325.0,10.0,35.0,3035.0,13.0,1.0,1.0,6.0,2.0,100.0,0.0,6.0,0.0,3.0,1139775.0,12815.0,514.0,18800000.0,1.0,7.49,2.0,0.0,0.0,100.0,96.0,16.0,35.0,33.0,88.0,100.0,0.0,2.0,49202.0,0.0,254.0,76.0,2013.0,84.0,129.0,100.0,82.0,27.0,94.0,14.0


#5 - Análise exploratória  [*TODO*]



##5.1 - Destaques por fundamentos

In [None]:
#df_players[(np.abs(stats.zscore(df_players['accurateCrosses'])>3))][['playerName', 'appearences']]
df_players.nlargest(n=5, columns='appearances')
#df_players[(np.abs(stats.zscore(df_players['appearances'])>3))][['playerName', 'teamName', 'appearances']].sort_values(by='appearances', ascending=False)



Unnamed: 0,accurateCrosses,accurateCrossesPercentage,accurateFinalThirdPasses,accurateLongBalls,accurateLongBallsPercentage,accurateOppositionHalfPasses,accurateOwnHalfPasses,accuratePasses,aerialDuelsLost,aerialDuelsWon,aerialDuelsWonPercentage,appearances,bigChancesCreated,bigChancesMissed,blockedShots,cleanSheet,clearances,contractUntilTimestamp,countryCode,countryName,crossesNotClaimed,dateOfBirthTimestamp,dispossessed,dribbledPast,errorLeadToGoal,errorLeadToShot,fouls,freeKickGoal,goalConversionPercentage,goalsConcededInsideTheBox,goalsConcededOutsideTheBox,goalsFromInsideTheBox,goalsFromOutsideTheBox,groundDuelsLost,groundDuelsWon,groundDuelsWonPercentage,headedGoals,height,highClaims,inaccuratePasses,interceptions,keyPasses,leagueId,leagueName,leftFootGoals,matchesStarted,minutesPlayed,offsides,ownGoals,passToAssist,penaltiesTaken,penaltyConceded,penaltyConversion,penaltyFaced,penaltyGoals,penaltySave,penaltyWon,playerAlternativePositions,playerId,playerMainPosition,playerName,playerSlug,playerUserCount,possessionLost,preferredFoot,proposedMarketCurrency,proposedMarketValue,punches,rating,redCards,retired,savedShotsFromInsideTheBox,savedShotsFromOutsideTheBox,setPieceConversion,shirtNumber,shotFromSetPiece,shotsOffTarget,shotsOnTarget,successfulDribbles,successfulDribblesPercentage,successfulRunsOut,successfulShotFromSetPiece,teamId,teamName,teamPrimaryColor,teamSecondaryColor,teamSlug,teamUserCount,totalDuelsWon,totalDuelsWonPercentage,totalPasses,totalShots,unaccurateCrosses,unnacurateLongBalls,unsuccessfulDribbles,unsuccessfulShotFromSetPiece,wasFouled,yellowCards
64,13,28.26,334,51,52.04,590,263,840,32,28,46.67,37,5,5,8,8,28,1703981000.0,BR,Brazil,0,561945600.0,28,28,0,1,43,0,11.11,25,4,3,0,116,121,51.27,1,173.0,0,238,35,48,325,Brasileiro Série A,0,29,2751,0,0,0,0,0,0.0,0,0,0,1,"[MC, DM, AM]",330691,M,Juninho,juninho,135,386,Right,EUR,450000.0,0,6.91,0,False,0,0,0.0,8.0,0,7,12,26,56.52,0,0,1973,América Mineiro,#52b030,#52b030,america-mineiro,0,149,50.34,1078,27,34,48,21,0,32,2
124,6,23.08,244,35,46.67,404,171,569,57,39,40.63,37,4,4,18,0,11,1672445000.0,BR,Brazil,0,537235200.0,41,33,0,0,45,0,8.06,13,3,4,1,150,116,43.77,1,175.0,0,186,19,33,325,Brasileiro Série A,0,31,2335,3,0,0,1,0,0.0,0,0,0,0,"[LW, RW, AM]",166471,M,Felipe Azevedo,felipe-azevedo,111,348,Right,EUR,435000.0,0,6.75,0,False,0,0,0.0,11.0,0,24,20,35,53.03,0,0,1973,América Mineiro,#52b030,#52b030,america-mineiro,0,155,42.94,755,62,20,40,32,0,36,4
206,20,19.8,178,9,42.86,344,154,478,37,9,19.57,37,4,3,6,1,12,1703981000.0,BR,Brazil,0,873590400.0,40,16,0,0,26,0,6.9,13,8,2,0,150,118,44.19,0,166.0,0,107,14,25,325,Brasileiro Série A,1,21,2069,2,0,0,0,0,0.0,0,0,0,3,"[RW, LW]",863283,M,Gustavo Silva,gustavo-silva,676,380,Right,EUR,3600000.0,0,6.63,0,False,0,0,0.0,19.0,0,15,8,54,43.9,0,0,1957,Corinthians,#52b030,#52b030,corinthians,0,127,40.58,585,29,82,12,70,0,48,2
56,4,12.12,331,106,57.61,775,489,1260,26,16,38.1,36,3,0,6,10,39,1733789000.0,BR,Brazil,0,787622400.0,28,80,0,1,63,0,0.0,25,8,0,0,183,194,51.46,0,178.0,0,291,39,30,325,Brasileiro Série A,0,35,3011,0,0,0,0,0,0.0,0,0,0,0,"[MC, DM, RW, AM]",931479,M,Fernando Sobral,fernando-sobral,421,417,Right,EUR,1900000.0,0,6.93,1,False,0,0,0.0,8.0,1,16,7,39,73.58,0,0,2001,Ceará,#52b030,#52b030,ceara,0,210,50.12,1551,29,30,78,15,0,56,2
32,13,16.67,293,10,43.48,438,94,519,22,7,24.14,35,9,8,14,4,8,1751242000.0,BR,Brazil,0,826588800.0,39,47,0,0,17,0,19.72,21,2,14,0,161,141,46.84,2,166.0,0,177,20,38,325,Brasileiro Série A,2,23,2204,8,0,0,0,0,0.0,0,0,0,1,"[LW, RW, ML]",878084,M,Michael,michael,4695,455,Right,EUR,7200000.0,0,7.0,0,False,0,0,0.0,96.0,0,29,28,76,55.47,0,0,5981,Flamengo,#52b030,#52b030,flamengo,0,148,44.85,696,71,65,13,62,0,35,0


In [None]:
output_5 = widgets.Output()
features = df_players.select_dtypes(include=np.number).columns

def on_change_feature(Fundamento):    
    return df_players[(np.abs(stats.zscore(df_players[Fundamento])>3))][['playerName', 'teamName', Fundamento]].sort_values(by=Fundamento, ascending=False)

interact(on_change_feature, Fundamento=features);

interactive(children=(Dropdown(description='Fundamento', options=('accurateCrosses', 'accurateCrossesPercentag…

#6 - Geração do Modelo

##6.1 - Transformando as posições alternativas em colunas

In [None]:
X = df_players.copy()

In [None]:
X[['AM', 'DC', 'DL', 'DM', 'DR', 'LW', 'MC', 'ML', 'MR', 'RW', 'ST']] = 0

In [None]:
for i, p in enumerate(X['positions'].tolist()):
    X.loc[i, p] = 1

In [None]:
#Alterando o indice para o player.id
X.set_index('player.id', inplace = True)

##6.2 - Enconding das features categoricas

In [None]:
from sklearn.preprocessing import OneHotEncoder
f_encoder = OneHotEncoder()
c_encoder = OneHotEncoder()
p_encoder = OneHotEncoder()
feet = f_encoder.fit_transform(X[['preferredFoot']])
country = c_encoder.fit_transform(X[['countryName']])
position = p_encoder.fit_transform(X[['position']])

In [None]:
X[f_encoder.categories_[0]] = feet.toarray()
X[c_encoder.categories_[0]] = country.toarray()
X[p_encoder.categories_[0]] = position.toarray()

In [None]:
X.drop([
        'player.name', 'player.slug',
       'player.userCount', 'team.name', 'team.slug', 'team.shortName',
       'team.sport.name', 'team.sport.slug', 'team.sport.id', 'team.userCount',
       'team.type', 'team.id', 'team.teamColors.primary',
       'team.teamColors.secondary', 'team.teamColors.text', 'shirtNumber', 'league',
       'positions','position', 'preferredFoot', 'shirtNumber',
        'countryCode', 'countryName',
        'retired','proposedMarketCurrency',
       #Percentuais já foram convertidos em novas features anteriormente
       'accurateLongBallsPercentage','aerialDuelsWonPercentage','goalConversionPercentage','groundDuelsWonPercentage',
       'successfulDribblesPercentage','totalDuelsWonPercentage', 'setPieceConversion', 'accurateCrossesPercentage',
       #Redundantes em relação aos erros e acertos
       'totalDuelsWon', 'totalShots', 'totalPasses', 'shotFromSetPiece'
], axis='columns', inplace=True)

In [None]:
from scipy.sparse import csr_matrix
stats_sparse = csr_matrix(X)

In [None]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute', n_neighbors=5)
model.fit(stats_sparse)

NearestNeighbors(algorithm='brute')

# 7 - Apresentação dos Resultados

In [None]:
def get_similars(Player):
  distance, similar_players_index = model.kneighbors(X.loc[Player].values.reshape(1,-1))
  similar_players = pd.DataFrame()
  similar_players = df_players.loc[df_players.index[similar_players_index[0]]]
  similar_players['distances'] = distance[0]
  return similar_players[['player.name', 'team.name','rating']]

In [None]:
interact(get_similars, Player=dp_players);

interactive(children=(Dropdown(description='Player', options=(['Ademir', 922546], ['Adil Aouchiche', 962877], …