<a href="https://colab.research.google.com/github/tiagosilveiraa/portifoliods/blob/main/Players_Recomendation_using_fifa22_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [498]:
import pandas as pd
import requests
import re
from decimal import Decimal
import numpy as np
from datetime import datetime
import time
from google.colab import data_table

In [499]:
# Turn off the Colab data_table formatter for DataFrame output.
data_table.disable_dataframe_formatter()
# Fix NumPy's global random seed.
np.random.seed(777)

# 1 - Loading dataset

In [500]:
# Read the FIFA 22 player CSV straight from the GitHub repository (comma-separated).
df = pd.read_csv('https://github.com/tiagosilveiraa/portifoliods/raw/main/datasets/FIFA22_official_data.csv', sep=',')

In [501]:
# Normalise headers (spaces -> underscores, lower case), then give the
# generic `value` column a more descriptive name.
df = (
    df
    .rename(columns=lambda column: column.replace(" ", "_").lower())
    .rename(columns={'value': 'market_value'})
)

# 2 - Data cleaning

## 2.1 - Formatting values

In [503]:
def str_to_currency(value):
  """Convert a currency string such as '€110.5M' or '€500K' to a Decimal amount.

  Every character other than digits and '.' is stripped before parsing.
  A 'K' anywhere in the original string multiplies the number by 1,000 and
  an 'M' by 1,000,000; a string with neither suffix (e.g. '€0') is returned
  as the plain number instead of being silently lost.

  Args:
    value: Raw cell value. Only ``str`` inputs are parsed.

  Returns:
    A ``Decimal`` with the parsed amount, or ``np.nan`` when ``value`` is not
    a string or contains no digits at all.

  Raises:
    decimal.InvalidOperation: if the remaining digits/dots do not form a
      valid number (e.g. '1.2.3').
  """
  if not isinstance(value, str):
    return np.nan

  digits = re.sub(r'[^\d.]', '', value)
  if not digits:
    # Nothing numeric to parse (e.g. 'N/A'): treat it as missing.
    return np.nan

  amount = Decimal(digits)
  if 'K' in value:
    return amount * 1000
  if 'M' in value:
    return amount * 1000000
  # No magnitude suffix: the number is already expressed in units.
  return amount

In [506]:
# Parse every monetary column with the same currency converter.
for money_column in ('release_clause', 'wage', 'market_value'):
  df[money_column] = df[money_column].apply(str_to_currency)

In [504]:
def to_date(date):
    """Turn a date cell into a timestamp produced by ``time.mktime``.

    The branch is chosen from the length of ``str(date)``:

    * more than 4 characters: parsed with the format ``'%b %d, %Y'``
      (the value itself must be a string in that case);
    * exactly 4 characters: treated as a year and mapped to June 30 of it;
    * anything shorter (for instance a NaN, whose text is 'nan'): mapped to
      December 31 of the year after the current one.

    Returns the result of ``time.mktime`` applied to the chosen datetime's
    time tuple. Parsing errors propagate to the caller unchanged.
    """
    text_length = len(str(date))

    if text_length > 4:
        parsed = datetime.strptime(date, '%b %d, %Y')
    elif text_length == 4:
        parsed = datetime(int(date), 6, 30)
    else:
        # Fallback date: last day of next calendar year.
        parsed = datetime(datetime.today().year + 1, 12, 31)

    return time.mktime(parsed.timetuple())

In [507]:
# Convert both date-like columns to timestamps.
for date_column in ('contract_valid_until', 'joined'):
  df[date_column] = df[date_column].apply(to_date)

In [505]:
def remove_html(value):
  """Strip HTML tags from a string, keeping only the text between them.

  Each tag is matched on its own (from '<' to the next '>'), so text that
  sits between two tags with attributes is preserved and tags of any length
  or quoting style are removed.

  Args:
    value: Raw cell value. Only ``str`` inputs are processed.

  Returns:
    The string without tags, or ``None`` when ``value`` is not a string
    (e.g. a NaN cell).
  """
  if not isinstance(value, str):
    return None
  # [^>]* cannot run past the closing '>', so one match never spans two tags.
  return re.sub(r'<[^>]*>', '', value)

In [508]:
# Strip the HTML markup embedded in these two text columns.
for html_column in ('position', 'loaned_from'):
  df[html_column] = df[html_column].apply(remove_html)

In [509]:
def str_to_number(value):
  """Extract the number from a string such as '170cm' or '72kg'.

  Every character other than digits and '.' is stripped before parsing.

  Args:
    value: Raw cell value. Only ``str`` inputs are parsed, so missing
      values (None / NaN) no longer reach ``re.sub`` and raise TypeError.

  Returns:
    A ``Decimal`` with the parsed number, or ``None`` when ``value`` is not a
    string or contains no digits.

  Raises:
    decimal.InvalidOperation: if the remaining digits/dots do not form a
      valid number (e.g. '1.2.3').
  """
  if not isinstance(value, str):
    return None

  digits = re.sub(r'[^\d.]', '', value)
  if not digits:
    # Empty or digit-less text carries no number.
    return None
  return Decimal(digits)

In [510]:
# Keep only the numeric part of the body-measurement columns.
for measure_column in ('height', 'weight'):
  df[measure_column] = df[measure_column].apply(str_to_number)

## 2.2 - Changing datatypes

In [511]:
# Cast the parsed columns to numeric dtypes, one column at a time.
for numeric_column in ('wage', 'market_value', 'height', 'weight', 'release_clause'):
  df[numeric_column] = pd.to_numeric(df[numeric_column])

## 2.3 - Filling NA values

In [512]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any().to_numpy()].tolist()

['club',
 'market_value',
 'wage',
 'body_type',
 'real_face',
 'position',
 'jersey_number',
 'loaned_from',
 'volleys',
 'curve',
 'agility',
 'balance',
 'jumping',
 'interceptions',
 'positioning',
 'vision',
 'composure',
 'marking',
 'slidingtackle',
 'release_clause',
 'defensiveawareness']

Removing players without a Market Value, Club, Wage, Release Clause or Joined Date, because this information cannot be inferred from the other columns.

In [513]:
# Drop every row missing any of the listed fields. This is done in place,
# so `df` itself loses those rows.
df.dropna(subset = ['market_value', 'club', 'wage', 'release_clause', 'joined'], inplace=True)

Here we assume that every player with no information about a lending club is not on loan; these players are marked with 'NL' (not loaned).

In [514]:
# Mark players without a lending club as 'NL' (not loaned).
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in
# pandas 2.2 and does not modify `df` when Copy-on-Write is enabled.
df['loaned_from'] = df['loaned_from'].fillna('NL')

The attribute Marking was replaced by the attribute DefensiveAwareness in FIFA 22, so the two columns are unified into one.

In [515]:
# Where 'defensiveawareness' is missing, fall back to the value stored in
# 'marking' for the same row (fillna aligns the two columns on the index).
df['defensiveawareness'] = df['defensiveawareness'].fillna(df['marking'])

In [516]:
# 'marking' has been merged into 'defensiveawareness', so remove it (in place).
df.drop(columns=['marking'], inplace = True)

In [517]:
# 'RES' and 'SUB' only say that the player is a reserve/substitute, so use
# the player's best position for those rows instead.
# The mask is computed once and both sides use a single .loc[rows, column]
# selection, avoiding the chained df.loc[...][...] lookup.
bench_mask = df['position'].isin(['RES', 'SUB'])
df.loc[bench_mask, 'position'] = df.loc[bench_mask, 'best_position']

## 2.4 - Exploding attributes

The attribute Work Rate can be divided into Attacking Work Rate and Defensive Work Rate. In this dataset both are provided in the same column (e.g. "High/ Medium"), so I had to split it into two separate columns.

In [519]:
# Split each "attacking/defensive" work-rate string into its two parts;
# non-string cells yield a pair of NaNs.
work_rate_pairs = [
    rate.replace(' ', '').split('/') if type(rate) is str else [np.nan, np.nan]
    for rate in df['work_rate'].tolist()
]
awr = [pair[0] for pair in work_rate_pairs]  # attacking work rate
dwr = [pair[1] for pair in work_rate_pairs]  # defensive work rate

In [520]:
# Attach the two work-rate lists as new columns (attacking first, then defensive).
df['attacking_wr'], df['defensive_wr'] = awr, dwr

# 3 - Data Modeling

Dropping columns that are unnecessary for the model

In [521]:
# Feature frame: a copy of df without the columns that are not model inputs.
# 'work_rate' is dropped because it was split into attacking_wr/defensive_wr.
X = df.drop(columns=['photo', 'flag','jersey_number', 'joined', 'club_logo', 'work_rate', 'name', 'special'])

In [522]:
# Index the features by player id (in place) so a player's row can be
# fetched with X.loc[<id>]; 'id' stops being a feature column.
X.set_index('id', inplace=True)

## 3.1 - Applying encodings

In [523]:
from sklearn.preprocessing import OneHotEncoder

In [524]:
# Object-dtype (non-numeric) columns remaining in the feature frame.
X.select_dtypes('object').columns

Index(['nationality', 'club', 'preferred_foot', 'body_type', 'real_face',
       'position', 'loaned_from', 'best_position', 'attacking_wr',
       'defensive_wr'],
      dtype='object')

In [525]:
# Ordinal-encode both work rates using the Low < Medium < High ordering;
# the stored value is the category code.
wr_order = ['Low', 'Medium','High',]
for wr_column in ('attacking_wr', 'defensive_wr'):
  X[wr_column] = (
      X[wr_column]
      .astype('category')
      .cat.reorder_categories(wr_order)
      .cat.codes
  )

In [526]:
# Encode real_face as category codes using the order No, Yes.
rf_order = ['No', 'Yes']
X['real_face'] = (
    X['real_face']
    .astype('category')
    .cat.reorder_categories(rf_order)
    .cat.codes
)

In [527]:
# One OneHotEncoder per categorical column that is one-hot encoded.
# The nationality, club and loaned_from encoders are left disabled.
# NOTE(review): presumably because of their high cardinality — confirm.
#one_nation = OneHotEncoder()
#one_club = OneHotEncoder()
one_foot = OneHotEncoder()
one_body_type = OneHotEncoder()
one_position = OneHotEncoder()
#one_loaned_from = OneHotEncoder()
one_best_position = OneHotEncoder()

In [528]:
# Fit each active encoder on its own single-column frame and keep the
# encoded matrix; the disabled encoders stay commented out.
#enc_nation = one_nation.fit_transform(X[['nationality']])
#enc_club = one_club.fit_transform(X[['club']])
enc_foot = one_foot.fit_transform(X[['preferred_foot']])
enc_body_type = one_body_type.fit_transform(X[['body_type']])
enc_position = one_position.fit_transform(X[['position']])
#enc_loaned_from = one_loaned_from.fit_transform(X[['loaned_from']])
enc_best_position = one_best_position.fit_transform(X[['best_position']])

In [529]:
# Add the encoded matrices to X as dense columns named after each encoder's
# categories. Position columns get a 'default_' prefix and best-position
# columns a 'best_' prefix.
# NOTE(review): presumably the prefixes keep the two sets of position
# labels from colliding — confirm.
#X[one_nation.categories_[0]] = enc_nation.toarray()
#X[one_club.categories_[0]] = enc_club.toarray()
X[one_foot.categories_[0]] = enc_foot.toarray()
X[one_body_type.categories_[0]] = enc_body_type.toarray()
X[[f'default_{x}' for x in one_position.categories_[0]]] = enc_position.toarray()
X[[f'best_{x}' for x in one_best_position.categories_[0]]] = enc_best_position.toarray()
#X[[f'loaned_{x}' for x in one_loaned_from.categories_[0]]] = enc_loaned_from.toarray()

In [530]:
# Remove the original categorical columns (in place). nationality, club and
# loaned_from are dropped without an encoded replacement, since their
# encoders are commented out.
X.drop(columns=['nationality', 'club', 'preferred_foot', 'body_type', 'position',
       'loaned_from', 'best_position'], inplace=True)

## 3.2 - Rescaling values

In [533]:
from sklearn.preprocessing import MinMaxScaler

In [534]:
# Min-max scaler created with its default settings.
scaler = MinMaxScaler()

In [535]:
# Fit the scaler on the feature frame and keep the scaled array.
# NOTE(review): the NearestNeighbors model below is fitted on X.values, not
# on X_model, so this scaled array is currently unused by the model.
X_model = scaler.fit_transform(X)

## 3.3 - Creating model

In [557]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search returning 10 neighbours per query.
model = NearestNeighbors(algorithm='brute', n_neighbors=10)
# NOTE(review): fitted on the unscaled features, so columns with large
# magnitudes (e.g. monetary values) likely dominate the distance. Switching
# to X_model would also require scaling the query row in get_similars.
model.fit(X.values) # pass X_model instead to use the scaled values

NearestNeighbors(algorithm='brute', n_neighbors=10)

# 4 - Presenting results

In [552]:
from ipywidgets import widgets, interact
from IPython.display import display

In [553]:
# Dropdown options: [name, id] pairs ordered by name.
dp_players = (
    df.loc[:, ['name', 'id']]
    .sort_values('name')
    .to_numpy()
    .tolist()
)

In [554]:
def get_similars(Player):
  """Return the players closest to ``Player`` according to the fitted model.

  Args:
    Player: id of the reference player, used as a label in ``X.loc``.

  Returns:
    A DataFrame with the columns name, club, overall and distances (the
    neighbour distance truncated to an integer), one row per neighbour
    returned by ``model.kneighbors``.
  """
  distances, neighbor_positions = model.kneighbors(X.loc[Player].values.reshape(1, -1))
  # kneighbors returns row positions in the fitted array. X was derived from
  # df without reordering rows, so the same positions select the matching
  # rows of df. Copy before adding a column so df itself is never touched
  # and pandas does not warn about assigning to a slice.
  similar_players = df.iloc[neighbor_positions[0]].copy()
  similar_players['distances'] = [int(d) for d in distances[0]]
  return similar_players[['name', 'club', 'overall', 'distances']]

In [555]:
# Build an interactive control from the player options and call get_similars
# with the selection; the trailing ';' suppresses the cell's return value.
# NOTE(review): presumably each [name, id] pair is treated as (label, value),
# so get_similars receives the id — confirm against ipywidgets.
interact(get_similars, Player=dp_players);

interactive(children=(Dropdown(description='Player', options=(['18\xa0A. Aquilani', 103935], ['18\xa0A. Araos'…