<a href="https://colab.research.google.com/github/tiagosilveiraa/portifoliods/blob/main/Players_Recomendation_using_fifa22_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0 - Importing Libraries

In [402]:
import pandas as pd
import requests
import re
from decimal import Decimal
import numpy as np
from datetime import datetime
import time
from google.colab import data_table

In [403]:
# Turn off Colab's DataFrame formatter so frames use the default pandas display
# (presumably the plain HTML table — confirm against the Colab docs).
data_table.disable_dataframe_formatter()
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(777)

# 1 - Loading dataset

In [404]:
# Read the FIFA 22 player table straight from the GitHub-hosted CSV (comma separated).
df = pd.read_csv('https://github.com/tiagosilveiraa/portifoliods/raw/main/datasets/FIFA22_official_data.csv', sep=',')

In [405]:
# Normalise column labels to snake_case (spaces -> underscores, lower case),
# then give the generic 'value' column a more explicit name.
df = (
    df
    .rename(columns=lambda label: label.replace(" ", "_").lower())
    .rename(columns={'value': 'market_value'})
)

# 2 - Data cleaning

## 2.1 - Formatting Values

In [406]:
def str_to_currency(value):
  """Convert a money string such as '€1.5M', '€10K' or '€500' into a number.

  Everything except digits and dots is stripped from the string to obtain the
  numeric part. A 'K' in the original string multiplies it by 1,000, an 'M'
  by 1,000,000; a string with neither suffix is returned as the plain amount
  (previously such values were silently turned into None and later dropped).

  Parameters
  ----------
  value : str or any
      Raw cell content.

  Returns
  -------
  decimal.Decimal or float
      The parsed amount, or ``np.nan`` when `value` is not a string or
      contains no digits at all.

  Raises
  ------
  decimal.InvalidOperation
      If the stripped text is not a valid decimal number (e.g. '1.2.3').
  """
  if not isinstance(value, str):
    return np.nan

  digits = re.sub(r'[^\d.]', '', value)
  if not digits:
    # Nothing numeric to parse (e.g. an empty or purely textual cell).
    return np.nan

  amount = Decimal(digits)
  if 'K' in value:
    return amount * 1000
  if 'M' in value:
    return amount * 1000000
  # No magnitude suffix: the amount is already expressed in units.
  return amount

In [407]:
# Parse every monetary column with the same currency parser.
for money_col in ('release_clause', 'wage', 'market_value'):
  df[money_col] = df[money_col].apply(str_to_currency)

In [408]:
def to_timestamp(date):
    """Turn a raw date cell into a local-time POSIX timestamp.

    The textual length of `date` selects the interpretation:
    * longer than 4 characters: parsed with the format '%b %d, %Y';
    * exactly 4 characters: treated as a year and mapped to June 30 of it;
    * anything shorter: December 31 of the year after the current one.

    Any parsing error propagates to the caller unchanged.
    """
    text_length = len(str(date))
    if text_length > 4:
      parsed = datetime.strptime(date, '%b %d, %Y')
    elif text_length == 4:
      parsed = datetime(int(date), 6, 30)
    else:
      parsed = datetime(datetime.today().year + 1, 12, 31)
    return time.mktime(parsed.timetuple())

In [409]:
def to_date(date):
    """Turn a raw date cell into a 'DD-MM-YYYY' string.

    The textual length of `date` selects the interpretation:
    * longer than 4 characters: parsed with the format '%b %d, %Y';
    * exactly 4 characters: treated as a year and mapped to June 30 of it;
    * anything shorter: December 31 of the year after the current one.

    Any parsing error propagates to the caller unchanged.
    """
    text_length = len(str(date))
    if text_length > 4:
      parsed = datetime.strptime(date, '%b %d, %Y')
    elif text_length == 4:
      parsed = datetime(int(date), 6, 30)
    else:
      parsed = datetime(datetime.today().year + 1, 12, 31)
    return parsed.strftime('%d-%m-%Y')

In [410]:
# Keep a human-readable 'DD-MM-YYYY' copy of each raw date column in '<column>_date'.
for source_col in ('contract_valid_until', 'joined'):
  df[f'{source_col}_date'] = df[source_col].apply(to_date)

In [411]:
# Overwrite the raw date columns with numeric timestamps.
for date_col in ('contract_valid_until', 'joined'):
  df[date_col] = df[date_col].apply(to_timestamp)

In [412]:
def remove_html(value):
  """Strip HTML tags from a string, keeping the text between them.

  Each tag is matched from '<' up to the first following '>', so adjacent
  tags no longer swallow the text that sits between them and tags of any
  length are removed (the previous greedy, length-limited patterns turned
  '<b>x</b>' into an empty string).

  Parameters
  ----------
  value : str or any
      Raw cell content.

  Returns
  -------
  str or None
      The text without tags, or None when `value` is not a string.
  """
  if not isinstance(value, str):
    return None
  return re.sub(r'<[^>]*>', '', value)

In [413]:
# Both columns carry HTML markup around the actual value.
for html_col in ('position', 'loaned_from'):
  df[html_col] = df[html_col].apply(remove_html)

In [414]:
def str_to_number(value):
  """Extract the numeric part of a string such as '180cm' or '72.5kg'.

  Everything except digits and dots is stripped before conversion.

  Parameters
  ----------
  value : str or any
      Raw cell content.

  Returns
  -------
  decimal.Decimal or None
      The parsed number, or None when `value` is not a string (this includes
      NaN, which is truthy and used to crash inside ``re.sub``) or holds no
      digits at all.

  Raises
  ------
  decimal.InvalidOperation
      If the stripped text is not a valid decimal number (e.g. '1.2.3').
  """
  if not isinstance(value, str):
    return None

  digits = re.sub(r'[^\d.]', '', value)
  if not digits:
    return None
  return Decimal(digits)

In [415]:
# Height and weight arrive as text with a unit; keep only the number.
for measure_col in ('height', 'weight'):
  df[measure_col] = df[measure_col].apply(str_to_number)

In [416]:
# Remove every digit from the player names.
# The pattern is a raw string: '\d' in a normal literal is an invalid escape sequence
# (a SyntaxWarning on recent Python versions). The vectorised .str accessor also leaves
# missing names untouched instead of raising inside re.sub.
df['name'] = df['name'].str.replace(r'\d', '', regex=True)

## 2.2 - Changing datatypes

In [417]:
# The parsers above leave Decimal/None objects behind; cast those columns to numeric dtypes.
for numeric_col in ('wage', 'market_value', 'height', 'weight', 'release_clause'):
  df[numeric_col] = pd.to_numeric(df[numeric_col])

## 2.2 - Filling NA values

In [418]:
# List the columns that contain at least one missing value.
has_missing = df.isna().any()
has_missing[has_missing].index.tolist()

['club',
 'market_value',
 'wage',
 'body_type',
 'real_face',
 'position',
 'jersey_number',
 'loaned_from',
 'volleys',
 'curve',
 'agility',
 'balance',
 'jumping',
 'interceptions',
 'positioning',
 'vision',
 'composure',
 'marking',
 'slidingtackle',
 'release_clause',
 'defensiveawareness']

Removing players without Market Value, Club, Wage, Release Clause or Joined Date, because this information cannot be inferred.

In [419]:
# Discard rows that miss any of the fields that cannot be inferred.
required_columns = ['market_value', 'club', 'wage', 'release_clause', 'joined']
df = df.dropna(subset=required_columns)

Here we assume that any player without a recorded lending club is not on loan.

In [420]:
# Tag players with no lending club as 'NL'.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# that chained form acts on an intermediate object, warns on pandas 2.x and does not
# update df at all once Copy-on-Write is enabled.
df['loaned_from'] = df['loaned_from'].fillna('NL')

The Marking attribute was replaced by DefensiveAwareness in FIFA 22, so the two columns are unified into one.

In [421]:
# Where defensiveawareness is missing, take the value from the marking column of the same row.
df['defensiveawareness'] = df['defensiveawareness'].fillna(df['marking'])

In [422]:
# marking has been folded into defensiveawareness, so the column can go.
df = df.drop(columns=['marking'])

In [423]:
# 'RES' and 'SUB' are not playing positions: use the player's best position instead.
bench_mask = df['position'].isin(['RES', 'SUB'])
df.loc[bench_mask, 'position'] = df.loc[bench_mask, 'best_position']

## 2.3 - Exploding attributes

The Work Rate attribute combines the Attacking and Defensive work rates. In this dataset both are provided in the same column, so I had to split it into two separate columns.

In [424]:
# Split each 'work_rate' cell on '/' (after removing spaces): the first part goes to
# awr (attacking), the second to dwr (defensive). Non-string cells yield NaN in both lists.
awr, dwr = [], []
for rate in df['work_rate'].tolist():
  if type(rate) is not str:
    awr.append(np.nan)
    dwr.append(np.nan)
    continue
  parts = rate.replace(' ', '').split('/')
  awr.append(parts[0])
  dwr.append(parts[1])

In [425]:
# Attach the two work-rate lists as new columns (same row order as df).
df = df.assign(attacking_wr=awr, defensive_wr=dwr)

# 3 - Data Modeling

Dropping the columns that are unnecessary for the model

In [426]:
# Columns left out of the feature frame X.
columns_not_used_by_model = [
    'photo', 'flag', 'jersey_number', 'joined', 'club_logo', 'work_rate',
    'name', 'special', 'contract_valid_until_date', 'joined_date',
]
X = df.drop(columns=columns_not_used_by_model)

## 3.1 - Applying encodings

In [427]:
from sklearn.preprocessing import OneHotEncoder

In [428]:
# Columns still stored as Python objects (text) and therefore in need of encoding.
X.select_dtypes(include='object').columns

Index(['nationality', 'club', 'preferred_foot', 'body_type', 'real_face',
       'position', 'loaned_from', 'best_position', 'attacking_wr',
       'defensive_wr'],
      dtype='object')

In [429]:
# Ordinal-encode both work-rate columns: the position in wr_order becomes the integer code.
wr_order = ['Low', 'Medium', 'High']
for wr_col in ('attacking_wr', 'defensive_wr'):
  as_category = X[wr_col].astype('category')
  X[wr_col] = as_category.cat.reorder_categories(wr_order).cat.codes

In [430]:
# Encode real_face as integer codes following rf_order ('No' first, 'Yes' second).
rf_order = ['No', 'Yes']
X['real_face'] = (
    X['real_face']
    .astype('category')
    .cat.reorder_categories(rf_order)
    .cat.codes
)

In [431]:
# One independent one-hot encoder per categorical attribute.
# club and loaned_from get no encoder.
(
    one_nation,
    one_foot,
    one_body_type,
    one_position,
    one_best_position,
) = (OneHotEncoder() for _ in range(5))

In [432]:
# Fit each encoder on its own single-column frame and keep the encoded matrix.
# The results are converted with .toarray() further down.
enc_nation = one_nation.fit_transform(X[['nationality']])
#enc_club = one_club.fit_transform(X[['club']])
enc_foot = one_foot.fit_transform(X[['preferred_foot']])
enc_body_type = one_body_type.fit_transform(X[['body_type']])
enc_position = one_position.fit_transform(X[['position']])
#enc_loaned_from = one_loaned_from.fit_transform(X[['loaned_from']])
enc_best_position = one_best_position.fit_transform(X[['best_position']])

In [433]:
# Attach the one-hot columns to X.
# Each encoded matrix becomes its own frame (aligned on X's index) and all of them are
# joined in a single concat: inserting the dummy columns one by one fragments the frame
# and makes pandas emit a PerformanceWarning.
# Nationality, foot and body-type dummies are named after the category itself; position
# dummies get a 'default_' prefix and best-position dummies a 'best_' prefix.
# club and loaned_from are not one-hot encoded.
encoded_frames = [
    pd.DataFrame(enc_nation.toarray(), index=X.index, columns=one_nation.categories_[0]),
    pd.DataFrame(enc_foot.toarray(), index=X.index, columns=one_foot.categories_[0]),
    pd.DataFrame(enc_body_type.toarray(), index=X.index, columns=one_body_type.categories_[0]),
    pd.DataFrame(enc_position.toarray(), index=X.index,
                 columns=[f'default_{x}' for x in one_position.categories_[0]]),
    pd.DataFrame(enc_best_position.toarray(), index=X.index,
                 columns=[f'best_{x}' for x in one_best_position.categories_[0]]),
]
X = pd.concat([X, *encoded_frames], axis=1)

  self[col] = igetitem(value, i)


In [434]:
# Remove the remaining text columns from the feature frame.
text_columns = ['nationality', 'club', 'preferred_foot', 'body_type', 'position',
                'loaned_from', 'best_position']
X = X.drop(columns=text_columns)

## 3.2 - Rescaling values

In [435]:
from sklearn.preprocessing import MinMaxScaler

In [436]:
# Scaler used to rescale the feature columns of X (default MinMaxScaler settings).
scaler = MinMaxScaler()

In [437]:
# Placeholder for the scaled feature matrix.
X_scaled = pd.DataFrame()

In [438]:
# Rescale every column of X and wrap the result in a frame with the same column labels.
# Building the frame in one constructor call avoids the PerformanceWarning pandas raises
# when hundreds of columns are inserted one by one into an empty frame.
# NOTE(review): X still contains the 'id' column here, so the player id is scaled and
# used as a model feature — confirm whether that is intended.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

  self[col] = igetitem(value, i)


## 3.3 - Setting dataset index

In [439]:
# Index the scaled rows by the (unscaled) player id so they can be looked up with .loc.
X_scaled = X_scaled.set_index(X['id'], drop=True)

## 3.4 - Modeling

In [440]:
from ipywidgets import widgets, interact
from IPython.display import display

In [441]:
# Groups of positions treated as interchangeable when looking for similar players.
center_midfield = ['CDM', 'CM', 'CAM']
defenders = ['CB', 'RB', 'LB', 'LWB', 'RWB']
wingers = ['LW', 'RW', 'LM', 'RM']
attackers = ['CF', 'ST']
# Position codes are upper-case like every other group here; the lower-case 'gk' never
# matched a goalkeeper's best position, nor the 'best_GK' one-hot column built from it.
gk = ['GK']

In [442]:
def get_equivalent_positions(position):
  """Return the position group (list) that contains `position`.

  Groups are checked in this order: center_midfield, defenders, wingers,
  attackers, gk. The matching list object itself is returned; None is
  returned when no group contains `position`.
  """
  for group in (center_midfield, defenders, wingers, attackers, gk):
    if position in group:
      return group
  return None

In [443]:
# [name, id] pairs sorted by name, used as the options of the player dropdown.
dp_players = (
    df[['name', 'id']]
    .sort_values(by=['name'])
    .to_numpy()
    .tolist()
)

In [444]:
def get_players_by_equivalent_position(position):
  """Return the rows of X_scaled whose best position is equivalent to `position`.

  Steps:
  1. Find the group of positions equivalent to `position` and build the
     matching one-hot column names ('best_<position>').
  2. Check, cell by cell, which of those columns hold the value 1.
  3. Flag every row where at least one of those columns is 1.
  4. Keep only the flagged rows.
  """
  equivalent_positions = [f'best_{x}' for x in get_equivalent_positions(position)]
  # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in DataFrame.any.
  in_group = X_scaled[equivalent_positions].isin([1]).any(axis=1)
  return X_scaled[in_group]

In [445]:
from sklearn.neighbors import NearestNeighbors
def fit_model(dataset, n_neighbors=10):
  """Fit a brute-force nearest-neighbours model on `dataset`.

  Parameters
  ----------
  dataset : array-like
      Feature matrix, one row per player.
  n_neighbors : int, default 10
      Number of neighbours the fitted model returns per query. The default
      keeps the previous hard-coded value.

  Returns
  -------
  NearestNeighbors
      The fitted model.
  """
  model = NearestNeighbors(algorithm='brute', n_neighbors=n_neighbors)
  model.fit(dataset)
  return model

In [446]:
# Copy of the player table indexed by player id, for id-based row lookups.
df_indexed = df.set_index('id')

In [447]:
def fit_model_by_position(Player):
  """Return the players most similar to `Player` among those in an equivalent position.

  `Player` is a player id. Its best position selects the candidate rows, a
  nearest-neighbours model is fitted on those rows only, and the neighbours of
  `Player` are returned as produced by get_similars.
  """
  player_rows = df.loc[df['id'] == Player]
  position = player_rows['best_position'].values[0]
  candidates = get_players_by_equivalent_position(position)
  return get_similars(fit_model(candidates.values), Player, candidates)

def get_similars(model, Player, filtered_dataset):
  """Query `model` for the neighbours of `Player` and describe them.

  `filtered_dataset` is the id-indexed frame the model was fitted on. The
  model answers with row positions inside that frame, so they are mapped
  back to player ids before the descriptive columns are looked up in
  df_indexed. The returned frame has the columns name, club, nationality,
  best_position and distances.
  """
  query_vector = filtered_dataset.loc[Player].values.reshape(1, -1)
  distance, neighbor_positions = model.kneighbors(query_vector)
  # Row positions -> player ids (the index labels of the filtered frame).
  neighbor_ids = filtered_dataset.index[neighbor_positions[0]]
  similar_players = df_indexed.loc[neighbor_ids].copy()
  similar_players['distances'] = list(map(float, distance[0]))
  output_columns = ['name', 'club', 'nationality', 'best_position', 'distances']
  return similar_players[output_columns]

# 5 - Presenting results

In [448]:
# Show a dropdown built from dp_players; each selection calls fit_model_by_position(Player=...).
# Presumably ipywidgets treats every [name, id] pair as (label, value), so the id is passed — confirm.
# The trailing ';' suppresses the cell's return-value output.
interact(fit_model_by_position, Player=dp_players);

interactive(children=(Dropdown(description='Player', options=(['A Lan', 188044], ['A. Abdallah', 237347], ['A.…

# 6 - Exporting results

In [449]:
from google.colab import files
from joblib import dump

In [450]:
# Write the cleaned player table to players.json as a list of row records.
df.to_json('players.json', orient='records')
# Hand the file to the Colab download helper.
files.download('players.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>