In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Display every column when a frame is rendered, and enable pandas' copy-on-write mode.
pd.set_option("display.max_columns", None)
pd.set_option("mode.copy_on_write", True)

In [2]:
# Load the preprocessed dataset. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-type inference within a column.
PREPROCESSED_CSV = "../data/processed/preprocessed_data.csv"
df = pd.read_csv(PREPROCESSED_CSV, low_memory=False)

In [3]:
# Distinct season values present in the loaded frame.
df["season"].unique()

array([2017, 2016])

# Necessary Preprocessing

In [541]:
# Rebind `df` to an explicit deep copy before the in-place cleaning below.
df = df.copy(deep=True)
def _clean_sofifa_data( df):
    df.columns = df.columns.str.replace(" / ", "_").str.replace(" & ", "_").str.replace(" ", "_")

    df['height(cm)'] = df["height"].str.split("cm").str[0].astype(int)
    df['weight(kg)'] = df["weight"].str.split("kg").str[0].astype(int) 

    df['foot'] = df['foot'].map({"Left": 1, "Right": 2})

def parse_monetary_value(row):
    """Convert a money string such as ``"€1.5M"`` into a float amount.

    Parameters
    ----------
    row : str, number or None
        Text made of an optional "€" sign, a number and an optional "M"
        (millions) or "K" (thousands) suffix. Missing cells (``None`` / NaN)
        and values that are already numeric are accepted as well.

    Returns
    -------
    float
        The amount with the suffix expanded. NaN is returned for ``None``,
        NaN, and strings that are empty after the currency sign.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Missing cells reach .apply() as None/NaN rather than text; calling
    # .split on them would raise, so handle them before any string work.
    if row is None:
        return float("nan")
    if not isinstance(row, str):
        return float(row)  # numeric input passes through; NaN stays NaN

    # Use the text right after the first "€"; fall back to the whole string
    # when no sign is present instead of failing with IndexError.
    pieces = row.split('€')
    amount = (pieces[1] if len(pieces) > 1 else pieces[0]).strip()
    if not amount:
        return float("nan")

    if "M" in amount:
        value = float(amount.replace("M", "")) * 1_000_000
    elif "K" in amount:
        value = float(amount.replace("K", "")) * 1_000
    else:
        value = float(amount)

    return value

def extract_primary_position(row):
    """Return the first entry of a comma-separated position string.

    Parameters
    ----------
    row : str or missing value
        Text such as ``"FW,MF"``; surrounding whitespace is ignored.

    Returns
    -------
    str
        The text before the first comma, stripped of whitespace. Non-string
        input (for example NaN from a missing cell) is returned unchanged
        instead of raising on the ``in`` test.
    """
    if not isinstance(row, str):
        return row

    # split() yields the whole string as its only element when there is no
    # comma, so no separate branch is needed for single-position values.
    return row.split(",")[0].strip()

# Run the in-place clean-up first: it renames the columns and adds the numeric
# height/weight columns.
_clean_sofifa_data(df)

# Parse each money column into a numeric amount stored under "<name>(€)".
for money_col in ("value", "wage", "release_clause"):
    df[f"{money_col}(€)"] = df[money_col].apply(parse_monetary_value)

# Ratio of Tackles_TklW to Tackles_Tkl.
# NOTE(review): 'Tackles_Tkl%' is also listed in `deleted_columns` further down, so this
# column is dropped again by the visible cells — confirm it is needed.
df["Tackles_Tkl%"] = df["Tackles_TklW"].div(df["Tackles_Tkl"])

# Keep only the first listed position for each row.
df["pos"] = df["pos"].apply(extract_primary_position)


In [542]:
# Keep rows from season 2018 onwards.
# NOTE(review): the `df.season.unique()` output near the top of the notebook shows only
# 2016 and 2017; if that output reflects the current data, this filter leaves an empty
# frame — confirm the intended cutoff.
is_recent_season = df['season'] >= 2018
df = df.loc[is_recent_season]

**Dataset Now Ready For Preprocessing**

# Preprocessing

In [543]:
# Columns removed from the frame before modelling. Order is kept as originally written.
deleted_columns = [
    "real_face",
    "joined",
    "traits",
    "season_code",
    "wage",  # raw text; the parsed amount is stored in 'wage(€)'
    "club_kit_number",
    "attacking_work_rate",
    "traits.1",
    "value",  # raw text; the parsed amount is stored in 'value(€)'
    "playstyles_+",
    "playstyles",
    "height",  # raw text; the parsed number is stored in 'height(cm)'
    "id",
    "club_position",
    "acceleration_type",
    "defensive_work_rate",
    "body_type",
    "A-xAG",
    "weight",  # raw text; the parsed number is stored in 'weight(kg)'
    "loan_date_end",
    "Rec",
    "team_contract",
    "release_clause",  # raw text; the parsed amount is stored in 'release_clause(€)'
    "Standard_G/SoT",
    "Medium_Cmp%",
    "Long_Cmp%",
    "Challenges_Tkl%",
    "Take-Ons_Succ%",
    "Take-Ons_Tkld%",
    "Starts_Mn/Start",
    "Subs_Mn/Sub",
    "Aerial_Duels_Won%",
    "Tackles_Tkl%",
    "Short_Cmp%",
    "Total_Cmp%",
    "gk_diving",
    "age_y",
]

# Columns that must be non-missing; rows lacking any of them are dropped later via
# dropna(subset=delete_subset).
delete_subset = [
    "nation",
    "born",
    "Team_Success_PPM",
    "Team_Success_On-Off",
    "Team_Success_(xG)_On-Off",
    "Performance_2CrdY",
]

In [544]:
# Remove the columns listed for deletion.
df = df.drop(columns=deleted_columns)

# Keep rows with at least (column count - 3) non-missing values, i.e. at most three gaps,
# then require every column in `delete_subset` to be populated.
deleted_threshold = df.shape[1] - 3
df = df.dropna(axis=0, thresh=deleted_threshold).dropna(subset=delete_subset)

In [545]:
# Keep only the text before the first "-" in each potential value (values without a "-"
# are left as they are). The vectorised .str accessor leaves missing cells as NaN, whereas
# the previous per-element `"-" in x` test raised TypeError on them.
df['potential'] = df['potential'].str.split("-").str[0]

In [546]:
# Rating columns that are cast to float in the following statement.
object_to_num = [
    "overall_rating",
    "potential",
    "crossing",
    "finishing",
    "heading_accuracy",
    "short_passing",
    "volleys",
    "dribbling",
    "curve",
    "fk_accuracy",
    "long_passing",
    "ball_control",
    "acceleration",
    "sprint_speed",
    "agility",
    "reactions",
    "balance",
    "shot_power",
    "jumping",
    "stamina",
    "strength",
    "long_shots",
    "aggression",
    "interceptions",
    "attack_position",
    "vision",
    "penalties",
    "composure",
    "defensive_awareness",
    "standing_tackle",
]

# Cast every column named in `object_to_num` to float; all other columns keep their dtype.
df = df.astype(dict.fromkeys(object_to_num, float))

In [547]:
# Identifying columns come first; every remaining column keeps its current relative order.
# All leading columns are excluded from the tail so that no name is listed twice: the
# previous exclusion list omitted 'best_position' and 'born', which made `df[col_order]`
# select those two columns twice.
leading_columns = ['player', 'league', 'team', 'nation', 'pos', "best_position", 'age_x', 'born', 'season']
col_order = leading_columns + [col for col in df.columns if col not in leading_columns]

In [548]:
# Select the columns in the order given by `col_order`.
df = df.loc[:, col_order]

In [549]:
# Give the age and position columns their final names.
column_aliases = {'age_x': 'age', 'pos': 'general_position'}
df = df.rename(columns=column_aliases)

In [550]:
# When a column name occurs more than once, keep only its first occurrence.
is_first_occurrence = ~df.columns.duplicated(keep="first")
df = df.loc[:, is_first_occurrence]

In [551]:
# Order rows by player, then by season (both ascending).
df = df.sort_values(['player', 'season'])

In [552]:
# Number of rows per player (value_counts() ignores missing names).
player_counts = df['player'].value_counts()

# Retain only players that appear in three or more rows.
players_to_keep = player_counts.loc[player_counts.ge(3)].index
df = df.loc[df['player'].isin(players_to_keep)]