# SoFIFA preprocessing

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import re

# Load the raw SoFIFA player table. low_memory=False makes pandas read the file
# in one pass instead of chunk by chunk when inferring column dtypes.
sofifa_csv_path = "../data/raw/sofifa_players_all_years.csv"
df_sofifa = pd.read_csv(sofifa_csv_path, low_memory=False)

# Remove rows that are exact duplicates of an earlier row.
df_sofifa.drop_duplicates(inplace=True)

In [None]:
def name_position(row):
    """Return the part of a raw name cell that precedes its upper-case suffix.

    The cell is cut at the first run of two or more consecutive ASCII capital
    letters (presumably the position codes appended to the name, e.g.
    "M. SalahRW ST" -> "M. Salah" -- confirm against the raw data).

    Parameters
    ----------
    row : str or missing value
        Raw content of one name cell.

    Returns
    -------
    str or None
        The stripped text before the first capital-letter run, or None when
        the cell is not a string (NaN/None) or contains no such run.
    """
    # Missing cells arrive as float NaN / None; re.search would raise on them.
    if not isinstance(row, str):
        return None

    match = re.search(r'[A-Z]{2,}', row)
    if match is None:
        return None

    return row[:match.start()].strip()

def money(row):
    """Convert a money string such as "€1.5M" or "€500K" to a float amount.

    Parameters
    ----------
    row : str, number or missing value
        Raw cell content. For strings, the text after the first "€" is parsed
        (the whole string if there is no "€"). An "M" suffix multiplies by
        1,000,000 and a "K" suffix by 1,000.

    Returns
    -------
    float
        The amount, or NaN when the cell is missing or empty.

    Raises
    ------
    ValueError
        If the text that remains after removing the suffix is not a number.
    """
    # Missing cells arrive as NaN/None rather than strings; pass plain numbers
    # through and map anything else that is not text to NaN.
    if not isinstance(row, str):
        try:
            return float(row)
        except (TypeError, ValueError):
            return float("nan")

    parts = row.split('€')
    # Same piece as before when a "€" is present; fall back to the whole string
    # instead of raising IndexError when it is not.
    amount = (parts[1] if len(parts) > 1 else parts[0]).strip()
    if not amount:
        return float("nan")

    if "M" in amount:
        return float(amount.replace("M", "")) * 1_000_000
    if "K" in amount:
        return float(amount.replace("K", "")) * 1_000
    return float(amount)

# Columns removed from the SoFIFA frame once the derived columns exist.
deleted_col = [
    # Unnamed columns (presumably index artefacts of the CSV export -- confirm).
    "unnamed:_0", "unnamed:_76",
    # Columns not used after cleaning.
    "id", "joined", "loan_date_end", "club_kit_number", "club_position",
    "traits", "traits.1", "acceleration_type", "playstyles", "playstyles_+",
    "attacking_work_rate", "defensive_work_rate", "real_face",
    # Raw contract text and one of the pieces split from it.
    "team_contract", "end_year",
    # Raw strings superseded by height(cm) / weight(kg).
    "height", "weight",
    "body_type", "season_code",
    # Remaining intermediates of the contract parsing.
    "start_year", "start_part",
    # Only needed for the goalkeeper filter.
    "best_position",
    # Raw money strings superseded by the "(€)" columns.
    "value", "wage", "release_clause",
]

In [3]:
# Normalise column labels: lower-case, then turn " / ", " & " and plain spaces
# into underscores (e.g. "team & contract" -> "team_contract").
df_sofifa.columns = (
    df_sofifa.columns.str.lower()
    .str.replace(" / ", "_", regex=False)
    .str.replace(" & ", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

# Keep the integer in front of the unit, e.g. "175cm 5'9\"" -> 175.
df_sofifa['height(cm)'] = df_sofifa["height"].str.split("cm").str[0].astype(int)
df_sofifa['weight(kg)'] = df_sofifa["weight"].str.split("kg").str[0].astype(int)

# Encode the preferred foot numerically; any other value becomes NaN.
foot_mapping = {"Left": 1, "Right": 2}
df_sofifa['foot'] = df_sofifa['foot'].map(foot_mapping)

# Contract text looks like "<team><start year> ~ <end year>".
# n=1 caps the split at two pieces and reindex guarantees exactly two columns,
# so the two-column assignment cannot fail on unusual rows.
contract_parts = (
    df_sofifa['team_contract']
    .str.split(' ~ ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df_sofifa[['start_part', 'end_year']] = contract_parts

# Team = everything before the trailing four-digit start year. The previous
# pattern ([A-Za-z]+)(\d{4}) only kept the last ASCII word of the team name
# ("Manchester City2019" -> "City", "FC Bayern München2023" -> "nchen") and
# matched nothing for names such as "AS Reggina 19142007".
df_sofifa[['team', 'start_year']] = df_sofifa['start_part'].str.extract(
    r'^(.*?)\s*(\d{4})\s*$'
)

# Cut the trailing upper-case codes off the player name and lower-case it so it
# can be compared with the FBref names.
df_sofifa['name'] = df_sofifa['name'].apply(name_position).str.lower()

# Goalkeepers are excluded; .copy() detaches the result from the original frame
# so the column assignments below do not warn.
df_sofifa = df_sofifa[df_sofifa['best_position'] != "GK"].copy()

# Parse the money strings into numeric amounts.
df_sofifa['value(€)'] = df_sofifa['value'].apply(money)
df_sofifa['wage(€)'] = df_sofifa['wage'].apply(money)
df_sofifa['release_clause(€)'] = df_sofifa['release_clause'].apply(money)

# Drop the raw and intermediate columns now that the derived ones exist.
df_sofifa.drop(columns=deleted_col, inplace=True)

In [4]:
# import shutil
# shutil.copyfile("../../model_1/data/raw/total_raw/total_fielders.csv", "../data/raw/fbref.csv")

# Merging the SoFIFA and FBref datasets

In [5]:
# Load the raw FBref player table.
fbref_csv_path = "../data/raw/fbref_players_all.csv"
df_fbref = pd.read_csv(fbref_csv_path)

def convert_season(season):
    """Turn a season code into a four-digit year.

    The code is converted to text and left-padded with zeros to four
    characters; everything after the first two characters is read as a
    two-digit year. Values of 90 and above map to 19xx, all others to 20xx
    (presumably codes like 2425 stand for the 2024/25 season -- confirm).

    Parameters
    ----------
    season : int or str
        Season code, e.g. 2425 or 708.

    Returns
    -------
    int
        The four-digit year, e.g. 2025 or 2008.
    """
    two_digit_year = int(str(season).zfill(4)[2:])
    century = 1900 if two_digit_year >= 90 else 2000
    return century + two_digit_year

def change_name(row):
    """Abbreviate a full player name to "<first initial>. <last word>".

    The name is lower-cased and split on any whitespace, so leading, trailing
    or repeated spaces no longer raise IndexError or leave an empty last name.

    Parameters
    ----------
    row : str or missing value
        Full player name, e.g. "Mohamed Salah".

    Returns
    -------
    str or the original value
        "m. salah" for multi-word names; the lower-cased name itself when it
        has a single word; the input unchanged when it is not a string.
    """
    # Missing names (NaN/None) are passed through untouched.
    if not isinstance(row, str):
        return row

    names = row.lower().split()
    if len(names) >= 2:
        return f"{names[0][0]}. {names[-1]}"
    return " ".join(names)

def pos_simplification(row):
    """Reduce a comma-separated position string to its first entry.

    Parameters
    ----------
    row : str
        Position text, e.g. "FW,MF".

    Returns
    -------
    str
        The text before the first comma (or the whole text when there is no
        comma), with surrounding whitespace removed.
    """
    primary = row.split(",")[0] if "," in row else row
    return primary.strip()

In [6]:
# Convert FBref season codes to years and keep seasons from 2007 onwards,
# ordered by season and goals (both descending).
df_fbref['season'] = df_fbref['season'].map(convert_season)
recent_rows = df_fbref['season'] >= 2007
df_fbref = (
    df_fbref.loc[recent_rows]
    .reset_index(drop=True)
    .sort_values(by=['season', "Performance_Gls"], ascending=[False, False])
)

# Bring player names and positions into the format used for the join.
df_fbref['player'] = df_fbref['player'].map(change_name)
df_fbref['pos'] = df_fbref['pos'].map(pos_simplification)

# Names that occur in both sources (taken before 'name' is renamed below).
common_names = set(df_sofifa['name']) & set(df_fbref['player'])

# Align the key column name, then join on player and season
# (pandas merges with how="inner" by default).
df_sofifa = df_sofifa.rename(columns={"name": "player"})
merged_df = df_sofifa.merge(df_fbref, on=['player', 'season'])

In [7]:
# Share of tackles won, on the same 0-100 scale as the other "%" columns taken
# from FBref (e.g. "Aerial Duels_Won%"). A zero denominator is masked to NaN so
# the result can never be +/-inf.
tackles_attempted = merged_df["Tackles_Tkl"]
merged_df["Tackles_Tkl%"] = (
    merged_df["Tackles_TklW"] / tackles_attempted.where(tackles_attempted != 0) * 100
)


# merged_df['pos'] = merged_df['pos'].apply(pos_simplification)

# Real Preprocess

In [8]:
# Work on a copy so that later edits to df_total leave merged_df unchanged.
df_total = merged_df.copy()

In [9]:
# Copy merged_df with leading/trailing whitespace stripped from every column
# label; merged_df itself is left untouched.
df_total = merged_df.rename(columns=str.strip)

# Stat imputation

In [11]:
# Columns kept for the final dataset, organised by theme. The dict preserves
# insertion order, so FINAL_FEATURES lists the groups top to bottom.
_FEATURE_GROUPS = {
    "identity": ["player", "season"],
    "physical_profile": [
        "age_x", "height(cm)", "weight(kg)", "pos", "foot",
        "weak_foot", "skill_moves", "international_reputation",
    ],
    "ability_summaries": [
        "total_attacking", "total_skill", "total_movement",
        "total_power", "total_mentality", "total_defending",
    ],
    "usage": ["Playing Time_90s"],
    "attacking": [
        "Per 90 Minutes_G+A", "Expected_xG", "Expected_xAG",
        "KP", "CrsPA",
    ],
    "progression": [
        "Total_Cmp%", "Total_TotDist", "Total_PrgDist",
        "Carries_TotDist", "Carries_PrgDist",
    ],
    "defense": [
        "Tackles_Tkl%", "Challenges_Tkl%", "Int",
        "Blocks_Blocks", "Aerial Duels_Won%",
    ],
    "discipline": ["Performance_CrdY", "Performance_CrdR"],
    # Market (optional)
    "market": ["value(€)", "wage(€)", "release_clause(€)"],
}

FINAL_FEATURES = [
    column for group in _FEATURE_GROUPS.values() for column in group
]

In [12]:
# Keep only the selected feature columns, in FINAL_FEATURES order.
df_final = df_total.loc[:, FINAL_FEATURES]

In [13]:
# Discard every row that has a missing value in at least one feature.
df_final = df_final.dropna(axis="index", how="any")

# Test

In [30]:
import pandas as pd
import numpy as np
import re
import numpy as np

# Directory that holds the raw CSV inputs.
BASE_DIR = "../data/raw/"

sofifa_path = f"{BASE_DIR}sofifa_players.csv"
fbref_path = f"{BASE_DIR}fbref_players.csv"

# low_memory=False makes pandas read the file in one pass when inferring dtypes.
df_sofifa = pd.read_csv(sofifa_path, low_memory=False)
df_fbref = pd.read_csv(fbref_path)

In [31]:
def extract_name_from_position(row):
    """Return the part of a raw name cell that precedes its upper-case suffix.

    The cell is cut at the first run of two or more consecutive ASCII capital
    letters (presumably the position codes appended to the name, e.g.
    "M. SalahRW ST" -> "M. Salah" -- confirm against the raw data).

    Parameters
    ----------
    row : str or missing value
        Raw content of one name cell.

    Returns
    -------
    str or None
        The stripped text before the first capital-letter run. None when the
        cell is not a string or contains no such run; previously the
        placeholder "HELLO" was returned here, which ended up in the data as a
        player called "hello".
    """
    # Missing cells arrive as float NaN / None; re.search would raise on them.
    if not isinstance(row, str):
        return None

    match = re.search(r'[A-Z]{2,}', row)
    if match is None:
        return None

    return row[:match.start()].strip()

# Remove exact duplicate rows, then trim and lower-case the column labels.
df_sofifa.drop_duplicates(inplace=True)
df_sofifa.columns = [label.strip().lower() for label in df_sofifa.columns]

# Cut the trailing upper-case codes off each name and lower-case the result.
df_sofifa['name'] = df_sofifa['name'].map(extract_name_from_position).str.lower()

In [32]:
def parse_season_code(season):
    """Turn a season code into a four-digit year.

    The code is converted to text, zero-padded on the left to four characters,
    and the part after the first two characters is read as a two-digit year:
    below 90 it is placed in the 2000s, otherwise in the 1900s.

    Parameters
    ----------
    season : int or str
        Season code, e.g. 2425 or 708.

    Returns
    -------
    int
        The four-digit year, e.g. 2025 or 2008.
    """
    padded_code = str(season).zfill(4)
    year_suffix = int(padded_code[2:])

    if year_suffix < 90:
        return 2000 + year_suffix
    return 1900 + year_suffix

# Replace each FBref season code with its four-digit year.
df_fbref['season'] = df_fbref['season'].map(parse_season_code)

In [None]:
def initial_format_player_name(row):
    """Format a full name as "<first initial>. <last word>" in lower case.

    Splitting is done on any whitespace, so leading, trailing or repeated
    spaces no longer raise IndexError or produce an empty last name.

    Parameters
    ----------
    row : str or missing value
        Full player name, e.g. "Erling Haaland".

    Returns
    -------
    str or the original value
        "e. haaland" for multi-word names; the lower-cased name itself when it
        is a single word; the input unchanged when it is not a string.
    """
    # Missing names (NaN/None) are passed through untouched.
    if not isinstance(row, str):
        return row

    words = row.lower().split()
    if len(words) < 2:
        return " ".join(words)

    first_word, last_word = words[0], words[-1]
    return f"{first_word[0]}. {last_word}"

# Keep seasons from 2007 onwards; .copy() detaches the result from the original
# frame so the column assignment below does not warn.
df_fbref = df_fbref.loc[df_fbref['season'] >= 2007].copy()

# Abbreviate the names and expose them under the join key 'name'.
df_fbref['player'] = df_fbref['player'].map(initial_format_player_name)
df_fbref = df_fbref.rename(columns={"player": "name"})

In [36]:
# Rename FBref's 'player' column to 'name'. If that column is already gone this
# call changes nothing, because DataFrame.rename skips labels it cannot find
# (errors="ignore" is the default).
df_fbref.rename(columns={"player": "name"}, inplace=True)

In [38]:
# Join the two sources on name and season (how="inner" is the pandas default).
merged_df = df_sofifa.merge(df_fbref, on=['name', 'season'])

In [41]:
# Show the merged frame as this cell's output.
merged_df

Unnamed: 0,unnamed: 0,name,age_x,overall rating,potential,team & contract,id,height,weight,foot,...,Receiving_PrgR,Team Success (xG)_onxG,Team Success (xG)_onxGA,Team Success (xG)_xG+/-,Team Success (xG)_xG+/-90,Team Success (xG)_On-Off,Performance_Recov,Aerial Duels_Won,Aerial Duels_Lost,Aerial Duels_Won%
0,,m. salah,32,91,91,Liverpool2017 ~ 2027,209331,"175cm 5'9""",72kg 159lbs,Left,...,488.0,81.6,38.1,43.5,1.16,1.05,101.0,9.0,14.0,39.1
1,,rodri,28,91,91,Manchester City2019 ~ 2027,231866,"190cm 6'3""",82kg 181lbs,Right,...,1.0,2.2,0.3,1.9,2.36,1.86,7.0,1.0,0.0,100.0
2,,e. haaland,23,90,92,Manchester City2022 ~ 2034,239085,"195cm 6'5""",94kg 207lbs,Left,...,124.0,56.2,40.5,15.7,0.52,-0.10,34.0,57.0,50.0,53.3
3,,h. kane,30,90,90,FC Bayern München2023 ~ 2027,202126,"188cm 6'2""",86kg 190lbs,Right,...,143.0,65.0,17.8,47.2,1.79,0.60,61.0,29.0,20.0,59.2
4,,k. mbappé,25,90,93,Real Madrid2024 ~ 2029,231747,"182cm 6'0""",75kg 165lbs,Right,...,386.0,69.0,35.2,33.8,1.05,1.27,36.0,4.0,6.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48276,,m. izco,23,48,62,Catania2009,165490,"170cm 5'7""",68kg 150lbs,Right,...,,,,,,,,,,
48277,,l. cigarini,20,47,71,Parma2011,173146,"173cm 5'8""",74kg 163lbs,Right,...,,,,,,,,,,
48278,,a. barillà,18,47,54,AS Reggina 19142007,173155,"180cm 5'11""",75kg 165lbs,Right,...,,,,,,,,,,
48279,,s. burrai,19,47,54,Cagliari2010,178510,"177cm 5'10""",72kg 159lbs,Right,...,,,,,,,,,,
