# SOFIFA preprocess

In [250]:
import pandas as pd
import pandas as pd
import numpy as np
import re

df_sofifa = pd.read_csv("../data/raw/sofifa_players_all_years.csv",
                low_memory=False)
df_sofifa.drop_duplicates(inplace=True)

In [251]:
def name_position(row):
    pattern = r'[A-Z]{2,}'
    match = re.search(pattern, row)
    if match:
        location = match.start()
        name = row[:location].strip()
        
        return name
    else:
        return None

def money(row): 
    row = row.split('€')[1]
    if "M" in row:
        value = float(row.replace("M", "")) * 1_000_000
    elif "K" in row:
        value = float(row.replace("K", "")) * 1_000
    else:
        value = float(row)

    return value

deleted_col = ["unnamed:_0", "unnamed:_76", "id", "joined", 
                "loan_date_end", "club_kit_number", "club_position", 
                "traits", "traits.1", "acceleration_type", "playstyles", 
                "playstyles_+", "attacking_work_rate", "defensive_work_rate",
                "real_face", "team_contract", "end_year", "height", "weight", 
                "body_type", "season_code", "start_year", "start_part",
                "best_position", "value", "wage", "release_clause"]

In [252]:
df_sofifa.columns = df_sofifa.columns.str.lower()
df_sofifa.columns = df_sofifa.columns.str.replace(" / ", "_").str.replace(" & ", "_").str.replace(" ", "_")

df_sofifa['height(cm)'] = df_sofifa["height"].str.split("cm").str[0].astype(int)
df_sofifa['weight(kg)'] = df_sofifa["weight"].str.split("kg").str[0].astype(int)

foot_mapping = {"Left": 1, "Right": 2}
df_sofifa['foot'] = df_sofifa['foot'].map(foot_mapping)

df_sofifa[['start_part', 'end_year']] = df_sofifa['team_contract'].str.split(' ~ ', expand=True)
df_sofifa[['team', 'start_year']] = df_sofifa['start_part'].str.extract(r'([A-Za-z]+)(\d{4})')


df_sofifa['name'] = df_sofifa['name'].apply(name_position)
df_sofifa["name"] = df_sofifa['name'].str.lower()

df_sofifa = df_sofifa[df_sofifa['best_position'] != "GK"].copy()


df_sofifa['value(€)'] = df_sofifa['value'].apply(money)
df_sofifa['wage(€)'] = df_sofifa['wage'].apply(money)
df_sofifa['release_clause(€)'] = df_sofifa['release_clause'].apply(money)

df_sofifa.drop(columns=deleted_col, inplace=True)

In [253]:
# import shutil
# shutil.copyfile("../../model_1/data/raw/total_raw/total_fielders.csv", "../data/raw/fbref.csv")

# Merging two datasets

In [254]:
df_fbref = pd.read_csv("../data/raw/fbref_players_all.csv")

def convert_season(season):
    season_str = str(season).zfill(4)
    last_two = int(season_str[2:])
    if last_two >= 90:
        return 1900 + last_two
    else:
        return 2000 + last_two

def change_name(row):
    row = row.lower()
    names = row.split(" ")
    if len(names) >= 2:
        first_name = names[0]
        last_name = names[-1]
        new_name = f"{first_name[0]}. {last_name}"
        return new_name
    else:
        return row

def pos_simplification(row):
    if "," in row:
        row = row.split(",")[0]
    
    return row.strip()

In [255]:
df_fbref['season'] = df_fbref['season'].apply(convert_season)
df_fbref = df_fbref[df_fbref['season'] >= 2007].copy()
df_fbref.reset_index(drop=True, inplace=True)
df_fbref.sort_values(by=['season', "Performance_Gls"], ascending=[False, False], inplace=True)


df_fbref['player'] = df_fbref['player'].apply(change_name)
df_fbref['pos'] = df_fbref['pos'].apply(pos_simplification)


common_names = set(df_sofifa.name).intersection(set(df_fbref.player))

df_sofifa.rename(columns={"name": "player"}, inplace=True)
merged_df = pd.merge(df_sofifa  , df_fbref, on=['player', 'season'])

In [256]:
merged_df["Tackles_Tkl%"] = merged_df["Tackles_TklW"] / merged_df["Tackles_Tkl"]


# merged_df['pos'] = merged_df['pos'].apply(pos_simplification)

# Real Preprocess

In [257]:
df_total = merged_df.copy()

In [258]:
df_total = merged_df.copy()
df_total.columns = df_total.columns.str.strip()

In [259]:
merged_df.columns

Index(['player', 'age_x', 'overall_rating', 'potential', 'foot',
       'best_overall', 'growth', 'total_attacking', 'crossing', 'finishing',
       ...
       'Team Success (xG)_onxG', 'Team Success (xG)_onxGA',
       'Team Success (xG)_xG+/-', 'Team Success (xG)_xG+/-90',
       'Team Success (xG)_On-Off', 'Performance_Recov', 'Aerial Duels_Won',
       'Aerial Duels_Lost', 'Aerial Duels_Won%', 'Tackles_Tkl%'],
      dtype='object', length=240)

# Stat imputation

In [260]:
FINAL_FEATURES = [
    # Identity
    "player", "season",

    # Physical / profile
    "age_x", "height(cm)", "weight(kg)", "pos", "foot",
    "weak_foot", "skill_moves", "international_reputation",

    # Ability summaries
    "total_attacking", "total_skill", "total_movement",
    "total_power", "total_mentality", "total_defending",

    # Usage
    "Playing Time_90s",

    # Attacking
    "Per 90 Minutes_G+A", "Expected_xG", "Expected_xAG",
    "KP", "CrsPA",

    # Progression
    "Total_Cmp%", "Total_TotDist", "Total_PrgDist",
    "Carries_TotDist", "Carries_PrgDist",

    # Defense
    "Tackles_Tkl%", "Challenges_Tkl%", "Int",
    "Blocks_Blocks", "Aerial Duels_Won%",

    # Discipline
    "Performance_CrdY", "Performance_CrdR",

    # Market (optional)
    "value(€)", "wage(€)", "release_clause(€)"
]

In [261]:
df_final = df_total[FINAL_FEATURES]

In [262]:
df_final = df_final.dropna(how="any")

In [263]:
df_final.shape

(12786, 37)

In [264]:
df_final = df_final[df_final["Playing Time_90s"] >= 5]

In [265]:
df_final = df_final.rename(columns={"age_x": "age"})

In [266]:
df_final.columns = df_final.columns.str.replace(" ", "_")

In [267]:
df_final


Unnamed: 0,player,season,age,height(cm),weight(kg),pos,foot,weak_foot,skill_moves,international_reputation,total_attacking,total_skill,total_movement,total_power,total_mentality,total_defending,Playing_Time_90s,Per_90_Minutes_G+A,Expected_xG,Expected_xAG,KP,CrsPA,Total_Cmp%,Total_TotDist,Total_PrgDist,Carries_TotDist,Carries_PrgDist,Tackles_Tkl%,Challenges_Tkl%,Int,Blocks_Blocks,Aerial_Duels_Won%,Performance_CrdY,Performance_CrdR,value(€),wage(€),release_clause(€)
0,m. salah,2025,32,175,72,FW,1,3,4,5,411,418,452,409,387,122,37.5,1.25,25.2,14.2,88.0,15.0,70.6,13343.0,3919.0,6909.0,3700.0,0.523810,47.1,9.0,12.0,39.1,1.0,0.0,104000000.0,350000.0,192400000.0
2,e. haaland,2025,23,195,94,FW,1,3,3,5,404,367,413,438,390,114,30.4,0.82,22.0,3.0,29.0,1.0,66.9,3086.0,593.0,1494.0,577.0,0.545455,37.5,5.0,12.0,53.3,2.0,0.0,157000000.0,270000.0,302200000.0
3,h. kane,2025,30,188,86,FW,2,4,3,5,441,397,362,431,397,130,26.5,1.32,20.3,5.3,35.0,2.0,78.9,9141.0,2258.0,2260.0,954.0,0.846154,33.3,2.0,16.0,59.2,5.0,0.0,117500000.0,170000.0,193900000.0
4,k. mbappé,2025,25,182,75,FW,2,4,5,5,411,408,460,425,355,92,32.3,1.05,25.9,7.7,51.0,3.0,82.9,13622.0,2382.0,6987.0,3808.0,0.625000,20.0,1.0,3.0,40.0,3.0,1.0,160000000.0,380000.0,340000000.0
5,j. bellingham,2025,21,186,75,MF,2,4,4,5,397,408,413,429,423,233,27.6,0.61,11.4,3.9,42.0,0.0,86.1,21413.0,4654.0,6054.0,2818.0,0.540984,66.7,28.0,37.0,49.2,5.0,1.0,174500000.0,280000.0,370800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13934,h. kane,2018,18,173,67,FW,2,3,3,1,279,267,339,270,282,168,34.2,0.94,24.8,2.7,32.0,2.0,68.5,8715.0,2185.0,4128.0,1401.0,0.705882,26.7,5.0,15.0,42.6,5.0,0.0,525000.0,9000.0,1300000.0
13956,d. avounou,2018,19,179,71,DF,1,2,2,1,217,209,262,258,224,188,7.2,0.00,0.3,0.2,3.0,1.0,67.8,2755.0,931.0,674.0,235.0,0.687500,29.4,10.0,12.0,38.5,1.0,0.0,350000.0,2000.0,919000.0
13957,j. deminguet,2018,19,178,74,MF,1,3,2,1,257,286,315,295,270,164,5.9,0.34,0.3,0.9,9.0,4.0,62.3,2269.0,590.0,1221.0,622.0,0.750000,28.6,10.0,5.0,63.6,2.0,0.0,325000.0,2000.0,715000.0
13965,s. omeonga,2018,21,177,72,MF,2,2,2,1,219,267,331,234,272,172,8.9,0.00,0.5,0.5,9.0,1.0,82.5,4831.0,1300.0,1657.0,914.0,0.600000,38.9,5.0,20.0,38.1,2.0,1.0,350000.0,4000.0,665000.0


In [7]:
import pandas as pd
df = pd.read_csv("../data/processed/preprocessed_data.csv")

  df = pd.read_csv("../data/processed/preprocessed_data.csv")


In [8]:
df.isna().sum()

Unnamed: 0               0
unnamed:_0           32310
player                   0
age_x                    0
overall_rating           0
                     ...  
Performance_Recov    18302
Aerial Duels_Won     18302
Aerial Duels_Lost    18302
Aerial Duels_Won%    18885
Tackles_Tkl%         19079
Length: 268, dtype: int64