In [1]:
import os
import pandas as pd

RAW = "raw_data"
CLEAN = "null_removed"

os.makedirs(CLEAN, exist_ok=True)

In [2]:
def clean_common_columns(df):
    """Cleans the columns shared between players & coaches"""
    
    # Convert DOB
    if "dob" in df.columns:
        df["dob"] = pd.to_datetime(df["dob"], errors="coerce")
        df["dob"].fillna(pd.to_datetime("1900-01-01"), inplace=True)
    
    # Fill string columns
    string_cols = df.select_dtypes(include="object").columns
    for col in string_cols:
        df[col].fillna("None", inplace=True)
    
    # Fill numeric columns
    numeric_cols = df.select_dtypes(include=["float64","int64"]).columns
    for col in numeric_cols:
        df[col].fillna(0, inplace=True)
    
    return df

In [3]:
def clean_players(filename):
    df = pd.read_csv(f"{RAW}/{filename}")

    # Player-specific fixes
    if "club_loaned_from" in df.columns:
        df["club_loaned_from"].fillna("None", inplace=True)
    
    if "player_tags" in df.columns:
        df["player_tags"].fillna("None", inplace=True)
    
    if "player_traits" in df.columns:
        df["player_traits"].fillna("None", inplace=True)

    df = clean_common_columns(df)
    
    # Save
    df.to_csv(f"{CLEAN}/{filename}", index=False)
    print(f"Saved → {CLEAN}/{filename}")

In [4]:
def clean_coaches(filename):
    df = pd.read_csv(f"{RAW}/{filename}")
    df = clean_common_columns(df)
    df.to_csv(f"{CLEAN}/{filename}", index=False)
    print(f"Saved → {CLEAN}/{filename}")

In [5]:
def clean_teams(filename):
    df = pd.read_csv(f"{RAW}/{filename}")
    
    # Captain logic: missing captain → set 0
    if "captain" in df.columns:
        df["captain"].fillna(0, inplace=True)

    # Set-piece roles → fill with captain
    set_piece_cols = [
        "short_free_kick","long_free_kick","left_short_free_kick",
        "right_short_free_kick","penalties","left_corner","right_corner"
    ]

    for col in set_piece_cols:
        if col in df.columns:
            df[col].fillna(df["captain"], inplace=True)

    df = clean_common_columns(df)
    df.to_csv(f"{CLEAN}/{filename}", index=False)
    print(f"Saved → {CLEAN}/{filename}")

In [6]:
players = ["fplayers.csv", "mplayers.csv", 
           "fplayerslegacy.csv", "mplayerslegacy.csv"]

coaches = ["fcoaches.csv", "mcoaches.csv"]

teams = ["fteams.csv", "mteams.csv"]

for f in players:
    clean_players(f)

for f in coaches:
    clean_coaches(f)

for f in teams:
    clean_teams(f)

  df = pd.read_csv(f"{RAW}/{filename}")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["club_loaned_from"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["player_tags"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

Saved → null_removed/fplayers.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["club_loaned_from"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["player_tags"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

Saved → null_removed/mplayers.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["club_loaned_from"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["player_tags"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

Saved → null_removed/fplayerslegacy.csv


  df = pd.read_csv(f"{RAW}/{filename}")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["club_loaned_from"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["player_tags"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

Saved → null_removed/mplayerslegacy.csv
Saved → null_removed/fcoaches.csv
Saved → null_removed/mcoaches.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["dob"].fillna(pd.to_datetime("1900-01-01"), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

Saved → null_removed/fteams.csv


  df = pd.read_csv(f"{RAW}/{filename}")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["captain"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df["captain"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

Saved → null_removed/mteams.csv


In [7]:
rows = []

for file in os.listdir(CLEAN):
    if file.endswith(".csv"):
        df = pd.read_csv(f"{CLEAN}/{file}")
        for col in df.columns:
            rows.append([file, col, df[col].isna().sum()])

null_report = pd.DataFrame(rows, columns=["dataset","column","null_count"])
null_report.to_csv("cleaned_null_report.csv", index=False)

print("Generated → cleaned_null_report.csv")

  df = pd.read_csv(f"{CLEAN}/{file}")
  df = pd.read_csv(f"{CLEAN}/{file}")
  df = pd.read_csv(f"{CLEAN}/{file}")


Generated → cleaned_null_report.csv


In [None]:
os.listdir("raw_data")


['fcoaches.csv',
 'fplayers.csv',
 'fplayerslegacy.csv',
 'fteams.csv',
 'mcoaches.csv',
 'mplayers.csv',
 'mplayerslegacy.csv',
 'mteams.csv']

In [9]:
os.listdir("null_removed")

['fcoaches.csv',
 'fplayers.csv',
 'fplayerslegacy.csv',
 'fteams.csv',
 'mcoaches.csv',
 'mplayers.csv',
 'mplayerslegacy.csv',
 'mteams.csv']