In [None]:
import pandas as pd
import numpy as np

# Build a deliberately "dirty" copy of mydata.csv to practise cleaning on.
SEED = 42
# One seeded generator is shared by every sampling call below: a rerun
# reproduces the same corrupted file, while successive calls still draw
# different rows because the generator's state advances.
rng = np.random.RandomState(SEED)

df = pd.read_csv("mydata.csv")

# Randomly set 2% of numeric values to NaN
for col in df.select_dtypes(include="number").columns:
    df.loc[df.sample(frac=0.02, random_state=rng).index, col] = np.nan

# Randomly set 2% of categorical values to NaN
for col in df.select_dtypes(include="object").columns:
    df.loc[df.sample(frac=0.02, random_state=rng).index, col] = np.nan


# Duplicate 50 random rows (capped so inputs with fewer rows do not raise)
duplicates = df.sample(min(50, len(df)), random_state=SEED)
df_unclean = pd.concat([df, duplicates], ignore_index=True)


# Store attendance as text and append ",000" to the digits on 10% of the rows
if "attendance" in df_unclean.columns:
    attendance_raw = df_unclean["attendance"]
    attendance_text = (
        attendance_raw.astype(str)
        # A float column renders whole numbers as "45000.0"; dropping the
        # ".0" leaves a single digit run, so ",000" is appended only once.
        .str.replace(r"\.0$", "", regex=True)
        # astype(str) turned NaN into the literal "nan"; restore real NaN.
        .where(attendance_raw.notna())
    )
    df_unclean["attendance"] = attendance_text
    df_unclean.loc[df_unclean.sample(frac=0.1, random_state=rng).index, "attendance"] = (
        attendance_text.str.replace(r"(\d+)", r"\1,000", regex=True)
    )

# Mess up categorical text (lower-case + trailing space) on 5% of the rows
for col in df_unclean.select_dtypes(include="object").columns:
    original_values = df_unclean[col]
    messy_values = original_values.astype(str).str.lower().str.strip() + " "
    # Missing cells must stay missing: without the where() they would be
    # written as the text "nan ", which read_csv loads as an ordinary string.
    df_unclean.loc[df_unclean.sample(frac=0.05, random_state=rng).index, col] = (
        messy_values.where(original_values.notna())
    )


# Set some attendance values to 0 or 200,000
n_outliers = min(5, len(df_unclean))
if "attendance" in df_unclean.columns:
    df_unclean.loc[df_unclean.sample(n_outliers, random_state=rng).index, "attendance"] = 0
    df_unclean.loc[df_unclean.sample(n_outliers, random_state=rng).index, "attendance"] = 200000

# Set some goals to unrealistic numbers
if "Goals Home" in df_unclean.columns:
    df_unclean.loc[df_unclean.sample(n_outliers, random_state=rng).index, "Goals Home"] = 15


# File name kept exactly as is: the cleaning cell loads it under this name.
df_unclean.to_csv("mydata_beforeclearning.csv", index=False)




In [None]:
import pandas as pd
import numpy as np
import re

# ================================
# Load unclean dataset
# ================================
# Load the file produced by the corruption step and report its dimensions.
df = pd.read_csv(filepath_or_buffer="mydata_beforeclearning.csv")
print("🔹 Original shape:", df.shape)

# A) Promote numeric-looking object columns to numeric safely
def looks_numeric_series(s: pd.Series, sample=200, thresh=0.8) -> bool:
    """
    Heuristically decide whether a column of text actually holds numbers.

    A value counts as numeric-looking when it consists only of whitespace,
    signs, dots, commas and digits AND contains at least one digit, so
    placeholder strings such as "-", "..." or " " are not mistaken for numbers.

    Parameters
    ----------
    s : pd.Series
        Column to inspect; missing entries are ignored.
    sample : int
        Maximum number of non-null values to inspect (drawn with a fixed seed,
        so the verdict is repeatable).
    thresh : float
        Minimum fraction of inspected values that must look numeric.

    Returns
    -------
    bool
        True when at least `thresh` of the inspected values look numeric;
        False otherwise, including when there are no non-null values.
    """
    values = s.dropna().astype(str)
    if values.empty:
        return False
    values = values.sample(min(sample, len(values)), random_state=0)
    # (?=\D*\d) demands a digit somewhere; the character class alone would
    # also accept punctuation-only or whitespace-only strings.
    numeric_like = values.str.match(r'^(?=\D*\d)[\s\-\+\.,\d]+$')
    return bool(numeric_like.mean() >= thresh)

# Text columns as loaded; each is tested below for numeric-looking content.
obj_cols_initial = df.select_dtypes(include="object").columns.tolist()

# Attendance is always treated as numeric: remove thousands separators and
# surrounding blanks, then parse (unparseable entries become NaN).
if "attendance" in df.columns:
    s = df["attendance"].astype(str).str.replace(",", "", regex=False).str.strip()
    df["attendance"] = pd.to_numeric(s, errors="coerce")

for col in obj_cols_initial:
    if col == "attendance":
        continue
    # Give the heuristic the raw column so that its dropna() really excludes
    # missing entries; casting to str first would turn NaN into the text "nan"
    # and count it as a non-numeric value.
    if not looks_numeric_series(df[col]):
        continue
    missing_before = int(df[col].isna().sum())
    s = df[col].astype(str).str.replace(",", "", regex=False).str.strip()
    converted = pd.to_numeric(s, errors="coerce")
    # errors="coerce" discards unparseable text silently, so report the loss.
    n_lost = int(converted.isna().sum()) - missing_before
    if n_lost > 0:
        print(f"Column '{col}': {n_lost} non-numeric values became NaN during numeric conversion.")
    df[col] = converted

# B) Missing values (identify + handle)
# Blank strings and the text "nan" (any case, any padding) are strings, so
# isna() does not flag them. Convert them to real NaN first so that they are
# counted and imputed together with the genuinely missing cells.
for col in df.select_dtypes(include="object").columns:
    normalized_text = df[col].astype(str).str.strip().str.lower()
    df[col] = df[col].mask(normalized_text.isin(["", "nan"]))

na_before = df.isna().sum()
print("\n🔹 Missing values BEFORE:")
print(na_before[na_before > 0].sort_values(ascending=False))

numeric_cols = df.select_dtypes(include="number").columns.tolist()
cat_cols     = df.select_dtypes(include="object").columns.tolist()

# Fill numeric with median (log each)
for col in numeric_cols:
    n_missing = df[col].isna().sum()
    if n_missing > 0:
        med = df[col].median()
        if pd.isna(med):
            # A column with no observed value has no median; filling with NaN
            # would do nothing, so say so instead of logging a fill.
            print(f"Column '{col}' has no observed values; {n_missing} missing values left unfilled.")
            continue
        df[col] = df[col].fillna(med)
        print(f"Filled {n_missing} missing values in numeric column '{col}' with median ({med}).")

# Fill categorical with "Unknown" (log each)
for col in cat_cols:
    n_missing = df[col].isna().sum()
    if n_missing > 0:
        df[col] = df[col].fillna("Unknown")
        print(f"Filled {n_missing} missing values in categorical column '{col}' with 'Unknown'.")

print("🔹 Missing values AFTER (total):", int(df.isna().sum().sum()))

# C) Drop rows that repeat an earlier row in every column
is_exact_repeat = df.duplicated()
dups_before = int(is_exact_repeat.sum())
print("\n🔹 Exact duplicate rows BEFORE:", dups_before)
# Keeping the rows not flagged as repeats retains the first occurrence of each.
df = df.loc[~is_exact_repeat].copy()
dups_after = int(df.duplicated().sum())
print("🔹 Exact duplicate rows REMOVED:", dups_before - dups_after)

# D) Tidy categorical text: trim, collapse runs of whitespace, title-case
case_preserved = {"date", "clock", "links"}  # these columns keep their original casing
for col in cat_cols:
    if col not in df.columns:
        continue
    tidy = df[col].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
    df[col] = tidy if col in case_preserved else tidy.str.title()

print("\n🔹 Formatting fixes applied: attendance normalized; categorical text standardized")

# E) Outliers (identify + correct)
# Replacement medians are taken from the values that are NOT flagged, so the
# outliers being removed cannot pull the substitute value towards themselves.
if "attendance" in df.columns:
    mask_att_zero = df["attendance"] == 0
    mask_att_high = df["attendance"] > 100_000
    mask_att_out = mask_att_zero | mask_att_high
    med_att = df.loc[~mask_att_out, "attendance"].median()
    if pd.isna(med_att):
        # Every value was flagged; fall back to the overall median.
        med_att = df["attendance"].median()
    n_out_att = int(mask_att_out.sum())
    print(f"\n🔹 Attendance outliers found: {n_out_att}")
    if n_out_att > 0:
        df.loc[mask_att_out, "attendance"] = med_att
        print(f"Replaced attendance outliers with median ({med_att}).")

# Goal columns: more than 10 goals is treated as a data error.
for gcol in ["Goals Home", "Away Goals", "home_goals", "away_goals"]:
    if gcol in df.columns and pd.api.types.is_numeric_dtype(df[gcol]):
        mask_g = df[gcol] > 10
        med_g = df.loc[~mask_g, gcol].median()
        if pd.isna(med_g):
            med_g = df[gcol].median()
        n_out_g = int(mask_g.sum())
        if n_out_g > 0:
            df.loc[mask_g, gcol] = med_g
            print(f"Replaced {n_out_g} unrealistic values in '{gcol}' (>10) with median ({med_g}).")

# ================================
# F) Repair team columns (if they were accidentally numeric)
# ================================
for col in ("Home Team", "Away Team"):
    if col not in df.columns:
        continue
    team_names = df[col].astype(str).str.strip()
    # A team name made only of digits (optionally ending in ".0") is treated as
    # a stray number rather than a name and is replaced by "Unknown" below.
    mask_numeric_like = team_names.str.fullmatch(r"\d+(\.0)?")
    n_fixed = int(mask_numeric_like.sum())
    if n_fixed > 0:
        print(f"\n🔹 Repairing {n_fixed} numeric-like values in '{col}' -> set to NaN, then 'Unknown'")
    df[col] = (
        team_names.mask(mask_numeric_like, np.nan)
        .str.replace(r"\s+", " ", regex=True)
        .str.title()
        .fillna("Unknown")
    )

# Refresh the list of text columns, since the dtypes may have changed above
cat_cols = df.select_dtypes(include="object").columns.tolist()

# ================================
# G) Deduplicate using semantic match key (date + teams + goals)
# ================================
# The key is built in a standalone Series. Neither the key nor stand-ins for
# absent columns are added to df, so no blank helper columns can end up in the
# saved dataset.
key_columns = ["date", "Home Team", "Away Team", "Goals Home", "Away Goals"]
present_key_columns = [need for need in key_columns if need in df.columns]

sem_dups_before = 0
sem_dups_after = 0

if not present_key_columns:
    # With no key column every row would share the same key and all rows but
    # the first would be dropped, so there is nothing sensible to deduplicate.
    print("\n🔹 Semantic de-duplication skipped: none of the key columns are present.")
else:
    # An absent column contributes an empty field, keeping the key layout fixed.
    key_parts = [
        df[need].astype(str).str.strip() if need in df.columns
        else pd.Series("", index=df.index, dtype=object)
        for need in key_columns
    ]
    match_key = key_parts[0]
    for part in key_parts[1:]:
        match_key = match_key + " | " + part

    # True for every row whose key already appeared on an earlier row.
    is_semantic_dup = match_key.duplicated(keep="first")
    sem_dups_before = int(is_semantic_dup.sum())
    print("\n🔹 Duplicates by semantic key BEFORE:", sem_dups_before)

    df = df.loc[~is_semantic_dup].copy()

    sem_dups_after = int(match_key.loc[~is_semantic_dup].duplicated().sum())
    print("🔹 Duplicates by semantic key AFTER:", sem_dups_after)

# ================================
# H) Optional: tighten attendance bounds further (report & fix)
# ================================
if "attendance" in df.columns:
    mask_tight = (df["attendance"] < 2_000) | (df["attendance"] > 95_000)
    # Median of the in-range values only, so the values about to be replaced
    # do not influence their own replacement.
    med_att = df.loc[~mask_tight, "attendance"].median()
    if pd.isna(med_att):
        # Nothing is in range; fall back to the overall median.
        med_att = df["attendance"].median()
    n_tight = int(mask_tight.sum())
    if n_tight > 0:
        print(f"\n🔹 Adjusting {n_tight} attendance values outside [2,000, 95,000] -> median ({med_att}).")
        df.loc[mask_tight, "attendance"] = med_att

# ================================
# I) Final checks + save
# ================================
# Gather the summary figures first, then report them and write the result.
remaining_nans = int(df.isna().sum().sum())
remaining_exact_dups = int(df.duplicated().sum())

print("\n✅ Final dataset shape:", df.shape)
print("✅ Remaining NaNs (total):", remaining_nans)
print("✅ Remaining exact duplicates:", remaining_exact_dups)

df.to_csv("mydata_cleaned_v2.csv", index=False)
print("\n📁 Saved: 'mydata_cleaned_v2.csv'")


🔹 Original shape: (1190, 40)

🔹 Missing values BEFORE:
home_tackles        27
home_offside        26
attendance          25
home_shots          25
home_chances        25
away_red            25
away_duels          25
Goals Home          24
home_yellow         24
home_fouls          24
home_saves          24
home_duels          24
home_blocked        24
home_corners        24
away_off            24
home_off            24
away_tackles        24
away_yellow         24
away_fouls          24
Home Team           23
clock               23
date                23
stadium             23
class               23
Away Team           23
away_pass           23
away_chances        23
home_pass           23
away_possessions    23
away_shots          23
home_on             23
away_on             23
Away Goals          23
home_possessions    23
away_blocked        23
away_offside        23
away_corners        23
away_saves          23
home_red            23
links               22
dtype: int64
Filled 25 mi