In [7]:
import pandas as pd
import numpy as np
from pathlib import Path

def data_cleaning(df):
    """Report missing values, impute them, drop duplicates, categorize text.

    Steps, in order:
      1. Print the count of missing values per column and per row.
      2. Fill missing values in numeric columns with the column mean.
      3. Fill missing values in object/category columns with the column's
         most frequent value (the first one returned by ``mode()``).
         A column with no non-missing value has no mode and is left as is.
      4. Drop fully duplicated rows (the surviving rows keep their
         original index labels).
      5. Convert the remaining object-dtype columns to ``category``.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy so the column assignments below do not leak into the
    # caller's DataFrame.
    df = df.copy()

    # Show missing values
    print("Missing values in each column:")
    print(df.isna().sum())
    print("Missing values in each row:")
    print(df.isna().sum(axis=1))

    # Fill numeric columns with mean
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

    # Fill categorical columns with mode (most frequent value)
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    for c in cat_cols:
        if not df[c].isna().any():
            continue
        modes = df[c].mode()
        # mode() ignores missing values, so an all-missing column yields an
        # empty result; skip it instead of failing on .iloc[0].
        if modes.empty:
            continue
        df[c] = df[c].fillna(modes.iloc[0])

    # Remove duplicates
    df = df.drop_duplicates()

    # Convert object -> category in one astype call, which returns a new
    # frame rather than assigning into the de-duplicated one.
    obj_cols = df.select_dtypes(include=["object"]).columns
    if len(obj_cols) > 0:
        df = df.astype({col: "category" for col in obj_cols})

    return df

# Path fix: the raw-string prefix (r"...") keeps the backslashes in this
# Windows path literal instead of treating them as escape sequences.
# NOTE(review): this is an absolute, machine-specific path — the cell only
# runs where that file exists.
csv_path = r"C:\Users\ajroy\OneDrive\Desktop\house_price_predection\data\USA Housing Dataset.csv"
# Load the CSV into a DataFrame.
df = pd.read_csv(csv_path)
# Rebind df to the frame returned by data_cleaning, which also prints the
# per-column and per-row missing-value counts.
df = data_cleaning(df)


Missing values in each column:
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64
Missing values in each row:
0       0
1       0
2       0
3       0
4       0
       ..
4135    0
4136    0
4137    0
4138    0
4139    0
Length: 4140, dtype: int64
