In [None]:
import pandas as pd

 ### 1) Load dataset ###
def Load_csv(path_in: str = "hotel_bookings.csv") -> pd.DataFrame:
    """
    Read a CSV file into a pandas DataFrame and report its size.

    Parameters:
        path_in: Path of the CSV file passed to ``pd.read_csv``.

    Returns:
        The DataFrame parsed from the file.
    """
    frame = pd.read_csv(path_in)
    n_rows, n_cols = frame.shape
    print(f"Loaded: {path_in}")
    print(f"Shape: {n_rows:,} rows × {n_cols} columns\n")
    return frame

### 2) Find and summarize missing values in each column ###
def Find_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize missing values per column, print the report and return it.

    Parameters:
        df: DataFrame to inspect. It is not modified.

    Returns:
        A DataFrame indexed by column name with two columns:
        ``missing_count`` (number of NaN cells) and ``missing_pct``
        (percentage of NaN cells, rounded to 2 decimals), ordered by
        ``missing_count`` descending. All columns of ``df`` are included,
        even those without missing values; only the printed report is
        restricted to columns that have at least one missing value.
    """
    # Compute the NaN mask once; every statistic below is derived from it.
    na_mask = df.isna()

    # Build the frame first and sort it afterwards. Sorting the two Series
    # separately can give them different orders (rounding creates ties in the
    # percentages), in which case the DataFrame constructor re-aligns the
    # index and the descending order is lost.
    missing_summary = pd.DataFrame(
        {
            "missing_count": na_mask.sum(),
            "missing_pct": (na_mask.mean() * 100).round(2),
        }
    ).sort_values("missing_count", ascending=False, kind="stable")

    print("Missing values by column (pre-cleaning):")
    print(missing_summary[missing_summary["missing_count"] > 0])
    print(f"\nTotal missing cells: {int(na_mask.sum().sum()):,}")
    print(f"Rows with ≥1 missing value: {int(na_mask.any(axis=1).sum()):,}\n")

    # The signature promises a DataFrame; previously nothing was returned.
    return missing_summary

### 3) Find and remove duplicate rows ###
def Remove_dup_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Report exact duplicate rows and return a de-duplicated DataFrame.

    The reported pre-cleaning figure counts every row that belongs to a
    duplicate group (``keep=False``), whereas the "Removed" figure counts
    only the extra copies that are dropped (the first occurrence is kept).

    Parameters:
        df: DataFrame to de-duplicate. The caller's object is not modified.

    Returns:
        A new DataFrame without exact duplicate rows and with a fresh
        0..n-1 index.
    """
    rows_before = len(df)
    n_in_dup_groups = int(df.duplicated(keep=False).sum())
    print(f"Exact duplicate rows (pre-cleaning): {n_in_dup_groups:,}")

    deduped = df.drop_duplicates(keep="first")
    deduped = deduped.reset_index(drop=True)

    print(f"Removed {rows_before - len(deduped)} duplicates. New shape: {deduped.shape}\n")
    return deduped

### 4) Data cleaning / imputation ###
def Fill_zero_col(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing data with 0 in the columns "children", "company", "agent".

    Each of these columns that exists in ``df`` is converted to numeric
    (values that cannot be parsed become NaN), NaN is replaced by 0, and the
    result is stored as nullable ``Int64`` when every value is a whole
    number. Columns holding fractional values are left as floats.

    Parameters:
        df: DataFrame to clean. The columns are assigned on this object, so
            the caller's DataFrame is modified in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    zero_fill_cols = ["children", "company", "agent"]
    for col in zero_fill_cols:
        if col not in df.columns:
            continue

        filled = pd.to_numeric(df[col], errors="coerce").fillna(0)
        try:
            # Keep as integer if possible.
            filled = filled.astype("Int64")
        except (TypeError, ValueError):
            # Values that are not whole numbers cannot be cast safely to
            # Int64; keep the float column. Only casting errors are caught
            # here so that unrelated failures are no longer silently hidden.
            pass
        df[col] = filled
    return df

def Fill_mode_col(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing data in the "country" column with its mode.

    If the column has no non-missing value at all, "Unknown" is used as the
    fill value. After filling, the column is cast to ``str``. A DataFrame
    without a "country" column is returned unchanged.

    Parameters:
        df: DataFrame to clean. The column is assigned on this object, so the
            caller's DataFrame is modified in place.

    Returns:
        The same DataFrame object.
    """
    if "country" not in df.columns:
        return df

    modes = df["country"].mode(dropna=True)
    if modes.empty:
        fill_value = "Unknown"
    else:
        # mode() may return several values; take the first one it reports.
        fill_value = modes.iloc[0]

    filled = df["country"].fillna(fill_value)
    df["country"] = filled.astype(str)
    return df

def Convert_neg_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace negative values with zero in every numeric column.

    Prints a per-column count of the negatives that were corrected, or a
    notice when none were found.

    Parameters:
        df: DataFrame to clean. The numeric columns are assigned on this
            object, so the caller's DataFrame is modified in place.

    Returns:
        The same DataFrame object.
    """
    numeric_cols = df.select_dtypes(include=["number"]).columns
    numeric_part = df[numeric_cols]

    # Tally negatives before clipping so the report reflects the input data.
    negatives_per_col = (numeric_part < 0).sum()
    affected = negatives_per_col[negatives_per_col > 0]

    # Clipping is applied to all numeric columns; it is a no-op where there
    # are no negatives.
    df[numeric_cols] = numeric_part.clip(lower=0)

    if affected.empty:
        print("No negative numeric values found.\n")
        return df

    print("Negative values found and set to 0 in numeric columns:")
    print(affected.rename("negatives_corrected").to_frame())
    return df

### 5) Save cleaned dataset ###
def Save_csv(df: pd.DataFrame, path_out: str = "hotel_bookings_cleaned.csv") -> pd.DataFrame:
    """
    Write the DataFrame to a CSV file without the index column.

    This function only saves; it performs no loading or cleaning itself.

    Parameters:
        df: DataFrame to write.
        path_out: Destination path of the CSV file.

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    df.to_csv(path_out, index=False)
    confirmation = f"\nSaved cleaned dataset to: {path_out}"
    print(confirmation)
    return df

# cleaned_df = clean_hotel_bookings(
#     path_in="hotel_bookings.csv",
#     path_out="hotel_bookings_cleaned.csv")


# Load the raw dataset. The loader is defined as `Load_csv`; the previous
# spelling `Load_CSV` is not defined and raises NameError on a fresh kernel.
df = Load_csv(path_in="hotel_bookings.csv")


Loaded: hotel_bookings.csv
Shape: 119,390 rows × 32 columns



In [None]:
    """
    Load csv -> find missing/dupes -> drop duplicates -> fill missing data in selected columns ->
    convert negative values to zero for numeric columns -> save cleaned CSV.

    Returns the cleaned DataFrame.
    """