In [20]:
import pandas as pd

def clean_hotel_bookings(
    path_in: str = "hotel_bookings.csv",
    path_out: str = "hotel_bookings_cleaned.csv",) -> pd.DataFrame:
    """
    Load csv -> find missing/dupes -> drop duplicates -> fill missing data in selected columns ->
    convert negative values to zero for numeric columns -> save cleaned CSV.

    Returns the cleaned DataFrame.
    """
    ### 1) Load dataset ###
    df = pd.read_csv(path_in)
    print(f"Loaded: {path_in}")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")

    ### 2) Find missing value ###
    na_count = df.isna().sum().sort_values(ascending=False)
    na_pct = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
    missing_summary = pd.DataFrame({"missing_count": na_count, "missing_pct": na_pct})
    print("Missing values by column (pre-cleaning):")
    print(missing_summary[missing_summary["missing_count"] > 0])
    print(f"\nTotal missing cells: {int(df.isna().sum().sum()):,}")
    print(f"Rows with ≥1 missing value: {int(df.isna().any(axis=1).sum()):,}\n")

    ### 3) Find and remove duplicate rows ###
    dup_count = int(df.duplicated(keep=False).sum())
    print(f"Exact duplicate rows (pre-cleaning): {dup_count:,}")
    before = len(df)
    df = df.drop_duplicates(keep="first").reset_index(drop=True)

    print(f"Removed {before - len(df)} duplicates. New shape: {df.shape}\n")

    ### 4) Data cleaning / imputation ###
    # Fill 0 for agent, children, company 
    zero_fill_cols = [ "children", "company", "agent"]
    for col in zero_fill_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
            # keep as integer if possible
            try:
                df[col] = df[col].astype("Int64")
            except Exception:
                pass

    # Fill mode for country
    if "country" in df.columns:
        mode_country = df["country"].mode(dropna=True)
        mode_country_value = mode_country.iloc[0] if not mode_country.empty else "Unknown"
        df["country"] = df["country"].fillna(mode_country_value).astype(str)

    # Convert negative values to zero (numeric columns only)
    num_cols = df.select_dtypes(include=["number"]).columns
    # Count negatives per numeric column
    neg_col_counts = (df[num_cols] < 0).sum()
    cols_with_negs = neg_col_counts[neg_col_counts > 0]
    # Replace negatives with 0 in ALL numeric columns with negatives
    df[num_cols] = df[num_cols].clip(lower=0)

    if not cols_with_negs.empty:
        print("Negative values found and set to 0 in numeric columns:")
        # Summarize columns with negative values
        print(cols_with_negs.rename("negatives_corrected").to_frame())
    else:
        print("No negative numeric values found.\n")

    ### 5) Save cleaned dataset ###
    df.to_csv(path_out, index=False)
    print(f"\nSaved cleaned dataset to: {path_out}")

    return df

# Example usage:
cleaned_df = clean_hotel_bookings(
    path_in="hotel_bookings.csv",
    path_out="hotel_bookings_cleaned.csv",)


Loaded: hotel_bookings.csv
Shape: 119,390 rows × 32 columns

Missing values by column (pre-cleaning):
          missing_count  missing_pct
agent             16340        13.69
children              4         0.00
company          112593        94.31
country             488         0.41

Total missing cells: 129,425
Rows with ≥1 missing value: 119,173

Exact duplicate rows (pre-cleaning): 40,165
Removed 31994 duplicates. New shape: (87396, 32)

Negative values found and set to 0 in numeric columns:
     negatives_corrected
adr                    1

Saved cleaned dataset to: hotel_bookings_cleaned.csv


In [21]:
print(cleaned_df.head())

          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0        342               2015               July   
1  Resort Hotel            0        737               2015               July   
2  Resort Hotel            0          7               2015               July   
3  Resort Hotel            0         13               2015               July   
4  Resort Hotel            0         14               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                        27                          1   
1                        27                          1   
2                        27                          1   
3                        27                          1   
4                        27                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
0                        0                     0       2  ...    No Deposit   
1     