In [7]:
import pandas as pd


# 1) Load the raw bookings data
path_in  = "hotel_bookings.csv"                 # <- change if needed
path_out = "hotel_bookings_cleaned.csv"
df = pd.read_csv(path_in)

print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")

# 2) Missing values by column (count & %)
# Compute the null mask once and reuse it for every statistic below.
na_mask = df.isna()

# Build the table first and sort it afterwards. Sorting the count and the
# percentage Series separately can leave them with different index orders
# (rounding creates ties in the percentages that do not exist in the counts);
# the DataFrame constructor then re-aligns them on a sorted union of labels
# and the intended "most missing first" order is lost.
missing_summary = pd.DataFrame(
    {
        "missing_count": na_mask.sum(),
        "missing_pct": (na_mask.mean() * 100).round(2),
    }
).sort_values("missing_count", ascending=False, kind="mergesort")  # stable: ties keep column order

# Keep the per-column Series available under their original names, sorted
# the same way as the summary table.
na_count = missing_summary["missing_count"]
na_pct = missing_summary["missing_pct"]

print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])

# Quick totals
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

# 3) Exact duplicate rows (across all columns)
# keep=False flags every member of a duplicate group, including the first
# occurrence, so dup_count is larger than the number of rows that
# drop_duplicates() would actually remove.
dup_mask = df.duplicated(keep=False)
dup_count = int(dup_mask.sum())
unique_rows = df.drop_duplicates().shape[0]
print(f"Exact duplicate rows (counting all duplicates): {dup_count:,}")
print(f"Unique rows after dropping duplicates: {unique_rows:,}\n")

# Preview some duplicate rows (if any)
if dup_count > 0:
    dup_preview = df[dup_mask]
    print("Sample duplicate rows:")
    print(dup_preview.head(10))


Shape: 119,390 rows × 32 columns

Missing values by column:
          missing_count  missing_pct
agent             16340        13.69
children              4         0.00
company          112593        94.31
country             488         0.41

Total missing cells: 129,425
Rows with ≥1 missing value: 119,173

Exact duplicate rows (counting all duplicates): 40,165
Unique rows after dropping duplicates: 87,396

Sample duplicate rows:
            hotel  is_canceled  lead_time  arrival_date_year  \
4    Resort Hotel            0         14               2015   
5    Resort Hotel            0         14               2015   
21   Resort Hotel            0         72               2015   
22   Resort Hotel            0         72               2015   
39   Resort Hotel            0         70               2015   
43   Resort Hotel            0         70               2015   
132  Resort Hotel            1          5               2015   
138  Resort Hotel            1          5          

In [2]:
# Remove every row that is an exact copy (all columns compared) of an earlier
# row, keeping the first occurrence, and renumber the index from 0.
df2 = df.drop_duplicates(keep="first").reset_index(drop=True)

# Row counts before and after, used for the report line below.
before, after = len(df), len(df2)

print(f"Removed {before - after} duplicate rows. New shape: {df2.shape}")

Removed 31994 duplicate rows. New shape: (87396, 32)


In [5]:
# Missing values by column (count & %) in the de-duplicated frame
# Compute the null mask once and reuse it for every statistic below.
na_mask = df2.isna()

# Build the table first and sort it afterwards. Sorting the count and the
# percentage Series separately can leave them with different index orders
# (rounding creates ties in the percentages that do not exist in the counts);
# the DataFrame constructor then re-aligns them on a sorted union of labels
# and the intended "most missing first" order is lost.
missing_summary = pd.DataFrame(
    {
        "missing_count": na_mask.sum(),
        "missing_pct": (na_mask.mean() * 100).round(2),
    }
).sort_values("missing_count", ascending=False, kind="mergesort")  # stable: ties keep column order

# Keep the per-column Series available under their original names, sorted
# the same way as the summary table.
na_count = missing_summary["missing_count"]
na_pct = missing_summary["missing_pct"]

print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])

# Quick totals
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

Missing values by column:
Empty DataFrame
Columns: [missing_count, missing_pct]
Index: []

Total missing cells: 0
Rows with ≥1 missing value: 0



In [4]:
# 1) Fill missing agent, children and company values with 0
numeric_fill_cols = ["agent", "children", "company"]
for col in numeric_fill_cols:
    if col not in df2.columns:
        continue

    # Coerce to numeric first. Anything that cannot be parsed becomes NaN and
    # is then filled with 0 exactly like a genuinely missing value, so report
    # how many non-null values were lost this way instead of hiding it.
    numeric = pd.to_numeric(df2[col], errors="coerce")
    n_coerced = int((numeric.isna() & df2[col].notna()).sum())
    if n_coerced:
        print(f"Warning: {n_coerced:,} non-numeric value(s) in '{col}' were replaced with 0")
    df2[col] = numeric.fillna(0)

    # Keep as nullable integer when every value is whole; the cast raises
    # when the column holds fractional values, in which case the column is
    # left as numeric.
    try:
        df2[col] = df2[col].astype("Int64")
    except (TypeError, ValueError):
        print(f"Note: '{col}' contains non-integer values; left as {df2[col].dtype}")

# 2) Fill missing country with the most frequent country
if "country" in df2.columns:
    mode_vals = df2["country"].mode(dropna=True)
    # mode() is empty only when the column has no non-null values at all.
    mode_value = mode_vals.iloc[0] if not mode_vals.empty else "Unknown"
    df2["country"] = df2["country"].fillna(mode_value).astype(str)

In [8]:
# Save the cleaned frame to path_out, without writing the index as a column
df2.to_csv(path_out, index=False)
# Confirmation message; the cell's recorded output shows this line was
# printed, so it is restored rather than left commented out.
print(f"Filled missing values and saved to: {path_out}")

Filled missing values and saved to: hotel_bookings_cleaned.csv
