In [17]:
# main(df)

In [18]:
# def main(df):
    
  #  clean_data(df) (inside should be missing_values(), duplicated(), outliers())
  # feature 1 (inside should be all helper features involved, for example 2 and 3)
# etc
# return df

In [26]:
import pandas as pd


# 1) Load your dataset
path_in  = "hotel_bookings.csv"                 # <- change if needed
path_out = "hotel_bookings_cleaned.csv"
df = pd.read_csv(path_in)

print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")

# 2) Missing values by column (count & %)
na_mask = df.isna()  # computed once and reused for every statistic below
na_count = na_mask.sum().sort_values(ascending=False)
na_pct = (na_mask.mean() * 100).round(2).sort_values(ascending=False)
# When the two Series end up ordered differently (e.g. ties after rounding),
# the DataFrame constructor re-aligns them on a sorted union index and the
# ordering above is lost. Sort the assembled table so the columns with the
# most missing values really are listed first.
missing_summary = pd.DataFrame(
    {"missing_count": na_count, "missing_pct": na_pct}
).sort_values("missing_count", ascending=False)
print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])

# Quick totals
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

# 3) Exact duplicate rows (across all columns)
# keep=False marks every member of a duplicate group, not just the repeats,
# so dup_count is larger than the number of rows drop_duplicates() removes.
dup_mask = df.duplicated(keep=False)
dup_count = int(dup_mask.sum())
unique_rows = df.drop_duplicates().shape[0]
print(f"Exact duplicate rows (counting all duplicates): {dup_count:,}")
print(f"Unique rows after dropping duplicates: {unique_rows:,}\n")

# Preview some duplicate rows (if any)
if dup_count > 0:
    dup_preview = df[dup_mask]
    print("Sample duplicate rows:")
    print(dup_preview.head(10))


Shape: 119,390 rows × 32 columns

Missing values by column:
          missing_count  missing_pct
agent             16340        13.69
children              4         0.00
company          112593        94.31
country             488         0.41

Total missing cells: 129,425
Rows with ≥1 missing value: 119,173

Exact duplicate rows (counting all duplicates): 40,165
Unique rows after dropping duplicates: 87,396

Sample duplicate rows:
            hotel  is_canceled  lead_time  arrival_date_year  \
4    Resort Hotel            0         14               2015   
5    Resort Hotel            0         14               2015   
21   Resort Hotel            0         72               2015   
22   Resort Hotel            0         72               2015   
39   Resort Hotel            0         70               2015   
43   Resort Hotel            0         70               2015   
132  Resort Hotel            1          5               2015   
138  Resort Hotel            1          5          

In [27]:
# Remove rows that are exact copies of an earlier row (all columns equal),
# keeping the first occurrence and renumbering the index from 0.
df2 = df.drop_duplicates(keep="first").reset_index(drop=True)

# Row counts before and after, reported for a quick sanity check.
before, after = len(df), len(df2)
print(f"Removed {before - after} duplicate rows. New shape: {df2.shape}")


Removed 31994 duplicate rows. New shape: (87396, 32)


In [28]:
# 2) Missing values by column (count & %)

na_mask = df2.isna()  # computed once and reused for every statistic below
na_count = na_mask.sum().sort_values(ascending=False)
na_pct = (na_mask.mean() * 100).round(2).sort_values(ascending=False)
# When the two Series end up ordered differently (e.g. ties after rounding),
# the DataFrame constructor re-aligns them on a sorted union index and the
# ordering above is lost. Sort the assembled table so the columns with the
# most missing values really are listed first.
missing_summary = pd.DataFrame(
    {"missing_count": na_count, "missing_pct": na_pct}
).sort_values("missing_count", ascending=False)
print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

Missing values by column:
          missing_count  missing_pct
agent             12193        13.95
children              4         0.00
company           82137        93.98
country             452         0.52

Total missing cells: 94,786
Rows with ≥1 missing value: 87,207



In [29]:
# 1) Fill 0 for agent, children, company
numeric_fill_cols = ["agent", "children", "company"]
for col in numeric_fill_cols:
    if col not in df2.columns:
        continue
    # Coerce to numeric first: anything that cannot be parsed becomes NaN and
    # is then filled with 0 together with the genuinely missing values.
    filled = pd.to_numeric(df2[col], errors="coerce").fillna(0)
    try:
        # Prefer the nullable integer dtype when every value is a whole number.
        filled = filled.astype("Int64")
    except (TypeError, ValueError):
        # Non-integral values cannot be cast safely; keep the numeric column as is.
        pass
    df2[col] = filled

# 2) Fill mode for country
if "country" in df2.columns:
    mode_vals = df2["country"].mode(dropna=True)
    # Fall back to a placeholder label if the column has no non-missing value at all.
    mode_value = mode_vals.iloc[0] if not mode_vals.empty else "Unknown"
    df2["country"] = df2["country"].fillna(mode_value).astype(str)

In [30]:
# Preview the first rows of df2 after the missing-value fills above.
df2.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,0,0,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,0,0,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0,0,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304,0,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240,0,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [31]:
# Save
# df2.to_csv(path_out, index=False)
#print(f"Filled missing values and saved to: {path_out}")

## Helper features:

1) len_of_stay: weekday + weekend nights (outlier-capped)

In [32]:
def cal_len_of_stay(data):
    """Return the total number of nights for every booking.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``stays_in_week_nights`` and
        ``stays_in_weekend_nights``.

    Returns
    -------
    list
        One value per row, in row order: week nights + weekend nights.

    Notes
    -----
    The sum is computed column-wise, so it works for any index (filtered,
    shuffled, or non-integer) rather than only a default 0..n-1 index.
    """
    total_nights = data["stays_in_week_nights"] + data["stays_in_weekend_nights"]
    return total_nights.tolist()

In [45]:
def outliers_cap(data, percentile=0.95):
    """Cap values above a percentile threshold.

    The threshold is the element at position ``int(percentile * (n - 1))`` of
    the sorted values (no interpolation). Every value greater than the
    threshold is replaced by it; all other values are returned unchanged and
    in their original order.

    Parameters
    ----------
    data : iterable
        Comparable values. Any iterable is accepted, including one-shot
        iterators.
    percentile : float, default 0.95
        Fraction in [0, 1] selecting the cap position in the sorted values.

    Returns
    -------
    list
        Capped values, same length and order as the input. An empty input
        yields an empty list.
    """
    # Materialise once so the input can be both sorted and re-iterated.
    values = list(data)
    if not values:
        return []

    ordered = sorted(values)
    cap = ordered[int(percentile * (len(ordered) - 1))]
    return [cap if value > cap else value for value in values]

In [46]:
def len_of_stay(data):
    """Add a ``len_of_stay`` column holding the outlier-capped total nights.

    The total is taken from ``cal_len_of_stay`` and capped by ``outliers_cap``
    with its default percentile. The column is written into ``data`` itself,
    and that same frame is returned.
    """
    data["len_of_stay"] = outliers_cap(cal_len_of_stay(data))
    return data

In [47]:
# Test: add the capped `len_of_stay` column and display the resulting frame.
# Note: len_of_stay() writes the column into df2 itself and returns that same frame.
len_of_stay(df2)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,len_of_stay
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,0,0,0,Transient,0.00,0,0,Check-Out,2015-07-01,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,0,0,0,Transient,0.00,0,0,Check-Out,2015-07-01,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,0,0,Transient,75.00,0,0,Check-Out,2015-07-02,1
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,304,0,0,Transient,75.00,0,0,Check-Out,2015-07-02,1
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,240,0,0,Transient,98.00,0,1,Check-Out,2015-07-03,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87391,City Hotel,0,23,2017,August,35,30,2,5,2,...,394,0,0,Transient,96.14,0,0,Check-Out,2017-09-06,7
87392,City Hotel,0,102,2017,August,35,31,2,5,3,...,9,0,0,Transient,225.43,0,2,Check-Out,2017-09-07,7
87393,City Hotel,0,34,2017,August,35,31,2,5,2,...,9,0,0,Transient,157.71,0,4,Check-Out,2017-09-07,7
87394,City Hotel,0,109,2017,August,35,31,2,5,2,...,89,0,0,Transient,104.40,0,0,Check-Out,2017-09-07,7


2) revenue_booking: ADR × len_of_stay (set to 0 if cancelled)

In [49]:
def negative_values(data):
    """Return a list copy of ``data`` with every value below zero replaced by 0.

    Values that are not less than zero are passed through unchanged, in order.
    """
    return [0 if value < 0 else value for value in data]

In [59]:
def revenue_booking(data):
    """Add a ``revenue_booking`` column: ADR × length of stay, 0 if cancelled.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``adr``, ``len_of_stay`` and ``is_canceled``.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, modified in place:

        * ``adr`` has negative values replaced by 0;
        * ``revenue_booking`` is ``adr * len_of_stay`` for rows where
          ``is_canceled`` is not 1, and 0 otherwise.

    Notes
    -----
    All operations are column-wise, so the result does not depend on the
    frame having a default 0..n-1 index.
    """
    # Negative rates are treated as invalid and floored at zero.
    data["adr"] = data["adr"].clip(lower=0)

    gross = data["adr"] * data["len_of_stay"]
    # Keep the computed revenue for non-cancelled bookings; cancelled ones earn 0.
    data["revenue_booking"] = gross.where(data["is_canceled"] != 1, 0)
    return data

In [60]:
# Test: revenue_booking() modifies df2 in place and returns it, so revenue_dt
# is another name for df2 rather than a copy.
revenue_dt = revenue_booking(df2)
revenue_dt["revenue_booking"].describe()

count    87396.000000
mean       252.756124
std        300.035874
min          0.000000
25%          0.000000
50%        168.000000
75%        375.300000
max       3410.000000
Name: revenue_booking, dtype: float64

3) late_booking_flag: 1 if lead_time < 7 days

In [52]:
def late_booking_flag(data, threshold_days=7):
    """Add a ``late_booking_flag`` column marking short-notice bookings.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column ``lead_time``.
    threshold_days : int or float, default 7
        A booking is flagged when ``lead_time`` is strictly less than this.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with ``late_booking_flag`` set to
        1 where ``lead_time < threshold_days`` and 0 otherwise.
    """
    is_late = data["lead_time"] < threshold_days
    data["late_booking_flag"] = is_late.astype("int64")
    return data

In [58]:
# Test: late_booking_flag() writes the column into df2 and returns that same
# frame, so `data` refers to df2 itself; the flag should only take values 0 and 1.
data = late_booking_flag(df2)
data["late_booking_flag"].unique()

array([0, 1])