In [14]:
import pandas as pd


# 1) Load your dataset
path_in  = "hotel_bookings.csv"                 # <- change if needed
path_out = "hotel_bookings_cleaned.csv"
df = pd.read_csv(path_in)

print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")

# 2) Missing values by column (count & %)
# Compute the NA mask once and sort the finished table. Sorting the count and
# percentage Series separately leaves them with different label orders, and the
# DataFrame constructor then re-aligns them, discarding the intended
# "most missing first" ordering.
na_mask = df.isna()
missing_summary = pd.DataFrame(
    {
        "missing_count": na_mask.sum(),
        "missing_pct": (na_mask.mean() * 100).round(2),
    }
).sort_values("missing_count", ascending=False)
na_count = missing_summary["missing_count"]
na_pct = missing_summary["missing_pct"]
print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])

# Quick totals
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

# 3) Exact duplicate rows (across all columns)
# keep=False marks every member of a duplicate group, so dup_count includes the
# first occurrence of each group as well as its repeats.
dup_mask = df.duplicated(keep=False)
dup_count = int(dup_mask.sum())
unique_rows = df.drop_duplicates().shape[0]
print(f"Exact duplicate rows (counting all duplicates): {dup_count:,}")
print(f"Unique rows after dropping duplicates: {unique_rows:,}\n")

# Preview some duplicate rows (if any)
if dup_count > 0:
    dup_preview = df[dup_mask]
    print("Sample duplicate rows:")
    print(dup_preview.head(10))


Shape: 119,390 rows × 32 columns

Missing values by column:
          missing_count  missing_pct
agent             16340        13.69
children              4         0.00
company          112593        94.31
country             488         0.41

Total missing cells: 129,425
Rows with ≥1 missing value: 119,173

Exact duplicate rows (counting all duplicates): 40,165
Unique rows after dropping duplicates: 87,396

Sample duplicate rows:
            hotel  is_canceled  lead_time  arrival_date_year  \
4    Resort Hotel            0         14               2015   
5    Resort Hotel            0         14               2015   
21   Resort Hotel            0         72               2015   
22   Resort Hotel            0         72               2015   
39   Resort Hotel            0         70               2015   
43   Resort Hotel            0         70               2015   
132  Resort Hotel            1          5               2015   
138  Resort Hotel            1          5          

In [15]:
# Remove rows that are identical across every column, keeping the first
# occurrence of each, and renumber the index from 0.

before = df.shape[0]
is_repeat = df.duplicated(keep="first")
df2 = df.loc[~is_repeat].reset_index(drop=True)
after = df2.shape[0]
print(f"Removed {before - after} duplicate rows. New shape: {df2.shape}")


Removed 31994 duplicate rows. New shape: (87396, 32)


In [16]:
# 2) Missing values by column (count & %) after de-duplication

# Build the table first, then sort it once: sorting the two Series separately
# gives them different label orders, and the DataFrame constructor re-aligns
# them, which loses the "most missing first" ordering.
na_mask = df2.isna()
missing_summary = pd.DataFrame(
    {
        "missing_count": na_mask.sum(),
        "missing_pct": (na_mask.mean() * 100).round(2),
    }
).sort_values("missing_count", ascending=False)
na_count = missing_summary["missing_count"]
na_pct = missing_summary["missing_pct"]
print("Missing values by column:")
print(missing_summary[missing_summary["missing_count"] > 0])
total_missing_cells = int(na_count.sum())
rows_with_any_na = int(na_mask.any(axis=1).sum())
print(f"\nTotal missing cells: {total_missing_cells:,}")
print(f"Rows with ≥1 missing value: {rows_with_any_na:,}\n")

Missing values by column:
          missing_count  missing_pct
agent             12193        13.95
children              4         0.00
company           82137        93.98
country             452         0.52

Total missing cells: 94,786
Rows with ≥1 missing value: 87,207



In [17]:
# 1) Fill missing agent, children and company values with 0
numeric_fill_cols = ["agent", "children", "company"]
for col in numeric_fill_cols:
    if col in df2.columns:
        # coerce to numeric first (company may be string IDs); anything that
        # cannot be parsed becomes NaN and is therefore filled with 0 as well
        df2[col] = pd.to_numeric(df2[col], errors="coerce").fillna(0)
        # keep as integer if possible
        try:
            df2[col] = df2[col].astype("Int64")
        except (TypeError, ValueError):
            # values that cannot be cast safely (e.g. a fractional part) stay
            # in the numeric dtype produced above; other errors now surface
            pass

# 2) Fill mode for country
if "country" in df2.columns:
    mode_vals = df2["country"].mode(dropna=True)
    # fall back to a placeholder when the column has no non-null value at all
    mode_value = mode_vals.iloc[0] if not mode_vals.empty else "Unknown"
    df2["country"] = df2["country"].fillna(mode_value).astype(str)

In [18]:
# Continue with the cleaned frame under the name `df`.
# NOTE: this is an alias, not a copy. Later cells add columns through `df2`
# (len_of_stay, revenue_booking) and read them back through `df`.
df = df2
# Preview last so the notebook actually displays it; a bare expression that is
# not the final line of a cell is evaluated and then discarded.
df.head()

In [19]:
# feature 6
# Surrogate customer key: each distinct combination of country, market segment,
# distribution channel and reserved room type is given one integer id.
customer_key = (
    df['country'].astype(str) + '_' +
    df['market_segment'].astype(str) + '_' +
    df['distribution_channel'].astype(str) + '_' +
    df['reserved_room_type'].astype(str)
)
# factorize() numbers the distinct keys from 0 in order of first appearance;
# shift by one so ids start at 1.
customer_codes = customer_key.factorize()[0]
df['customer_id'] = customer_codes + 1

In [20]:
# Loyalty check

# 1 when the row's customer_id occurs on more than one row of the dataset
df['is_repeated_guest'] = df.duplicated(subset='customer_id', keep=False).astype(int)

# Per-customer maximum of is_canceled, broadcast back onto every row of that customer
guest_cancel = df['is_canceled'].groupby(df['customer_id']).max().to_dict()
df['previous_cancellations'] = df['customer_id'].map(guest_cancel)

# loyal_check is 1 when either flag equals 1 (repeated guest OR a cancellation
# recorded for the customer), otherwise 0
repeated_flag = df['is_repeated_guest'].eq(1)
cancelled_flag = df['previous_cancellations'].eq(1)
df['loyal_check'] = (repeated_flag | cancelled_flag).astype(int)

print(df[['customer_id', 'is_repeated_guest','previous_cancellations','loyal_check']].head())


   customer_id  is_repeated_guest  previous_cancellations  loyal_check
0            1                  1                       1            1
1            1                  1                       1            1
2            2                  1                       1            1
3            3                  1                       0            1
4            4                  1                       1            1


In [22]:
# feature 1
def cal_len_of_stay(data):
    """Return the total nights per booking as a list, in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``stays_in_week_nights`` and
        ``stays_in_weekend_nights``.

    Returns
    -------
    list
        ``stays_in_week_nights + stays_in_weekend_nights`` for every row, in
        the frame's current row order.
    """
    # Column arithmetic is positional per row, so it works for any index. The
    # previous ``data.loc[x, ...]`` loop over ``range(len(data))`` required the
    # index labels to be exactly 0..n-1 and raised KeyError or returned values
    # in label order for filtered or sorted frames.
    total_stay = data["stays_in_week_nights"] + data["stays_in_weekend_nights"]
    return total_stay.tolist()

In [23]:
def outliers_cap(data, percentile=0.95):
    """Cap every value above a percentile threshold at that threshold.

    The threshold is the element at position ``int(percentile * (n - 1))`` of
    the sorted values (no interpolation). Values at or below it are unchanged.

    Parameters
    ----------
    data : iterable
        Comparable values; may be a list, a pandas Series or a one-shot iterator.
    percentile : float, default 0.95
        Fraction in ``[0, 1]`` selecting the cap position in the sorted data.

    Returns
    -------
    list
        The capped values in their original order; ``[]`` for empty input.

    Raises
    ------
    ValueError
        If ``percentile`` is outside ``[0, 1]``.
    """
    if not 0 <= percentile <= 1:
        # A negative value would silently index from the end of the sorted list.
        raise ValueError(f"percentile must be between 0 and 1, got {percentile!r}")

    # Materialise once: the data is traversed twice (sort, then cap), which
    # would exhaust a generator after the first pass.
    values = list(data)
    if not values:
        return []

    ordered = sorted(values)
    data_cap = ordered[int(percentile * (len(ordered) - 1))]
    return [data_cap if x > data_cap else x for x in values]

In [24]:
def len_of_stay(data):
    """Add a capped ``len_of_stay`` column to ``data`` and return the same frame.

    The per-row totals come from ``cal_len_of_stay`` and are passed through
    ``outliers_cap`` with its default percentile. ``data`` is modified in place.
    """
    data["len_of_stay"] = outliers_cap(cal_len_of_stay(data))
    return data

In [25]:
# Test: add the capped `len_of_stay` column. len_of_stay() writes the column
# onto df2 in place and returns that same frame, which is what this cell displays.
len_of_stay(df2)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,customer_id,loyal_check,customer_segmentation,len_of_stay
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,Transient,0.00,0,0,Check-Out,2015-07-01,1,1,Couple,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,Transient,0.00,0,0,Check-Out,2015-07-01,1,1,Couple,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,Transient,75.00,0,0,Check-Out,2015-07-02,2,1,Solo,1
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,Transient,75.00,0,0,Check-Out,2015-07-02,3,1,Solo,1
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,Transient,98.00,0,1,Check-Out,2015-07-03,4,1,Couple,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87391,City Hotel,0,23,2017,August,35,30,2,5,2,...,Transient,96.14,0,0,Check-Out,2017-09-06,245,1,Couple,7
87392,City Hotel,0,102,2017,August,35,31,2,5,3,...,Transient,225.43,0,2,Check-Out,2017-09-07,156,1,Others,7
87393,City Hotel,0,34,2017,August,35,31,2,5,2,...,Transient,157.71,0,4,Check-Out,2017-09-07,247,1,Couple,7
87394,City Hotel,0,109,2017,August,35,31,2,5,2,...,Transient,104.40,0,0,Check-Out,2017-09-07,4,1,Couple,7


In [27]:
# feature 2

In [28]:
def negative_values(data):
    """Return a list copy of ``data`` with every value below zero replaced by 0.

    Values that are not less than zero are passed through unchanged.
    """
    return [0 if value < 0 else value for value in data]

In [29]:
def revenue_booking(data):
    """Add a ``revenue_booking`` column to ``data`` and return the same frame.

    Negative ``adr`` values are first replaced by 0 (written back to the
    ``adr`` column). Revenue is ``adr * len_of_stay`` for each row, and 0 for
    rows where ``is_canceled`` equals 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``adr``, ``len_of_stay`` and ``is_canceled``. Modified in
        place.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``adr`` clamped and ``revenue_booking`` added.
    """
    # Clamp negative rates to zero.
    data["adr"] = data["adr"].clip(lower=0)

    # Vectorised so the result does not depend on the index labels: the former
    # ``data.loc[x, ...]`` loop over ``range(len(data))`` only worked when the
    # index was exactly 0..n-1.
    revenue = data["adr"] * data["len_of_stay"]
    data["revenue_booking"] = revenue.mask(data["is_canceled"] == 1, 0)
    return data

In [30]:
# test
# revenue_booking() writes `adr` and `revenue_booking` onto df2 in place and
# returns that same frame, so revenue_dt refers to the same object as df2.
revenue_dt = revenue_booking(df2)
revenue_dt["revenue_booking"].describe()

count    87396.000000
mean       252.756124
std        300.035874
min          0.000000
25%          0.000000
50%        168.000000
75%        375.300000
max       3410.000000
Name: revenue_booking, dtype: float64

In [31]:
# feature 5

In [69]:
def segment_value(df, bins: int = 3, labels=None):
    """Bucket ``revenue_booking`` into labelled bands and store them on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``revenue_booking`` column. Modified in place.
    bins : int, default 3
        Passed to ``pandas.cut``; an integer requests that many equal-width
        bins over the observed revenue range.
    labels : sequence of str, optional
        One label per bin, lowest band first. Defaults to
        ``["Low", "Medium", "High"]``, which matches the default ``bins=3``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with a categorical ``segment_value`` column added.

    Raises
    ------
    ValueError
        If ``bins`` is an integer and the number of labels differs from it.
    """
    labels = ["Low", "Medium", "High"] if labels is None else list(labels)

    # The labels used to be hard-coded to three entries, so any other ``bins``
    # value failed inside pandas; check up front and say what is wrong.
    if isinstance(bins, int) and len(labels) != bins:
        raise ValueError(
            f"segment_value needs one label per bin: got {len(labels)} labels for {bins} bins"
        )

    df["segment_value"] = pd.cut(
        df["revenue_booking"], bins=bins, labels=labels, include_lowest=True
    ).astype("category")
    return df

In [70]:
def main(df):
    """Return the bookings sorted by ``revenue_booking`` with ``segment_value`` added.

    Sorting produces a new frame (ascending revenue); ``segment_value`` then
    adds the band column to that sorted frame and returns it.
    """
    by_revenue = df.sort_values(by="revenue_booking")
    return segment_value(by_revenue)

In [71]:
# Sort by revenue and attach the `segment_value` band column (see main()).
# Relies on `revenue_booking` already being present on df; it was added
# through the df2 alias when revenue_booking(df2) ran.
df = main(df)

Low / Medium / High bands are formed by splitting the range between the minimum and maximum booking revenue into three equal-width intervals.
Stable, easy-to-interpret thresholds (e.g., “0–200, 200–400, 400+”), which is convenient for pricing policies and dashboard presentation.
Business relevance: Clear price ladders for marketing and yield dashboards; good for rule-based actions (e.g., stricter deposit for High band).

In [72]:
# combined feature (5 and 6)

In [73]:
def high_loyalty(df):
    """Pick the membership tier for one booking in the "High" value segment.

    ``df`` is a single row-like mapping (this is called per row via
    ``loyalty_matrix``). Tiers, checked in order:

    * loyal_check == 1 and no previous cancellations -> "Black Member"
    * loyal_check == 1 and previous cancellations > 0 -> "Titanium Member"
    * loyal_check == 0 and revenue_booking > 200      -> "Diamond Member"
    * anything else                                    -> "Ambassador Member"
    """
    cancellations = df["previous_cancellations"]
    is_loyal = df["loyal_check"] == 1

    if (cancellations == 0) & is_loyal:
        return "Black Member"
    if (cancellations > 0) & is_loyal:
        return "Titanium Member"
    if (df["revenue_booking"] > 200) & (df["loyal_check"] == 0):
        return "Diamond Member"
    return "Ambassador Member"

In [74]:
def medium_loyalty(df):
    """Pick the membership tier for one booking in the "Medium" value segment.

    ``df`` is a single row-like mapping (this is called per row via
    ``loyalty_matrix``). Tiers, checked in order:

    * loyal_check == 1 and no previous cancellations -> "Platinum Member"
    * loyal_check == 1 and previous cancellations > 0 -> "Gold Member"
    * loyal_check == 0 and revenue_booking > 100      -> "Elite Member"
    * anything else                                    -> "Premier Member"
    """
    cancellations = df["previous_cancellations"]
    is_loyal = df["loyal_check"] == 1

    if (cancellations == 0) & is_loyal:
        return "Platinum Member"
    if (cancellations > 0) & is_loyal:
        return "Gold Member"
    if (df["revenue_booking"] > 100) & (df["loyal_check"] == 0):
        return "Elite Member"
    return "Premier Member"

In [75]:
def low_loyalty(df):
    """Pick the membership tier for one booking in the "Low" value segment.

    ``df`` is a single row-like mapping (this is called per row via
    ``loyalty_matrix``). Tiers, checked in order:

    * loyal_check == 1 and no previous cancellations -> "Silver Member"
    * loyal_check == 1 and previous cancellations > 0 -> "Classic Member"
    * loyal_check == 0 and revenue_booking > 50       -> "Plus Member"
    * anything else                                    -> "Member"
    """
    cancellations = df["previous_cancellations"]
    is_loyal = df["loyal_check"] == 1

    if (cancellations == 0) & is_loyal:
        return "Silver Member"
    if (cancellations > 0) & is_loyal:
        return "Classic Member"
    if (df["revenue_booking"] > 50) & (df["loyal_check"] == 0):
        return "Plus Member"
    return "Member"

In [76]:
def loyalty_matrix(df):
    """Return the membership tier for one booking row.

    Dispatches on the row's ``segment_value`` to ``high_loyalty``,
    ``medium_loyalty`` or ``low_loyalty``. Any other segment value yields
    ``"Unclassified"``.

    ``df`` is a single row-like mapping; ``assign`` applies this function with
    ``axis=1``.
    """
    segment = df["segment_value"]
    if segment == "High":
        return high_loyalty(df)
    elif segment == "Medium":
        return medium_loyalty(df)
    elif segment == "Low":
        return low_loyalty(df)
    else:
        # The fallback label was previously a bare expression with no
        # ``return``, so unmatched rows came back as None.
        return "Unclassified"

In [77]:
def assign(df):
    """Add a ``loyalty_matrix`` column to ``df`` and return the same frame.

    The tier for each row is computed by applying ``loyalty_matrix`` row-wise
    (``axis=1``). ``df`` is modified in place.
    """
    tiers = df.apply(loyalty_matrix, axis=1)
    df["loyalty_matrix"] = tiers
    return df

In [79]:
# test
# Build the column before inspecting it: no other visible cell calls assign(),
# so on a fresh kernel df has no `loyalty_matrix` column and the lookup below
# would raise KeyError. Re-running assign() recomputes the same values.
df = assign(df)
df["loyalty_matrix"].unique()

array(['Classic Member', 'Silver Member', 'Member', 'Plus Member',
       'Gold Member', 'Platinum Member', 'Elite Member', 'Black Member',
       'Titanium Member', 'Diamond Member'], dtype=object)