In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
import joblib

In [22]:

# Location of the pickled list of post-encoding column names; the
# preprocessing function reads it, or writes it when the file is missing.
DUMMY_COLUMNS_PATH = (
    r"C:\Users\user\Desktop\Github\Hotel booking prediction"
    r"\hotel-booking-prediction\models\dummy_columns.pkl"
)

def _impute_columns(df, cols, strategy):
    """Impute `cols` of `df` in place with a freshly fitted SimpleImputer.

    Columns without a single observed value are skipped (left as NaN):
    SimpleImputer discards such columns by default, so its output would be
    narrower than `cols` and the column assignment below would fail.
    """
    usable = [col for col in cols if df[col].notna().any()]
    if usable:
        df[usable] = SimpleImputer(strategy=strategy).fit_transform(df[usable])


def eda_preprocess_regression(df, dummy_columns_path=None):
    """
    Performs end-to-end EDA preprocessing for ADR regression.

    Steps:
    1. Drop columns that are not useful or may cause data leakage.
    2. Impute missing values (median for numeric, mode for categorical).
    3. Filter out rows with no guests (adults, children, and babies are all zero).
    4. Convert 'reservation_status_date' to datetime and extract date features.
    5. Remove outliers using IsolationForest on numeric columns (excluding target 'adr').
    6. One-hot encode categorical variables.
    7. Align the columns with the list stored at `dummy_columns_path`:
       missing columns are added as zeros, columns not in the list are
       dropped, and the stored order is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data. It is copied first; the caller's frame is not modified.
    dummy_columns_path : str or path-like, optional
        File holding the saved list of column names. Defaults to the
        module-level DUMMY_COLUMNS_PATH.

    Returns
    -------
    pandas.DataFrame
        A cleaned DataFrame ready for regression modeling.

    Side effects
    ------------
    If `dummy_columns_path` does not exist, a warning is printed and the
    column names of the current result are written to that path.

    Notes
    -----
    The imputers and the IsolationForest are fitted on the frame passed in,
    on every call. Step 5 is best-effort: any exception it raises is printed
    and the rows are kept unfiltered.
    """
    if dummy_columns_path is None:
        dummy_columns_path = DUMMY_COLUMNS_PATH

    df = df.copy()

    # Step 1: Drop unwanted columns
    drop_cols = ['is_canceled', 'booking_changes', 'assigned_room_type',
                 'reservation_status', 'agent', 'company', 'days_in_waiting_list']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Step 2: Impute missing values
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    _impute_columns(df, num_cols, 'median')
    _impute_columns(df, cat_cols, 'most_frequent')

    # Step 3: Filter out rows with no guests
    if {'adults', 'children', 'babies'}.issubset(df.columns):
        no_guest_filter = (df['adults'] == 0) & (df['children'] == 0) & (df['babies'] == 0)
        # Copy so the column assignments below act on an independent frame
        # rather than on a slice (avoids SettingWithCopyWarning).
        df = df[~no_guest_filter].copy()

    # Step 4: Convert 'reservation_status_date' to datetime and extract date features
    if 'reservation_status_date' in df.columns:
        # Unparseable dates become NaT, so the derived features are NaN for them.
        status_date = pd.to_datetime(df['reservation_status_date'], errors='coerce')
        df['reservation_year'] = status_date.dt.year
        df['reservation_month'] = status_date.dt.month
        df['reservation_day'] = status_date.dt.day
        df['reservation_weekday'] = status_date.dt.weekday
        df = df.drop(columns=['reservation_status_date'])

    # Step 5: Outlier handling using IsolationForest on numeric predictors (excluding target 'adr')
    predictor_cols = [
        col for col in df.select_dtypes(include=['float64', 'int64']).columns
        if col != 'adr'
    ]

    try:
        if predictor_cols:
            iforest = IsolationForest(n_estimators=50, contamination=0.1, random_state=42)
            outlier_preds = iforest.fit_predict(df[predictor_cols])
            # fit_predict labels outliers as -1; keep everything else.
            df = df[outlier_preds != -1].copy()
    except Exception as e:
        # Deliberately best-effort: report the problem and continue unfiltered.
        print("Warning: IsolationForest encountered an error:", e)

    # Step 6: One-hot encode categorical variables
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=False, dtype=int)

    # Step 7: Ensure all dummy columns are present (important for inference!)
    try:
        # NOTE: joblib.load unpickles the file; only point this at trusted files.
        saved_dummy_columns = joblib.load(dummy_columns_path)
    except FileNotFoundError:
        print(f"Warning: {dummy_columns_path} not found. Saving current columns for future reference.")
        joblib.dump(df.columns.tolist(), dummy_columns_path)
    else:
        # One reindex adds the missing columns as zeros, drops columns that are
        # not in the saved list, and applies the saved order.
        df = df.reindex(columns=saved_dummy_columns, fill_value=0)

    return df


In [23]:
# Read the raw bookings table; the CSV is loaded straight from the zip archive.
raw_data_path = (
    r"C:\Users\user\Desktop\Github\Hotel booking prediction"
    r"\hotel-booking-prediction\data\archive.zip"
)
df_raw = pd.read_csv(filepath_or_buffer=raw_data_path)
df_raw.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [24]:
# Run the preprocessing pipeline on the raw bookings and preview the result.
df_clean = eda_preprocess_regression(df=df_raw)
df_clean.head()






Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,342.0,2015.0,27.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,737.0,2015.0,27.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,7.0,2015.0,27.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,13.0,2015.0,27.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,14.0,2015.0,27.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [25]:
# Write the preprocessed frame to disk as CSV, without the row index.
cleaned_data_path = (
    r"C:\Users\user\Desktop\Github\Hotel booking prediction"
    r"\hotel-booking-prediction\data\cleaned_regression_data.csv"
)
df_clean.to_csv(path_or_buf=cleaned_data_path, index=False)
print("Cleaned data saved to:", cleaned_data_path)


Cleaned data saved to: C:\Users\user\Desktop\Github\Hotel booking prediction\hotel-booking-prediction\data\cleaned_regression_data.csv
