In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer


In [None]:
def eda_preprocess_regression(df, contamination=0.1, n_estimators=50, random_state=42):
    """
    Clean a hotel-bookings DataFrame so it can be used for ADR regression.

    Steps:
    1. Drop columns that are not useful or may cause data leakage.
    2. Impute missing values: median for float64/int64 columns, most frequent
       value for object columns. The target 'adr' is imputed as well when it
       is one of the float64/int64 columns.
    3. Remove rows with no guests (adults, children and babies all zero).
    4. Parse 'reservation_status_date' and replace it with year / month /
       day / weekday features.
    5. Remove rows flagged as outliers by IsolationForest, fitted on the
       float64/int64 columns other than 'adr'.
    6. One-hot encode the object columns (first level dropped, integer dummies).

    Every step is skipped when the columns it needs are absent. A frame with
    no rows passes through without imputation or outlier scoring instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings table. It is copied first; the caller's frame is not
        modified.
    contamination : float, default 0.1
        Forwarded to ``IsolationForest(contamination=...)``.
    n_estimators : int, default 50
        Forwarded to ``IsolationForest(n_estimators=...)``.
    random_state : int or None, default 42
        Forwarded to ``IsolationForest(random_state=...)``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Surviving rows keep their original index labels
        (the index is not reset).
    """
    df = df.copy()

    # Step 1: Drop unwanted columns (missing ones are ignored).
    drop_cols = ['is_canceled', 'booking_changes', 'assigned_room_type',
                 'reservation_status', 'agent', 'company', 'days_in_waiting_list']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Step 2: Impute missing values.
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()

    # The imputers cannot be fitted on zero rows, and there is nothing to
    # fill in that case, so an empty frame bypasses them.
    has_rows = len(df) > 0

    if num_cols and has_rows:
        imp_num = SimpleImputer(strategy='median')
        df[num_cols] = imp_num.fit_transform(df[num_cols])

    if cat_cols and has_rows:
        imp_cat = SimpleImputer(strategy='most_frequent')
        df[cat_cols] = imp_cat.fit_transform(df[cat_cols])

    # Step 3: Filter out rows with no guests.
    if set(['adults', 'children', 'babies']).issubset(df.columns):
        no_guest_filter = (df['adults'] == 0) & (df['children'] == 0) & (df['babies'] == 0)
        df = df.loc[~no_guest_filter]

    # Step 4: Replace 'reservation_status_date' with derived date features.
    # Unparseable values become NaT (errors='coerce'), which yields NaN features.
    if 'reservation_status_date' in df.columns:
        status_date = pd.to_datetime(df['reservation_status_date'], errors='coerce')
        # drop + assign builds a new frame, so the filtered frame from step 3
        # is never written to in place; the new columns land at the end.
        df = df.drop(columns=['reservation_status_date'], errors='ignore').assign(
            reservation_year=status_date.dt.year,
            reservation_month=status_date.dt.month,
            reservation_day=status_date.dt.day,
            reservation_weekday=status_date.dt.weekday,
        )

    # Step 5: Outlier handling with IsolationForest on numeric predictors
    # (the target 'adr' is excluded so it does not drive row removal).
    # NOTE(review): on pandas versions where `.dt.year` etc. return int32, the
    # date features above are not matched by this float64/int64 selector and
    # so take no part in outlier scoring -- confirm whether that is intended.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    predictor_cols = [col for col in numeric_cols if col != 'adr']

    if predictor_cols and len(df) > 0:
        # Best-effort step: if the forest cannot be fitted, the rows are left
        # unfiltered and the problem is reported on stdout.
        try:
            iforest = IsolationForest(n_estimators=n_estimators,
                                      contamination=contamination,
                                      random_state=random_state)
            outlier_preds = iforest.fit_predict(df[predictor_cols])
            df = df[outlier_preds != -1]  # fit_predict marks outliers with -1
        except Exception as e:
            print("Warning: IsolationForest encountered an error:", e)

    # Step 6: One-hot encode categorical variables.
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)

    return df


In [None]:
# Read the raw bookings table into `df_raw` and preview its first rows.
# The path targets a .zip archive rather than a bare CSV; pandas' default
# compression="infer" chooses the decompressor from the file extension.
raw_data_path = r"C:\Users\user\Desktop\Github\Hotel booking prediction\hotel-booking-prediction\data\archive.zip"
df_raw = pd.read_csv(filepath_or_buffer=raw_data_path)
df_raw.head(5)


In [None]:
# Run the full preprocessing pipeline on the raw frame; `df_raw` itself is
# left untouched because the function works on a copy.
df_clean = eda_preprocess_regression(df=df_raw)
df_clean.head(n=5)


In [None]:
# Persist the cleaned frame as CSV (row index omitted) and report the location.
cleaned_data_path = r"C:\Users\user\Desktop\Github\Hotel booking prediction\hotel-booking-prediction\data\cleaned_regression_data.csv"
df_clean.to_csv(path_or_buf=cleaned_data_path, index=False)
print("Cleaned data saved to:", cleaned_data_path)
