In [2]:
#Import Required Libraries
import pandas as pd
import numpy as np
from scipy.stats import zscore

#Load Dataset from GitHub
url = "https://raw.githubusercontent.com/swapnilsaurav/Dataset/master/hotel_bookings.csv"
df = pd.read_csv(url)

print(df.shape)
df.head()

#Select Numeric Columns for Outlier Detection
numeric_cols = [
    "lead_time",
    "adr",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies"
]

df_num = df[numeric_cols].copy()

#Z-Score Based Outlier Detection
z_scores = np.abs(zscore(df_num, nan_policy="omit"))
z_outliers = (z_scores > 3)

df["Z_Outlier"] = z_outliers.any(axis=1)
df["Z_Outlier"].value_counts()

Q1 = df_num.quantile(0.25)
Q3 = df_num.quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


iqr_outliers = ((df_num < lower_bound) | (df_num > upper_bound))

df["IQR_Outlier"] = iqr_outliers.any(axis=1)

df["IQR_Outlier"].value_counts()

#Compare Z-Score vs IQR
comparison = pd.crosstab(
    df["Z_Outlier"],
    df["IQR_Outlier"],
    rownames=["Z-Score"],
    colnames=["IQR"]
)

print("Comparison:\n",comparison)

df_outliers = df[df["Z_Outlier"] | df["IQR_Outlier"]]

df_outliers[
    ["lead_time", "adr", "adults", "children", "babies",
     "Z_Outlier", "IQR_Outlier"]
].head(10)




(119390, 33)
Comparison:
 IQR      False  True 
Z-Score              
False    75006  35384
True       321   8679


Unnamed: 0,lead_time,adr,adults,children,babies,Z_Outlier,IQR_Outlier
1,737.0,0.0,2.0,0.0,0.0,True,True
2,7.0,75.0,1.0,0.0,0.0,False,True
3,13.0,75.0,1.0,0.0,0.0,False,True
13,18.0,154.77,2.0,1.0,0.0,False,True
20,37.0,97.29,1.0,0.0,0.0,False,True
30,118.0,62.0,1.0,0.0,0.0,True,True
31,95.0,63.86,2.0,0.0,0.0,True,True
32,96.0,108.3,2.0,0.0,0.0,False,True
34,45.0,108.8,3.0,0.0,0.0,False,True
35,40.0,108.8,3.0,0.0,0.0,False,True
