In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

# Load dataset
data = pd.read_csv("data.csv")

# Step 1: Data Preprocessing
print("Initial Data Info:")
print(data.info())
print("\nMissing Values:")
print(data.isnull().sum())

# Handle missing values (if any)
data.dropna(inplace=True)

# Remove whitespace from column names
data.columns = data.columns.str.strip()


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36285 entries, 0 to 36284
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Booking_ID                36285 non-null  object 
 1   number of adults          36285 non-null  int64  
 2   number of children        36285 non-null  int64  
 3   number of weekend nights  36285 non-null  int64  
 4   number of week nights     36285 non-null  int64  
 5   type of meal              36285 non-null  object 
 6   car parking space         36285 non-null  int64  
 7   room type                 36285 non-null  object 
 8   lead time                 36285 non-null  int64  
 9   market segment type       36285 non-null  object 
 10  repeated                  36285 non-null  int64  
 11  P-C                       36285 non-null  int64  
 12  P-not-C                   36285 non-null  int64  
 13  average price             36285 non-null  

In [3]:
# Step 2: Handling Outliers using IQR
num_cols = data.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]


In [8]:
# Step 3: Feature Engineering

data['adults_children'] = data['number of adults'] * data['number of children']
data['weekend_weeknights'] = data['number of weekend nights'] * data['number of week nights']

data['lead_time_squared'] = data['lead time'] ** 2
data['average_price_squared'] = data['average price'] ** 2

data['total_nights'] = data['number of weekend nights'] + data['number of week nights']
data['total_people'] = data['number of adults'] + data['number of children']

expected_features = ['number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'type of meal', 'car parking space', 'room type', 'lead time', 'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price', 'special requests', 'adults_children', 'weekend_weeknights', 'lead_time_squared', 'average_price_squared', 'total_nights', 'total_people']
existing_features = [col for col in expected_features if col in data.columns]
data = data[existing_features + ['booking status']]


In [9]:
# Step 4: Transforming Categorical Data
le = LabelEncoder()
data['booking status'] = le.fit_transform(data['booking status'])  # Binary Target
categorical_cols = ['type of meal', 'room type', 'market segment type']
categorical_cols = [col for col in categorical_cols if col in data.columns]
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

In [10]:

# Step 5: Train-Test Split
X = data.drop(columns=['booking status'])
y = data['booking status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [7]:

# Step 6: Modeling & Accuracy Calculation
# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))


KNN Accuracy: 0.8355356300141576
              precision    recall  f1-score   support

           0       0.79      0.73      0.76      1505
           1       0.86      0.89      0.88      2733

    accuracy                           0.84      4238
   macro avg       0.82      0.81      0.82      4238
weighted avg       0.83      0.84      0.83      4238

Logistic Regression Accuracy: 0.7954223690420009
              precision    recall  f1-score   support

           0       0.76      0.62      0.68      1505
           1       0.81      0.89      0.85      2733

    accuracy                           0.80      4238
   macro avg       0.79      0.75      0.77      4238
weighted avg       0.79      0.80      0.79      4238



In [10]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8728173666823974
              precision    recall  f1-score   support

           0       0.85      0.78      0.81      1505
           1       0.89      0.92      0.90      2733

    accuracy                           0.87      4238
   macro avg       0.87      0.85      0.86      4238
weighted avg       0.87      0.87      0.87      4238

