In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
# Load dataset
df = pd.read_csv("C:/Users/YST PC/Documents/Berkeley Cert/Capstone Project/dataset/hotel_bookings.csv")

In [30]:
# Step 3: Handle missing values

# We clean the data by addressing missing values:
#	Drop the company column because it has too many missing values.
#	Fill missing values in agent with 'unknown'.
#	Fill country with the most frequent value (mode).
#	Fill children with the median value.

df.drop(columns=['company'])
df['agent'].fillna('unknown')
df['country'].fillna(df['country'].mode()[0])
df['children'].fillna(df['children'].median())

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
119385    0.0
119386    0.0
119387    0.0
119388    0.0
119389    0.0
Name: children, Length: 119390, dtype: float64

In [32]:
# Step 4: Drop irrelevant columns
# We remove columns that don't add predictive value or are redundant, such as reservation_status_date and arrival_date_* columns.
df.drop(columns=[
    'reservation_status_date', 'reservation_status',
    'arrival_date_year', 'arrival_date_month',
    'arrival_date_day_of_month', 'arrival_date_week_number'
], inplace=True)

In [34]:
# Step 5: Reduce cardinality for high-cardinality columns
# We simplify high-cardinality columns like agent and country by keeping only the top 10 most frequent values and labeling the rest as 'Other'. 
# This avoids overfitting.

df['agent'] = df['agent'].astype(str)
top_agents = df['agent'].value_counts().nlargest(10).index
df['agent'] = df['agent'].apply(lambda x: x if x in top_agents else 'Other')

top_countries = df['country'].value_counts().nlargest(10).index
df['country'] = df['country'].apply(lambda x: x if x in top_countries else 'Other')

In [36]:
# Step 6: One-hot encode categorical variables
# convert categorical variables into numeric format using one-hot encoding
df_encoded = pd.get_dummies(df, drop_first=True)

In [38]:
# Step 7: Split data into features and target
X = df_encoded.drop('is_canceled', axis=1)
y = df_encoded['is_canceled']

In [54]:
# Validating if X contain NaN value
X = X.fillna(0)
print(X.isnull().sum().sum())

0


In [56]:
# Step 8: Train-test split
# 	80% for training
#   20% for testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [58]:
# Step 9: Scale features
# Normalize the features using StandardScaler to ensure all numeric features have similar scale
# It  helps logistic regression converge more effectively

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [60]:
# Step 10: Train logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

In [64]:
# Step 11: Make predictions and evaluate
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.824022112404724
Confusion Matrix:
 [[13599  1434]
 [ 2768  6077]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.87     15033
           1       0.81      0.69      0.74      8845

    accuracy                           0.82     23878
   macro avg       0.82      0.80      0.80     23878
weighted avg       0.82      0.82      0.82     23878



In [70]:
# Calculate MSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(mse)

0.175977887595276
