| Column Name         | Description                                                                                     |
|---------------------|-------------------------------------------------------------------------------------------------|
| Booking_ID | Unique identifier for each booking.                                                               |
| number of adults           | The count of adult guests in the booking.                                                                              |
| number of children         |The count of child guests in the booking.                                                                            |
| number of weekend nights       | The number of nights during the weekend (typically Friday and Saturday nights).                                             |
| number of week nights             | The number of nights during the weekdays (Sunday through Thursday nights).                         |
| type of meal   | The meal plan included in the booking.                      |
| car parking space                 | Whether the booking includes a parking space for a car.                                                            |
| room type         | The type of room booked.                                                          |
| lead time     | Number of days that elapsed between date of the booking into the PMS and the arrival date                                                           |
| market segment type    | The market segment the booking belongs to                                                   |
| repeated     | Indicates whether the booking is a repeat booking.                                                    |
| P-C   | Probability of Cancelation - The likelihood that the booking will be canceled.                                                  |
| P-not-C    | Probability of Not Cancelation - The likelihood that the booking will not be canceled.                                                   |
| average price      | The average price per night for the booking.                                                              |
| special requests                | Any additional requests or requirements made by the guest.                                                         |
| date of reservation                 | Date when the booking was made.                                                        |
| booking status               | The current status of the booking (e.g., confirmed, canceled, pending).                                                                   |


In [29]:
import numpy as np
import pandas as pd
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV


from scipy.stats import zscore, skew



#Step 1 **Data Preprocessing**

In [2]:
df = pd.read_csv(r"C:\Users\youss\Desktop\eng\Machine_learning_course\Cellula technologies\first inten project.csv")
df

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.00,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.00,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.00,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.00,0,4/11/2018,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36280,INN36282,2,0,0,2,Meal Plan 2,0,Room_Type 1,346,Online,0,0,0,115.00,1,9/13/2018,Canceled
36281,INN36283,2,0,1,3,Meal Plan 1,0,Room_Type 1,34,Online,0,0,0,107.55,1,10/15/2017,Not_Canceled
36282,INN36284,2,0,1,3,Meal Plan 1,0,Room_Type 4,83,Online,0,0,0,105.61,1,12/26/2018,Not_Canceled
36283,INN36285,3,0,0,4,Meal Plan 1,0,Room_Type 1,121,Offline,0,0,0,96.90,1,7/6/2018,Not_Canceled


In [3]:
df.rename(columns ={"average price " : "average price" }, inplace= True)    # for the extra white space
df.describe()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,special requests
count,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0
mean,1.844839,0.10536,0.810693,2.204602,0.030977,85.239851,0.02563,0.023343,0.153369,103.421636,0.619733
std,0.518813,0.402704,0.87059,1.410946,0.173258,85.938796,0.158032,0.368281,1.753931,35.086469,0.786262
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,1.0,13.0,58.0,540.0,5.0


In [4]:
print(f"The size of my Dataset: {df.shape}")

The size of my Dataset: (36285, 17)


##1.1 Check Null

In [5]:
has_null = df.isnull().values.any()  # False
print(f"DataFrame has null values: {has_null}")

DataFrame has null values: False


##1.2 Data Type

In [6]:
df.dtypes

Booking_ID                   object
number of adults              int64
number of children            int64
number of weekend nights      int64
number of week nights         int64
type of meal                 object
car parking space             int64
room type                    object
lead time                     int64
market segment type          object
repeated                      int64
P-C                           int64
P-not-C                       int64
average price               float64
special requests              int64
date of reservation          object
booking status               object
dtype: object

Each Column has its appropriate data type

In [7]:
# Check for duplicates in the entire DataFrame
duplicates = df[df.duplicated()]

print("Duplicate rows in the DataFrame:")
print(duplicates)

Duplicate rows in the DataFrame:
Empty DataFrame
Columns: [Booking_ID, number of adults, number of children, number of weekend nights, number of week nights, type of meal, car parking space, room type, lead time, market segment type, repeated, P-C, P-not-C, average price, special requests, date of reservation, booking status]
Index: []


Note: Data has no dublicates

#Step 2 **Check an Handle Outliers**

##2.1 Check Outliers

In [8]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object'])

skewness_values = df[numeric_cols].apply(skew)
print("Skewness of numeric columns:")
print(skewness_values)

Skewness of numeric columns:
number of adults            -0.332819
number of children           4.707481
number of weekend nights     0.737605
number of week nights        1.598619
car parking space            5.414240
lead time                    1.292382
repeated                     6.003535
P-C                         25.202317
P-not-C                     19.252059
average price                0.667158
special requests             1.145051
dtype: float64


Notice the high Skewness in the data.
To handle outliers we have 2 methods:

1.   Use ***Zscore*** for normal distributed data

2.   Use ***IQR*** for skewed data




In the following code we check for skweness and apply the suitable method (using a threshold of 0.5)

In [9]:
numeric_cont_cols = ['lead time', 'average price']
skewed_cols = df[numeric_cont_cols].apply(skew)

outliers_zscore = pd.DataFrame()
outliers_IQR = pd.DataFrame()

for col in numeric_cont_cols:

  if(np.abs(skewed_cols[col]) < 0.5):
    # Use Z-score for near-normal distributions
    z_scores = np.abs(zscore(df[col]))
    outliers_rows = df.loc[z_scores > 4, col]
    # Store the outliers rows with its outlier value for any further processing
    outliers_zscore = pd.concat([outliers_zscore, outliers_rows], axis=1)       #column wise

  else:
    Q1 = df[col].quantile(.25)
    Q3 = df[col].quantile(.75)
    IQR = Q3 - Q1
    # Used factor 2 instead of 1.5 due to the high percentage of outliers
    outliers_col = df.loc[(df[col] < (Q1 - 2 * IQR)) | (df[col] > (Q3 + 2 * IQR)), col]
    outliers_IQR = pd.concat([outliers_IQR, outliers_col], axis=1)


# Combine outlier indices from both methods
outlier_indices_zscore = outliers_zscore.index
outlier_indices_IQR = outliers_IQR.index
total_outlier_indices = outlier_indices_zscore.union(outlier_indices_IQR)

# Remove outliers from the original DataFrame
df_cleaned = df.drop(total_outlier_indices)

print("Outliers detected using Z-score:")
print(outliers_zscore)
print("\nOutliers detected using IQR:")
print(outliers_IQR)

outliers_count = len(total_outlier_indices)
total_rows = df.shape[0]
cleaned_rows = df_cleaned.shape[0]
percentage_outliers = (outliers_count / total_rows) * 100


print(f'Total number of rows has outliers: "{outliers_count}" ')
print(f'The size of the Dataset after extracting outliers: "{df_cleaned.shape}"')

print(f'The percentage of outliers removed: {percentage_outliers:.2f}%')
df = df_cleaned

Outliers detected using Z-score:
Empty DataFrame
Columns: []
Index: []

Outliers detected using IQR:
       lead time  average price
5          346.0            NaN
118        443.0            NaN
199        418.0            NaN
205        433.0            NaN
369        386.0            NaN
...          ...            ...
36217        NaN            0.0
36221        NaN          207.9
36227        NaN          231.0
36250        NaN            0.0
36269        NaN          216.0

[1442 rows x 2 columns]
Total number of rows has outliers: "1442" 
The size of the Dataset after extracting outliers: "(34843, 17)"
The percentage of outliers removed: 3.97%


Note:


*   Low Percentage: In many cases, datasets with well-behaved, normally distributed data might have outlier percentages in the range of 1% to 5%.
*   Higher Percentage: Datasets with skewed distributions or data prone to extreme values (e.g., income data, social media metrics) might have outlier percentages exceeding 5% or more.

-ChatGPT



#Step 3 **Feature Engineering**

In [10]:
df['date of reservation'] = pd.to_datetime(df['date of reservation'], format='%m/%d/%Y',errors ='coerce')
df.drop(columns=['Booking_ID'], inplace=True)

# Extract date features
df['reservation_year'] = df['date of reservation'].dt.year
df['reservation_month'] = df['date of reservation'].dt.month
df['reservation_day'] = df['date of reservation'].dt.day
df['reservation_day_of_week'] = df['date of reservation'].dt.dayofweek
df['is_weekend'] = df['reservation_day_of_week'].apply(lambda x: 1 if x in [4,5] else 0)      # for friday and saturday
df.drop(columns=['date of reservation'], inplace=True)

# Create new features
df['stay_duration'] = df['number of weekend nights'] + df['number of week nights']
df['total_guests'] = df['number of adults'] + df['number of children']

# Handle missing values if any
num_rows_with_nan = df.isna().any(axis=1).sum()
print(f"DataFrame has null values: {num_rows_with_nan}")
if num_rows_with_nan:
  df.dropna(inplace=True)


DataFrame has null values: 37


#Step 4: **Transformation For The Categorical Data**

In [11]:
# First encode the binary categories
df['booking status'] = df['booking status'].map({'Canceled': 0, 'Not_Canceled': 1})

# Second encode the multivalue categories
tmp_cols = df.select_dtypes(include='object').columns
encoded_df = pd.get_dummies(data = df, prefix = tmp_cols, columns = tmp_cols)
del(tmp_cols)

In [12]:
encoded_df

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,...,room type_Room_Type 3,room type_Room_Type 4,room type_Room_Type 5,room type_Room_Type 6,room type_Room_Type 7,market segment type_Aviation,market segment type_Complementary,market segment type_Corporate,market segment type_Offline,market segment type_Online
0,1,1,2,5,0,224,0,0,0,88.00,...,False,False,False,False,False,False,False,False,True,False
1,1,0,1,3,0,5,0,0,0,106.68,...,False,False,False,False,False,False,False,False,False,True
2,2,1,1,3,0,1,0,0,0,50.00,...,False,False,False,False,False,False,False,False,False,True
3,1,0,0,2,0,211,0,0,0,100.00,...,False,False,False,False,False,False,False,False,False,True
4,1,0,1,2,0,48,0,0,0,77.00,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36279,2,0,1,1,0,48,0,0,0,94.50,...,False,False,False,False,False,False,False,False,False,True
36281,2,0,1,3,0,34,0,0,0,107.55,...,False,False,False,False,False,False,False,False,False,True
36282,2,0,1,3,0,83,0,0,0,105.61,...,False,True,False,False,False,False,False,False,False,True
36283,3,0,0,4,0,121,0,0,0,96.90,...,False,False,False,False,False,False,False,False,True,False


In [13]:
encoded_df.dtypes

number of adults                       int64
number of children                     int64
number of weekend nights               int64
number of week nights                  int64
car parking space                      int64
lead time                              int64
repeated                               int64
P-C                                    int64
P-not-C                                int64
average price                        float64
special requests                       int64
booking status                         int64
reservation_year                     float64
reservation_month                    float64
reservation_day                      float64
reservation_day_of_week              float64
is_weekend                             int64
stay_duration                          int64
total_guests                           int64
type of meal_Meal Plan 1                bool
type of meal_Meal Plan 2                bool
type of meal_Not Selected               bool
room type_

In [14]:
num_rows_with_nan = encoded_df.isna().any(axis=1).sum()
print(f"DataFrame has null values: {num_rows_with_nan}")
# if num_rows_with_nan:
#   df.dropna(inplace=True)

DataFrame has null values: 0


#Step 5: **Train Test Split**

In [15]:
y = df['booking status'].to_numpy()

X = np.array(encoded_df.drop(columns='booking status', inplace = False))
print(X.shape)
print(y.shape)

(34806, 33)
(34806,)


In [16]:
X_train, X_, y_train, y_ = train_test_split(X, y, train_size = 0.8)
X_cv, X_test, y_cv, y_test = train_test_split(X_,y_, train_size = 0.5)
del(X_,y_)

In [17]:
print("X train shape: ",X_train.shape)
print("y train shape: ",y_train.shape)

print("X CV shape:",X_cv.shape)
print("X CV shape:",y_cv.shape)

print("X test shape:",X_test.shape)
print("X test shape:",y_test.shape)


X train shape:  (27844, 33)
y train shape:  (27844,)
X CV shape: (3481, 33)
X CV shape: (3481,)
X test shape: (3481, 33)
X test shape: (3481,)


#Step 6: **Modeling And Accuracy Calculation**

In [19]:
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


For a simple model without any scaling or polynomial features the Score is as following:

In [22]:

yhat = lr_model.predict(X_train)
print("Accuracy on training set:", lr_model.score(X_train,y_train))

Accuracy on training set: 0.7949288895273667


With adding Scaling and polynomial features:

In [24]:
#Add polynomial feature
poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_mapped = poly.fit_transform(X_train)
X_cv_mapped = poly.transform(X_cv)
X_test_mapped = poly.transform(X_test)

In [25]:
# Scale the features using the z-score
scaler = StandardScaler()
X_train_mapped_scaled = scaler.fit_transform(X_train_mapped)
X_cv_mapped_scaled = scaler.transform(X_cv_mapped)
X_test_mapped_scaled = scaler.transform(X_test_mapped)

In [30]:
newModel = LogisticRegression()
newModel.fit(X_train_mapped_scaled,y_train)

yhat = newModel.predict(X_train_mapped_scaled)
print("Training error: ", mean_squared_error(y_train,yhat) / 2)

yhat = newModel.predict(X_cv_mapped_scaled)
print("Cross-Validation error: ", mean_squared_error(y_cv,yhat) / 2)

print("Accuracy on training set:", newModel.score(X_test_mapped_scaled,y_test))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training error:  0.07852679212756788
Cross-Validation error:  0.08330939385234128
Accuracy on training set: 0.8287848319448434


In [37]:
models=[]
polys=[]
scalers=[]
train_mses=[]
cv_mses=[]

for degree in range(1,5):
  #Poly
  poly=PolynomialFeatures(degree=degree,include_bias=False)
  X_train_mapped = poly.fit_transform(X_train)
  polys.append(poly)

  #Scaler
  scaler=StandardScaler()
  X_train_mapped_scaled = scaler.fit_transform(X_train_mapped)
  scalers.append(scaler)

  #Model
  model = LogisticRegression()
  model.fit(X_train_mapped_scaled,y_train)
  models.append(model)

  #Training_mse
  yhat=model.predict(X_train_mapped_scaled)
  train_mse = mean_squared_error(y_train,yhat) / 2
  train_mses.append(train_mse)


  #poly, scaler CV
  X_cv_mapped = poly.transform(X_cv)
  X_cv_mapped_scaled = scaler.transform(X_cv_mapped)

  #CV_mse
  yhat=model.predict(X_cv_mapped_scaled)
  cv_mse = mean_squared_error(y_cv,yhat) / 2
  cv_mses.append(cv_mse)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [38]:
degree = np.argmin(cv_mses) + 1
print(f"Lowest CV MSE is found in the model with degree= {degree}")

Lowest CV MSE is found in the model with degree= 4


In [19]:
# Add polynomial features to the test set
X_test_mapped = polys[degree-1].transform(X_test)

# Scale the test set
X_test_mapped_scaled = scalers[degree-1].transform(X_test_mapped)

# Compute the test MSE
yhat = models[degree-1].predict(X_test_mapped_scaled)
test_mse = mean_squared_error(y_test, yhat) / 2

print(f"Training MSE: {train_mses[degree-1]:.2f}")
print(f"Cross Validation MSE: {cv_mses[degree-1]:.2f}")
print(f"Test MSE: {test_mse:.2f}")

NameError: name 'polys' is not defined

In [40]:
mymodel = models[degree-1]

In [41]:
mymodel.score(X_train_mapped_scaled,y_train)

0.8550854762246803

# ***Decision Tree***

In [25]:
clf = DecisionTreeClassifier()
clf.fit(X_train_mapped_scaled, y_train)
treePrediction = clf.predict(X_train_mapped_scaled)
print("Accuracy:", clf.score(X_train_mapped_scaled,y_train))
ypred = clf.predict(X_test_mapped_scaled)
print(classification_report(y_test,ypred))

Accuracy: 0.9942536991811521
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1130
           1       0.90      0.89      0.90      2351

    accuracy                           0.86      3481
   macro avg       0.84      0.84      0.84      3481
weighted avg       0.86      0.86      0.86      3481



# ***Gradient Boosting Classifier***

In [39]:
clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
    max_depth=15, random_state=0).fit(X_train_mapped_scaled, y_train)

In [None]:
# Accuracy calculation
ypred = clf2.predict(X_test_mapped_scaled)
print("Accuracy: ",accuracy_score(y_test,ypred))
print(classification_report(y_test,ypred))


Accuracy:  0.843723068083884
              precision    recall  f1-score   support

           0       0.79      0.71      0.75      1130
           1       0.87      0.91      0.89      2351

    accuracy                           0.84      3481
   macro avg       0.83      0.81      0.82      3481
weighted avg       0.84      0.84      0.84      3481



# ***Random Forest Classifier***

In [26]:
rfc = RandomForestClassifier()
rfc.fit(X_train_mapped_scaled,y_train)
y_pred = rfc.predict(X_test_mapped_scaled)
# Accuracy calculation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.8939959781671933
              precision    recall  f1-score   support

           0       0.87      0.79      0.83      1133
           1       0.90      0.95      0.92      2348

    accuracy                           0.89      3481
   macro avg       0.89      0.87      0.88      3481
weighted avg       0.89      0.89      0.89      3481



In [30]:
with open('model.pkl', 'wb') as f:  # open a text file
    pickle.dump(rfc, f) # serialize the list

In [31]:
with open('model.pkl', 'rb') as f:

    pmodel = pickle.load(f) # deserialize using load()


In [33]:
xyz = pmodel.predict(X_test_mapped_scaled)
xyz


array([1, 0, 1, ..., 1, 1, 1], dtype=int64)