In [72]:
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as HTML
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [60]:
%matplotlib inline

In [61]:
data = pd.read_csv('/content/intern/first inten project.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36285 entries, 0 to 36284
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Booking_ID                36285 non-null  object 
 1   number of adults          36285 non-null  int64  
 2   number of children        36285 non-null  int64  
 3   number of weekend nights  36285 non-null  int64  
 4   number of week nights     36285 non-null  int64  
 5   type of meal              36285 non-null  object 
 6   car parking space         36285 non-null  int64  
 7   room type                 36285 non-null  object 
 8   lead time                 36285 non-null  int64  
 9   market segment type       36285 non-null  object 
 10  repeated                  36285 non-null  int64  
 11  P-C                       36285 non-null  int64  
 12  P-not-C                   36285 non-null  int64  
 13  average price             36285 non-null  float64
 14  specia

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


In [62]:
data.columns = data.columns.str.strip().str.lower().str.replace(" ", "_")

categorical_cols = ["type_of_meal", "room_type", "market_segment_type", "booking_status"]
data[categorical_cols] = data[categorical_cols].apply(lambda x: x.str.strip())

data["date_of_reservation"] = pd.to_datetime(data["date_of_reservation"], errors="coerce")

data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36285 entries, 0 to 36284
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   booking_id                36285 non-null  object        
 1   number_of_adults          36285 non-null  int64         
 2   number_of_children        36285 non-null  int64         
 3   number_of_weekend_nights  36285 non-null  int64         
 4   number_of_week_nights     36285 non-null  int64         
 5   type_of_meal              36285 non-null  object        
 6   car_parking_space         36285 non-null  int64         
 7   room_type                 36285 non-null  object        
 8   lead_time                 36285 non-null  int64         
 9   market_segment_type       36285 non-null  object        
 10  repeated                  36285 non-null  int64         
 11  p-c                       36285 non-null  int64         
 12  p-not-c           

Unnamed: 0,booking_id,number_of_adults,number_of_children,number_of_weekend_nights,number_of_week_nights,type_of_meal,car_parking_space,room_type,lead_time,market_segment_type,repeated,p-c,p-not-c,average_price,special_requests,date_of_reservation,booking_status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,2015-10-02,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,2018-11-06,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2018-02-28,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,2017-05-20,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,2018-04-11,Canceled


In [63]:
missing_values = data.isnull().sum()

data_types = data.dtypes

data_trimmed = data.copy()
for col in data_trimmed.select_dtypes(include=["object"]).columns:
    data_trimmed[col] = data_trimmed[col].str.strip()

changes_made = (data != data_trimmed).sum().sum()

print(missing_values)
print(data_types)
print(changes_made)


booking_id                   0
number_of_adults             0
number_of_children           0
number_of_weekend_nights     0
number_of_week_nights        0
type_of_meal                 0
car_parking_space            0
room_type                    0
lead_time                    0
market_segment_type          0
repeated                     0
p-c                          0
p-not-c                      0
average_price                0
special_requests             0
date_of_reservation         37
booking_status               0
dtype: int64
booking_id                          object
number_of_adults                     int64
number_of_children                   int64
number_of_weekend_nights             int64
number_of_week_nights                int64
type_of_meal                        object
car_parking_space                    int64
room_type                           object
lead_time                            int64
market_segment_type                 object
repeated                      

In [64]:
data.dropna(subset=['date_of_reservation'], inplace=True) #37 is a very low number of records in comparison with the total number

In [65]:
data.isnull().sum()

Unnamed: 0,0
booking_id,0
number_of_adults,0
number_of_children,0
number_of_weekend_nights,0
number_of_week_nights,0
type_of_meal,0
car_parking_space,0
room_type,0
lead_time,0
market_segment_type,0


In [66]:
numerical_cols = data.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_cols.remove("p-c")
numerical_cols.remove("p-not-c")

def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]


outliers = {col: detect_outliers_iqr(data, col).shape[0] for col in numerical_cols}
outliers

{'number_of_adults': 10155,
 'number_of_children': 2698,
 'number_of_weekend_nights': 21,
 'number_of_week_nights': 323,
 'car_parking_space': 1121,
 'lead_time': 1332,
 'repeated': 926,
 'average_price': 1692,
 'special_requests': 762}

In [67]:
def cap_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data[column] = np.clip(data[column], lower_bound, upper_bound)

for col in ["number_of_adults", "number_of_children", "lead_time", "average_price"]:
    cap_outliers_iqr(data, col)

outliers_after_capping = {col: detect_outliers_iqr(data, col).shape[0] for col in numerical_cols}

In [68]:
data = data.drop(columns=["booking_id", "date_of_reservation"])

data["booking_status"] = data["booking_status"].map({"Canceled": 1, "Not_Canceled": 0})

categorical_cols = ["type_of_meal", "room_type", "market_segment_type"]
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

data.head()

Unnamed: 0,number_of_adults,number_of_children,number_of_weekend_nights,number_of_week_nights,type_of_meal,car_parking_space,room_type,lead_time,market_segment_type,repeated,p-c,p-not-c,average_price,special_requests,booking_status
0,2,0,2,5,0,0,0,224.0,3,0,0,0,88.0,0,0
1,2,0,1,3,3,0,0,5.0,4,0,0,0,106.68,1,0
2,2,0,1,3,0,0,0,1.0,4,0,0,0,50.0,0,1
3,2,0,0,2,0,0,0,211.0,4,0,0,0,100.0,1,1
4,2,0,1,2,3,0,0,48.0,4,0,0,0,77.0,0,1


In [73]:
X_numeric = data.select_dtypes(include=["int64", "float64"])

vif_data = pd.DataFrame()
vif_data["Feature"] = X_numeric.columns
vif_data["VIF"] = [variance_inflation_factor(X_numeric.values, i) for i in range(len(X_numeric.columns))]

vif_data.sort_values(by="VIF", ascending=False)


  return 1 - self.ssr/self.centered_tss


Unnamed: 0,Feature,VIF
0,number_of_adults,42.137802
9,repeated,1.644755
11,p-not-c,1.598343
8,market_segment_type,1.560319
12,average_price,1.544242
14,booking_status,1.458288
6,room_type,1.363197
7,lead_time,1.345215
10,p-c,1.340213
13,special_requests,1.280331


In [74]:
X = data.drop(columns=["booking_status", "number_of_adults"]) #remove number of adults column as it has very high VIF
y = data["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((28998, 13), (7250, 13), (28998,), (7250,))

In [76]:
model = LogisticRegression(max_iter=5000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print("\n")
print(report)


0.7946206896551724


              precision    recall  f1-score   support

           0       0.82      0.89      0.85      4873
           1       0.73      0.60      0.66      2377

    accuracy                           0.79      7250
   macro avg       0.77      0.74      0.75      7250
weighted avg       0.79      0.79      0.79      7250

