In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, roc_auc_score

## Meri: data loading, preprocessing pipeline, and model training

In [None]:
from google.colab import drive

# Attach Google Drive to this Colab runtime; the data and model paths used
# later in the notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Location of the training CSV on the mounted Drive.
file_path = '/content/drive/My Drive/BT4301 Deployment/hotel_bookings_train_data.csv'

# Read the CSV and give the unnamed leading column a meaningful name.
# rename() returns a new frame, so no in-place mutation is needed.
hotel_bookings_train_data = pd.read_csv(file_path).rename(
    columns={'Unnamed: 0': 'booking_id'}
)

In [None]:
# Peek at the first two rows to confirm the load and the column rename.
hotel_bookings_train_data.head(2)

Unnamed: 0,booking_id,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,booking_date
0,0,Resort Hotel,0,342,2015,July,27,1,0,0,...,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,2014-07-24
1,1,Resort Hotel,0,737,2015,July,27,1,0,0,...,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,2013-06-24


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np


class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw hotel-booking rows into the all-numeric frame fed to the classifier.

    Nothing is learned from the data (``fit`` is a no-op); ``transform``
    applies a fixed set of rules:

    1. Drop the first column *by position* and replace missing values with -1.
    2. Remove bookings whose ``adults``, ``children`` and ``babies`` are all 0.
    3. Drop a fixed list of columns that are not used as features.
    4. For object-dtype columns: split ``reservation_status_date`` into
       ``year``/``month``/``day`` and integer-encode the known categoricals
       with fixed lookup tables (values missing from a table become -1).
    5. For int64/float64 columns: apply ``log(x + 1)`` to a fixed subset.
    6. Concatenate both parts and replace any remaining NaN with 0.

    Caveats
    -------
    * Step 2 removes rows, so the output can be shorter than the input.
      When this class is used inside a ``Pipeline`` together with ``y``,
      the caller has to remove those rows beforehand so X and y stay aligned.
    * ``arrival_date`` and ``booking_date`` are parsed to datetimes and are
      therefore picked up by neither the object nor the int64/float64
      selection: they do not appear in the output.
    * NOTE(review): ``reservation_status_date`` is presumably only known once
      the booking outcome is known; confirm it is available at prediction
      time, otherwise the derived year/month/day features leak the target.
    """

    # Columns removed outright before any encoding.
    _DROPPED_COLUMNS = ['days_in_waiting_list', 'arrival_date_year', 'assigned_room_type',
                        'booking_changes', 'reservation_status', 'country']

    # Numeric columns that receive the log(x + 1) transformation.
    _LOG_COLUMNS = ['lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
                    'agent', 'adr']

    # Fixed label -> integer code tables. The codes are arbitrary labels; they
    # are kept exactly as they were (including the unused code 2 for
    # deposit_type) so already-fitted models keep seeing the same encoding.
    _CATEGORY_CODES = {
        'hotel': {'Resort Hotel': 0, 'City Hotel': 1},
        'meal': {'BB': 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
        'market_segment': {'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7},
        'distribution_channel': {'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                 'GDS': 4},
        'reserved_room_type': {'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                               'L': 7, 'B': 8},
        'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
        'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
        'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
    }

    def fit(self, X, y=None):
        """No-op: the transformation uses only fixed rules. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply the fixed preprocessing rules and return a new DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw booking rows. Must contain ``children``, ``adults``,
            ``babies``, ``arrival_date``, ``booking_date`` and every column
            listed in ``_DROPPED_COLUMNS``. The first column is discarded by
            position. The input frame itself is not modified.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Encoded categorical columns followed by the numeric columns,
            with no NaN or infinite values produced by this method.
        """
        # drop()/fillna() return new frames, so the caller's data stays untouched.
        frame = X.drop(columns=X.columns[0]).fillna(-1)

        # Bookings without any guest are removed (this changes the row count).
        no_guests = (frame['children'] == 0) & (frame['adults'] == 0) & (frame['babies'] == 0)
        frame = frame.loc[~no_guests].drop(columns=self._DROPPED_COLUMNS)

        # After parsing, these two columns are datetime-typed and therefore
        # excluded from both selections below.
        frame["arrival_date"] = pd.to_datetime(frame["arrival_date"])
        frame["booking_date"] = pd.to_datetime(frame["booking_date"])

        # ---- categorical part -------------------------------------------
        object_cols = [col for col in frame.columns if frame[col].dtype == 'O']
        # Explicit copy: we add and overwrite columns on this frame below.
        cat_df = frame[object_cols].copy()

        if 'reservation_status_date' in cat_df:
            status_date = pd.to_datetime(cat_df['reservation_status_date'])
            cat_df['year'] = status_date.dt.year
            cat_df['month'] = status_date.dt.month
            cat_df['day'] = status_date.dt.day
            cat_df = cat_df.drop(columns=['reservation_status_date', 'arrival_date_month'])

        for col, codes in self._CATEGORY_CODES.items():
            if col in cat_df:
                # Labels that are not in the table map to NaN and then to -1.
                cat_df[col] = cat_df[col].map(codes).fillna(-1)

        # ---- numerical part ---------------------------------------------
        num_df = frame.select_dtypes(include=['int64', 'float64']).copy()

        # log(x + 1) is -inf for x == -1 (which is exactly what the missing
        # value fill above produces) and NaN for x < -1. Turn the infinities
        # into NaN so the final fillna(0) handles every invalid value the same
        # way instead of handing +/-inf to the estimator.
        with np.errstate(divide='ignore', invalid='ignore'):
            for col in self._LOG_COLUMNS:
                if col in num_df:
                    logged = np.log(num_df[col] + 1)
                    num_df[col] = logged.replace([np.inf, -np.inf], np.nan)

        # Categorical columns first, then numeric ones; leftover NaN -> 0.
        return pd.concat([cat_df, num_df], axis=1).fillna(0)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier

# Define the pipeline: fixed-rule preprocessing followed by an AdaBoost classifier.
# random_state is pinned (same seed as the train/test split) so that refitting
# on the same data reproduces the same model and the same reported metrics.
model_pipe = Pipeline(steps=[
    ('preprocessor', CustomPreprocessor()),
    ('classifier', AdaBoostClassifier(random_state=42)),
])

In [None]:
# Exclude bookings that list no guests at all. This is done on the full frame,
# before separating features and target, so both keep exactly the same rows.
# (Named `no_guests_mask` rather than `filter` to avoid overwriting the builtin
# `filter` for the rest of the notebook session.)
no_guests_mask = (
    (hotel_bookings_train_data['children'] == 0)
    & (hotel_bookings_train_data['adults'] == 0)
    & (hotel_bookings_train_data['babies'] == 0)
)
df_filtered = hotel_bookings_train_data[~no_guests_mask]

# Now split into X and y
X = df_filtered.drop('is_canceled', axis=1)
y = df_filtered['is_canceled']


In [None]:

    # Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.30,
        random_state=42,
    )

In [None]:
# List the feature columns that go into the pipeline (target already removed).
X.columns

Index(['booking_id', 'hotel', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'arrival_date',
       'booking_date'],
      dtype='object')

In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
model_pipe.fit(X_train, y_train)

In [None]:
# Score the held-out split: hard class labels and per-class scores.
y_pred = model_pipe.predict(X_test)
y_prob = model_pipe.predict_proba(X_test)  # 2-D array of class probabilities; column 1 is used for ROC AUC below


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score


In [None]:
# Evaluate on the held-out split; each metric is computed and reported in turn.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC AUC needs a continuous score: use the second predict_proba column.
roc_auc = roc_auc_score(y_test, y_prob[:, 1])
print("ROC AUC Score:", roc_auc)


Accuracy: 0.8332533386035743
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88     22184
           1       0.87      0.66      0.75     13235

    accuracy                           0.83     35419
   macro avg       0.84      0.80      0.81     35419
weighted avg       0.84      0.83      0.83     35419

Confusion Matrix:
 [[20835  1349]
 [ 4557  8678]]
ROC AUC Score: 0.8791529793541832


In [None]:
import joblib

# Destination for the serialized pipeline on the mounted Drive.
file_path_model = '/content/drive/My Drive/BT4301 Deployment/model_pipe.joblib'
# Persist the fitted pipeline (preprocessor + classifier) in one file.
# NOTE(review): the pipeline contains a CustomPreprocessor instance, so whatever
# loads this file presumably needs that class importable under the same module
# path it had here - confirm against the deployment code.
joblib.dump(model_pipe, file_path_model)


['/content/drive/My Drive/BT4301 Deployment/model_pipe.joblib']