In [1]:
!pip install pycaret



# Import

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import plotly.express as px
from sklearn.impute import SimpleImputer
import plotly.subplots as sp
import plotly.graph_objs as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from datetime import datetime
from pycaret.classification import setup, compare_models

# Data Preprocessing

In [4]:
def Data_Preprocessing(filepath):
    """
    Reads and preprocesses the dataset from the given CSV file.

    Parameters:
    filepath (str): Path to the CSV file.

    Returns:
    pd.DataFrame: Cleaned and preprocessed DataFrame.
    """
    # Read CSV file
    df = pd.read_csv(filepath)

    # Select relevant columns
    df = df[['Booking_ID', 'number of adults', 'number of children',
             'number of weekend nights', 'number of week nights', 'type of meal',
             'car parking space', 'room type', 'lead time', 'market segment type',
             'repeated', 'P-C', 'P-not-C', 'average price ', 'special requests',
             'date of reservation', 'booking status']]

    df["booking status"] = df["booking status"].map({"Canceled": 1, "Not_Canceled": 0})

    # Convert 'date of reservation' to datetime format
    df['date of reservation'] = pd.to_datetime(df['date of reservation'], errors='coerce')

    # Extract day, month, and year into separate columns
    df['reservation_day'] = df['date of reservation'].dt.day
    df['reservation_month'] = df['date of reservation'].dt.month
    df['reservation_year'] = df['date of reservation'].dt.year

    # Drop the original 'date of reservation' column
    df.drop(columns=['date of reservation'], inplace=True)

    # Handle missing values
    if df.isnull().sum().sum() > 0:
        imputer = SimpleImputer(strategy='mean')  # Replace missing values with column mean
        numeric_columns = df.select_dtypes(include='number').columns
        df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

    # Remove duplicate values
    if df.duplicated().sum() > 0:
        df.drop_duplicates(inplace=True)


    return df

# Removing white spaces in column names
    df.columns = df.columns.str.strip()



# Load and preprocess data
df = Data_Preprocessing('/content/first inten project.csv')
df.head()


Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,booking status,reservation_day,reservation_month,reservation_year
0,INN00001,1.0,1.0,2.0,5.0,Meal Plan 1,0.0,Room_Type 1,224.0,Offline,0.0,0.0,0.0,88.0,0.0,0.0,2.0,10.0,2015.0
1,INN00002,1.0,0.0,1.0,3.0,Not Selected,0.0,Room_Type 1,5.0,Online,0.0,0.0,0.0,106.68,1.0,0.0,6.0,11.0,2018.0
2,INN00003,2.0,1.0,1.0,3.0,Meal Plan 1,0.0,Room_Type 1,1.0,Online,0.0,0.0,0.0,50.0,0.0,1.0,28.0,2.0,2018.0
3,INN00004,1.0,0.0,0.0,2.0,Meal Plan 1,0.0,Room_Type 1,211.0,Online,0.0,0.0,0.0,100.0,1.0,1.0,20.0,5.0,2017.0
4,INN00005,1.0,0.0,1.0,2.0,Not Selected,0.0,Room_Type 1,48.0,Online,0.0,0.0,0.0,77.0,0.0,1.0,11.0,4.0,2018.0


In [5]:
print(df.dtypes)

Booking_ID                   object
number of adults            float64
number of children          float64
number of weekend nights    float64
number of week nights       float64
type of meal                 object
car parking space           float64
room type                    object
lead time                   float64
market segment type          object
repeated                    float64
P-C                         float64
P-not-C                     float64
average price               float64
special requests            float64
booking status              float64
reservation_day             float64
reservation_month           float64
reservation_year            float64
dtype: object


In [6]:
def change_column_types(df):
    """
    دالة لتحويل أنواع الأعمدة إلى الصيغة الصحيحة.
    """
    columns_types = {
        "number of adults": int,
        "number of children": int,
        "number of weekend nights": int,
        "number of week nights": int,
        "car parking space": int,
        "lead time": int,
        "repeated": int,
        "special requests": int,
        "reservation_day": int,
        "reservation_month": int,
        "reservation_year": int,
        "P-C": float,
        "P-not-C": float
    }

    # تحويل الأعمدة للأنواع المطلوبة
    for col, dtype in columns_types.items():
        try:
            df[col] = df[col].astype(dtype)
        except Exception as e:
            print(f"❌ خطأ في تحويل العمود {col}: {e}")

    print("✅ تم تحويل أنواع البيانات بنجاح!")
    return df
change_column_types(df)
print(df.dtypes)

✅ تم تحويل أنواع البيانات بنجاح!
Booking_ID                   object
number of adults              int64
number of children            int64
number of weekend nights      int64
number of week nights         int64
type of meal                 object
car parking space             int64
room type                    object
lead time                     int64
market segment type          object
repeated                      int64
P-C                         float64
P-not-C                     float64
average price               float64
special requests              int64
booking status              float64
reservation_day               int64
reservation_month             int64
reservation_year              int64
dtype: object


# Check And Handel The Outliers Using (EX : IQR OR Z SCORE )

In [7]:
from scipy import stats
import numpy as np

# حساب Z-score لجميع الأعمدة العددية
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['number'])))

# تحديد القيم الشاذة حيث Z-score > 3
outliers_z = (z_scores > 3)

# حذف القيم الشاذة
df_clean_z = df[(~outliers_z).all(axis=1)]
print("Dataset after removing outliers using Z-score:\n", df_clean_z)


Dataset after removing outliers using Z-score:
       Booking_ID  number of adults  number of children  \
1       INN00002                 1                   0   
2       INN00003                 2                   1   
3       INN00004                 1                   0   
4       INN00005                 1                   0   
6       INN00007                 1                   1   
...          ...               ...                 ...   
36278   INN36279                 2                   0   
36279   INN36281                 2                   0   
36281   INN36283                 2                   0   
36282   INN36284                 2                   0   
36283   INN36285                 3                   0   

       number of weekend nights  number of week nights  type of meal  \
1                             1                      3  Not Selected   
2                             1                      3   Meal Plan 1   
3                             0        

In [8]:
from scipy import stats
import numpy as np

# نسخ البيانات الأصلية للعمل عليها
df_filled_z1 = df.copy()

# تطبيق Z-score على جميع الأعمدة العددية
for col in df_filled_z1.select_dtypes(include=['number']).columns:
    z_scores = np.abs(stats.zscore(df_filled_z1[col]))  # حساب Z-score
    median_value = df_filled_z1[col].median()  # حساب الميديان للعمود

    # استبدال القيم الشاذة بالقيمة المتوسطة
    df_filled_z1[col] = np.where(z_scores > 3, median_value, df_filled_z1[col])

print("Dataset after replacing outliers with median using Z-score:\n", df_filled_z1)


Dataset after replacing outliers with median using Z-score:
       Booking_ID  number of adults  number of children  \
0       INN00001               1.0                 1.0   
1       INN00002               1.0                 0.0   
2       INN00003               2.0                 1.0   
3       INN00004               1.0                 0.0   
4       INN00005               1.0                 0.0   
...          ...               ...                 ...   
36280   INN36282               2.0                 0.0   
36281   INN36283               2.0                 0.0   
36282   INN36284               2.0                 0.0   
36283   INN36285               3.0                 0.0   
36284   INN36286               2.0                 0.0   

       number of weekend nights  number of week nights  type of meal  \
0                           2.0                    5.0   Meal Plan 1   
1                           1.0                    3.0  Not Selected   
2                         

In [9]:
df_filled_z = df.copy()

for col in df_filled_z.select_dtypes(include=['number']).columns:
    z_scores = np.abs(stats.zscore(df_filled_z[col]))  # حساب Z-score
    mean_value = df_filled_z[col].mean()  # حساب المتوسط
    std_dev = df_filled_z[col].std()  # حساب الانحراف المعياري

    # تحديد الحدود المقبولة
    lower_bound = mean_value - 3 * std_dev
    upper_bound = mean_value + 3 * std_dev

    # استبدال القيم الشاذة بأقرب حد مقبول
    df_filled_z[col] = np.where(df_filled_z[col] > upper_bound, upper_bound, df_filled_z[col])
    df_filled_z[col] = np.where(df_filled_z[col] < lower_bound, lower_bound, df_filled_z[col])

print("Dataset after capping outliers at threshold using Z-score:\n", df_filled_z)


Dataset after capping outliers at threshold using Z-score:
       Booking_ID  number of adults  number of children  \
0       INN00001               1.0                 1.0   
1       INN00002               1.0                 0.0   
2       INN00003               2.0                 1.0   
3       INN00004               1.0                 0.0   
4       INN00005               1.0                 0.0   
...          ...               ...                 ...   
36280   INN36282               2.0                 0.0   
36281   INN36283               2.0                 0.0   
36282   INN36284               2.0                 0.0   
36283   INN36285               3.0                 0.0   
36284   INN36286               2.0                 0.0   

       number of weekend nights  number of week nights  type of meal  \
0                           2.0                    5.0   Meal Plan 1   
1                           1.0                    3.0  Not Selected   
2                          

#  Feature Engineering ( Feature Selection , Feature Extraction)

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Function to encode categorical features
def encode_categorical_features(df):
    encoder = LabelEncoder()
    df_encoded = df.copy()

    for col in df_encoded.select_dtypes(include='object'):
        df_encoded[col] = encoder.fit_transform(df_encoded[col].astype(str))

    return df_encoded

# Function to scale numerical features
def scale_numerical_features(df):
    scaler = StandardScaler()
    df_scaled = df.copy()

    for col in df_scaled.select_dtypes(include='number'):
        df_scaled[col] = scaler.fit_transform(df_scaled[col].values.reshape(-1, 1))

    return df_scaled



# Apply encoding, scaling, and PCA
df_encoded = encode_categorical_features(df_filled_z)
#f_scaled = scale_numerical_features(df_encoded)


print("Original DataFrame:")
print(df)
print("\nEncoded DataFrame:")
print(df_encoded)





Original DataFrame:
      Booking_ID  number of adults  number of children  \
0       INN00001                 1                   1   
1       INN00002                 1                   0   
2       INN00003                 2                   1   
3       INN00004                 1                   0   
4       INN00005                 1                   0   
...          ...               ...                 ...   
36280   INN36282                 2                   0   
36281   INN36283                 2                   0   
36282   INN36284                 2                   0   
36283   INN36285                 3                   0   
36284   INN36286                 2                   0   

       number of weekend nights  number of week nights  type of meal  \
0                             2                      5   Meal Plan 1   
1                             1                      3  Not Selected   
2                             1                      3   Meal Plan 

In [11]:
from sklearn.feature_selection import VarianceThreshold

# إنشاء كائن VarianceThreshold مع حد معين (مثلاً 0.01)
selector = VarianceThreshold(threshold=0.01)

# تطبيق الاختيار على البيانات
X_var_selected = selector.fit_transform(df_encoded)

# استرجاع أسماء الميزات التي تم الاحتفاظ بها
selected_features = df_encoded.columns[selector.get_support()]

# عرض الميزات المختارة
print("Selected Features:", selected_features)

Selected Features: Index(['Booking_ID', 'number of adults', 'number of children',
       'number of weekend nights', 'number of week nights', 'type of meal',
       'room type', 'lead time', 'market segment type', 'P-C', 'P-not-C',
       'average price ', 'special requests', 'booking status',
       'reservation_day', 'reservation_month', 'reservation_year'],
      dtype='object')


# Train Test Split Modeling And Accuracy Calculation


In [12]:
from pycaret.classification import setup, compare_models
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

# ✅ تحديد العمود المستهدف
target_col = "booking status"

# ✅ تطبيق اختيار الميزات بناءً على التباين
selector = VarianceThreshold(threshold=0.01)
X_filtered = selector.fit_transform(df_encoded.drop(columns=[target_col]))  # إزالة العمود المستهدف

# ✅ استرجاع أسماء الميزات التي تم الاحتفاظ بها
selected_feature_names = df_encoded.drop(columns=[target_col]).columns[selector.get_support()]

# ✅ إنشاء DataFrame جديد بعد تصفية الميزات منخفضة التباين
df_filtered = pd.DataFrame(X_filtered, columns=selected_feature_names)

# ✅ إضافة العمود المستهدف مرة أخرى بعد التصفية
df_filtered[target_col] = df_encoded[target_col]

# ✅ فصل الميزات (X) والمتغير المستهدف (y)
X = df_filtered.drop(columns=[target_col])
y = df_filtered[target_col]

# ✅ التحقق من توزيع الفئات قبل SMOTE
print("🔹 توزيع الفئات قبل SMOTE:", Counter(y))

# ✅ تقسيم البيانات إلى تدريب واختبار
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ✅ تطبيق SMOTE على بيانات التدريب
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# ✅ التحقق من توزيع الفئات بعد SMOTE
print("✅ توزيع الفئات بعد SMOTE:", Counter(y_train_resampled))

# ✅ تحويل البيانات إلى DataFrame مرة أخرى
df_resampled = pd.concat([pd.DataFrame(X_train_resampled, columns=X.columns), pd.DataFrame(y_train_resampled, columns=[target_col])], axis=1)

# ✅ تشغيل PyCaret على البيانات النهائية
clf_setup = setup(data=df_resampled,
                  target=target_col,
                  train_size=0.8,
                  data_split_stratify=False,
                  verbose=True)

# 🔥 مقارنة النماذج واختيار الأفضل
best_model = compare_models()


🔹 توزيع الفئات قبل SMOTE: Counter({0.0: 24396, 1.0: 11889})
✅ توزيع الفئات بعد SMOTE: Counter({0.0: 19517, 1.0: 19517})


Unnamed: 0,Description,Value
0,Session id,7506
1,Target,booking status
2,Target type,Binary
3,Original data shape,"(39034, 17)"
4,Transformed data shape,"(39034, 17)"
5,Transformed train set shape,"(31227, 17)"
6,Transformed test set shape,"(7807, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9188,0.9733,0.8992,0.9352,0.9168,0.8375,0.8381,4.735
et,Extra Trees Classifier,0.9186,0.9713,0.907,0.9279,0.9173,0.8371,0.8374,3.092
xgboost,Extreme Gradient Boosting,0.9136,0.9726,0.8961,0.928,0.9117,0.8272,0.8277,0.604
lightgbm,Light Gradient Boosting Machine,0.9095,0.9717,0.8925,0.9233,0.9076,0.819,0.8195,1.743
dt,Decision Tree Classifier,0.8798,0.8799,0.8806,0.8784,0.8795,0.7597,0.7597,0.223
gbc,Gradient Boosting Classifier,0.879,0.9527,0.8601,0.8929,0.8762,0.7579,0.7584,5.557
ada,Ada Boost Classifier,0.8566,0.9302,0.8554,0.8565,0.8559,0.7132,0.7133,1.362
lda,Linear Discriminant Analysis,0.7804,0.8561,0.7849,0.7766,0.7807,0.5609,0.561,0.077
ridge,Ridge Classifier,0.7803,0.8563,0.7847,0.7765,0.7806,0.5606,0.5607,0.059
lr,Logistic Regression,0.7776,0.8542,0.7783,0.7758,0.777,0.5552,0.5553,2.82


Processing:   0%|          | 0/65 [00:00<?, ?it/s]



### **Quick Analysis of the Results:**  
1. **Best Performing Models:**  
   - **Random Forest (RF)**: Highest **accuracy (91.88%)** and best **AUC (0.9733)** with an **F1-score of 91.68%**.  
   - **Extra Trees (ET)**: Very close performance to RF but faster in execution.  
   - **XGBoost** and **LightGBM** provide a good balance between performance and speed.  

2. **Poorly Performing Models:**  
   - **SVM, Naive Bayes, and QDA** show lower performance. QDA and NB have high **recall** but low accuracy, which could be an issue if you aim to reduce false predictions for a particular class.  

3. **Execution Time (TT):**  
   - Fastest model: **Linear Discriminant Analysis (LDA) - 0.077 seconds**.  
   - Slowest model: **Gradient Boosting (GBC) - 5.557 seconds**.  

