Advantages of This Approach
Flexibility: Python and its libraries offer great flexibility for data manipulation and analysis.
All-in-one: Your entire workflow, from data preparation to model training, can be done within a single environment.
Scalability: While Pandas is in-memory and suitable for small to medium-sized datasets, for larger datasets, you can integrate with tools like Dask or use cloud-based solutions.
This approach provides a seamless pipeline from data loading to model deployment, all within the Python ecosystem, making it efficient and highly customizable to your specific needs.

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv('INNHotelsGroup.csv')

In [9]:
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [10]:
# Import findspark and initialize. 
import findspark
findspark.init()

# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [11]:
# Import CSV file utilizing PySpark
path = 'INNHotelsGroup.csv'
spark.sparkContext.addFile(path)
df = spark.read.csv(path, header=True, sep=',')
df.show()

23/11/20 16:41:33 WARN SparkContext: The path INNHotelsGroup.csv has been added already. Overwriting of added paths is not supported in the current version.


+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+------------------+----------------------+--------------+
|Booking_ID|no_of_adults|no_of_children|no_of_weekend_nights|no_of_week_nights|type_of_meal_plan|required_car_parking_space|room_type_reserved|lead_time|arrival_year|arrival_month|arrival_date|market_segment_type|repeated_guest|no_of_previous_cancellations|no_of_previous_bookings_not_canceled|avg_price_per_room|no_of_special_requests|booking_status|
+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+--

In [12]:
pd_df = df.toPandas()

# Data Cleaning
meal_plan_dummies = pd.get_dummies(pd_df["type_of_meal_plan"])
meal_plan_dummies.head()

Unnamed: 0,Meal Plan 1,Meal Plan 2,Meal Plan 3,Not Selected
0,1,0,0,0
1,0,0,0,1
2,1,0,0,0
3,1,0,0,0
4,0,0,0,1


In [13]:
room_type_dummies = pd.get_dummies(pd_df["room_type_reserved"])
room_type_dummies.head()

Unnamed: 0,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [14]:
cleaned_df = pd.concat([pd_df, meal_plan_dummies, room_type_dummies], axis=1)
cleaned_df = cleaned_df.drop(columns=["type_of_meal_plan", "room_type_reserved"])
cleaned_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,INN00001,2,0,1,2,0,224,2017,10,2,...,0,0,0,1,0,0,0,0,0,0
1,INN00002,2,0,2,3,0,5,2018,11,6,...,0,0,1,1,0,0,0,0,0,0
2,INN00003,1,0,2,1,0,1,2018,2,28,...,0,0,0,1,0,0,0,0,0,0
3,INN00004,2,0,0,2,0,211,2018,5,20,...,0,0,0,1,0,0,0,0,0,0
4,INN00005,2,0,1,1,0,48,2018,4,11,...,0,0,1,1,0,0,0,0,0,0


In [15]:
# Replace string values with boolean values to make data easier to use
def encode_market(market):
    if market == "Online":
        return 1
    else:
        return 0
# Call the encode_market function on the market column
cleaned_df["market_segment_type"] = cleaned_df["market_segment_type"].apply(encode_market)
cleaned_df.head()

def encode_cancel(cancel):
    if cancel == "Canceled":
        return 1
    else:
        return 0
# Call the encode_cancel function on the cancel column
cleaned_df["booking_status"] = cleaned_df["booking_status"].apply(encode_cancel)
cleaned_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,INN00001,2,0,1,2,0,224,2017,10,2,...,0,0,0,1,0,0,0,0,0,0
1,INN00002,2,0,2,3,0,5,2018,11,6,...,0,0,1,1,0,0,0,0,0,0
2,INN00003,1,0,2,1,0,1,2018,2,28,...,0,0,0,1,0,0,0,0,0,0
3,INN00004,2,0,0,2,0,211,2018,5,20,...,0,0,0,1,0,0,0,0,0,0
4,INN00005,2,0,1,1,0,48,2018,4,11,...,0,0,1,1,0,0,0,0,0,0


In [16]:
# Set index of cleaned_df
cleaned_df.set_index("Booking_ID", inplace=True)
cleaned_df.head()


Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INN00001,2,0,1,2,0,224,2017,10,2,0,...,0,0,0,1,0,0,0,0,0,0
INN00002,2,0,2,3,0,5,2018,11,6,1,...,0,0,1,1,0,0,0,0,0,0
INN00003,1,0,2,1,0,1,2018,2,28,1,...,0,0,0,1,0,0,0,0,0,0
INN00004,2,0,0,2,0,211,2018,5,20,1,...,0,0,0,1,0,0,0,0,0,0
INN00005,2,0,1,1,0,48,2018,4,11,1,...,0,0,1,1,0,0,0,0,0,0


In [17]:
# Import dependencies for Machine Learning Model
from sklearn.model_selection import train_test_split

In [18]:
# Split our preprocessed data into our features and target arrays
y = cleaned_df['booking_status'].values
X = cleaned_df.drop(columns='booking_status').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)
lr_model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Make a prediction using the testing data
testing_predictions = lr_model.predict(X_test)
testing_predictions

array([0, 0, 0, ..., 1, 0, 0])

In [24]:
# Print the balanced_accuracy score of the model
balanced_accuracy = balanced_accuracy_score(y_test, testing_predictions)
print(f"Balanced Accuracy Score : {balanced_accuracy}")

Balanced Accuracy Score : 0.7423860408009632


In [25]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, testing_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5487,624
Actual 1,1222,1736


In [26]:
# Print the classification report for the model
print("Classification Report")
print(classification_report(y_test, testing_predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      6111
           1       0.74      0.59      0.65      2958

    accuracy                           0.80      9069
   macro avg       0.78      0.74      0.75      9069
weighted avg       0.79      0.80      0.79      9069

