Advantages of This Approach
Flexibility: Python and its libraries offer great flexibility for data manipulation and analysis.
All-in-one: Your entire workflow, from data preparation to model training, can be done within a single environment.
Scalability: While Pandas is in-memory and suitable for small to medium-sized datasets, for larger datasets, you can integrate with tools like Dask or use cloud-based solutions.
This approach provides a seamless pipeline from data loading to model deployment, all within the Python ecosystem, making it efficient and highly customizable to your specific needs.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Resources/INNHotelsGroup.csv')

In [3]:
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [4]:
# Import findspark and initialize. 
import findspark
findspark.init()

# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

23/11/16 18:54:39 WARN Utils: Your hostname, Taits-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.19 instead (on interface en0)
23/11/16 18:54:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/16 18:54:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Import CSV file utilizing PySpark
path = 'Resources/INNHotelsGroup.csv'
spark.sparkContext.addFile(path)
df = spark.read.csv(path, header=True, sep=',')
df.show()

+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+------------------+----------------------+--------------+
|Booking_ID|no_of_adults|no_of_children|no_of_weekend_nights|no_of_week_nights|type_of_meal_plan|required_car_parking_space|room_type_reserved|lead_time|arrival_year|arrival_month|arrival_date|market_segment_type|repeated_guest|no_of_previous_cancellations|no_of_previous_bookings_not_canceled|avg_price_per_room|no_of_special_requests|booking_status|
+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+--

23/11/16 18:54:53 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [13]:
pd_df = df.toPandas()

# Data Cleaning
meal_plan_dummies = pd.get_dummies(pd_df["type_of_meal_plan"])
meal_plan_dummies.head()

Unnamed: 0,Meal Plan 1,Meal Plan 2,Meal Plan 3,Not Selected
0,1,0,0,0
1,0,0,0,1
2,1,0,0,0
3,1,0,0,0
4,0,0,0,1


In [14]:
room_type_dummies = pd.get_dummies(pd_df["room_type_reserved"])
room_type_dummies.head()

Unnamed: 0,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [15]:
cleaned_df = pd.concat([pd_df, meal_plan_dummies, room_type_dummies], axis=1)
cleaned_df = cleaned_df.drop(columns=["type_of_meal_plan", "room_type_reserved"])
cleaned_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,INN00001,2,0,1,2,0,224,2017,10,2,...,0,0,0,1,0,0,0,0,0,0
1,INN00002,2,0,2,3,0,5,2018,11,6,...,0,0,1,1,0,0,0,0,0,0
2,INN00003,1,0,2,1,0,1,2018,2,28,...,0,0,0,1,0,0,0,0,0,0
3,INN00004,2,0,0,2,0,211,2018,5,20,...,0,0,0,1,0,0,0,0,0,0
4,INN00005,2,0,1,1,0,48,2018,4,11,...,0,0,1,1,0,0,0,0,0,0


In [18]:
# Replace string values with boolean values to make data easier to use
def encode_market(market):
    if market == "Online":
        return 1
    else:
        return 0
# Call the encode_market function on the market column
cleaned_df["market_segment_type"] = cleaned_df["market_segment_type"].apply(encode_market)
cleaned_df.head()

def encode_cancel(cancel):
    if cancel == "Canceled":
        return 1
    else:
        return 0
# Call the encode_cancel function on the cancel column
cleaned_df["booking_status"] = cleaned_df["booking_status"].apply(encode_cancel)
cleaned_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
0,INN00001,2,0,1,2,0,224,2017,10,2,...,0,0,0,1,0,0,0,0,0,0
1,INN00002,2,0,2,3,0,5,2018,11,6,...,0,0,1,1,0,0,0,0,0,0
2,INN00003,1,0,2,1,0,1,2018,2,28,...,0,0,0,1,0,0,0,0,0,0
3,INN00004,2,0,0,2,0,211,2018,5,20,...,0,0,0,1,0,0,0,0,0,0
4,INN00005,2,0,1,1,0,48,2018,4,11,...,0,0,1,1,0,0,0,0,0,0


In [23]:
# Set index of cleaned_df
cleaned_df.set_index("Booking_ID", inplace=True)
cleaned_df.head()


Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,...,Meal Plan 2,Meal Plan 3,Not Selected,Room_Type 1,Room_Type 2,Room_Type 3,Room_Type 4,Room_Type 5,Room_Type 6,Room_Type 7
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INN00001,2,0,1,2,0,224,2017,10,2,0,...,0,0,0,1,0,0,0,0,0,0
INN00002,2,0,2,3,0,5,2018,11,6,0,...,0,0,1,1,0,0,0,0,0,0
INN00003,1,0,2,1,0,1,2018,2,28,0,...,0,0,0,1,0,0,0,0,0,0
INN00004,2,0,0,2,0,211,2018,5,20,0,...,0,0,0,1,0,0,0,0,0,0
INN00005,2,0,1,1,0,48,2018,4,11,0,...,0,0,1,1,0,0,0,0,0,0


In [24]:
# Import dependencies for Machine Learning Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [25]:
# Split our preprocessed data into our features and target arrays
y = cleaned_df['booking_status'].values
X = cleaned_df.drop(columns='booking_status').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [27]:
# Find shape of the data to determine best number nodes for the model
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (27206, 26)
X_test shape: (9069, 26)
y_train shape: (27206,)
y_test shape: (9069,)


In [28]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 7
#hidden_nodes_layer2 = 3

nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
#nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 7)                 189       
                                                                 
 dense_10 (Dense)            (None, 1)                 8         
                                                                 
Total params: 197 (788.00 Byte)
Trainable params: 197 (788.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [45]:
# Train the model
fit_model = nn_model.fit(X_train_scaled,y_train,epochs=2)

Epoch 1/2
Epoch 2/2


In [46]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

284/284 - 0s - loss: 0.0028 - accuracy: 1.0000 - 151ms/epoch - 533us/step
Loss: 0.002801917027682066, Accuracy: 1.0
