In [None]:
# Environment bootstrap: findspark.init() is called before the pyspark import below.
# NOTE(review): presumably this call is what makes the pyspark package importable in
# this environment — confirm before reordering these statements.
import findspark
findspark.init()

# Import packages
# NOTE(review): `time` is not referenced by any cell visible in this notebook.
from pyspark.sql import SparkSession
import time

# Create (or reuse, via getOrCreate) the SparkSession named "SparkSQL";
# the data-loading cell reads the CSV through this `spark` handle.
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [None]:
# Load the hotel bookings CSV into a Spark DataFrame
path = 'Resources/INNHotelsGroup.csv'
# Register the file with the SparkContext (kept from the original workflow;
# the read below uses `path` directly).
spark.sparkContext.addFile(path)
# inferSchema=True asks Spark to derive column types from the data (one extra pass
# over the file). Without it every column is read as a string, so numeric features
# would reach pandas / scikit-learn as text instead of numbers.
df = spark.read.csv(path, header=True, sep=',', inferSchema=True)
df.show()

In [None]:
# pandas is used from this cell onward (get_dummies, concat, DataFrame) but was never
# imported, so a fresh kernel failed here with NameError: name 'pd' is not defined.
import pandas as pd

# Collect the Spark DataFrame into a pandas DataFrame for the cleaning steps below
pd_df = df.toPandas()

# Data Cleaning
# One-hot encode the meal plan column: one indicator column per distinct value
meal_plan_dummies = pd.get_dummies(pd_df["type_of_meal_plan"])
meal_plan_dummies.head()

In [None]:
# One-hot encode the reserved room type: one indicator column per distinct value
room_type_column = pd_df["room_type_reserved"]
room_type_dummies = pd.get_dummies(room_type_column)
room_type_dummies.head()

In [None]:
# Append both sets of indicator columns to the original frame, then discard the
# two categorical source columns they replace.
encoded_source_columns = ["type_of_meal_plan", "room_type_reserved"]
cleaned_df = (
    pd.concat([pd_df, meal_plan_dummies, room_type_dummies], axis=1)
    .drop(columns=encoded_source_columns)
)
cleaned_df.head()

In [None]:
# Replace string values with 0/1 indicators to make the data easier to use
def encode_market(market):
    """Return 1 when the market segment is exactly "Online", otherwise 0."""
    return 1 if market == "Online" else 0
# Overwrite the market segment column with its 0/1 encoding (1 = "Online")
market_flags = cleaned_df["market_segment_type"].apply(encode_market)
cleaned_df["market_segment_type"] = market_flags
cleaned_df.head()

def encode_cancel(cancel):
    """Return 1 when the booking status is exactly "Canceled", otherwise 0."""
    return 1 if cancel == "Canceled" else 0
# Overwrite the booking status column with its 0/1 encoding (1 = "Canceled");
# this column is the prediction target for every model below.
cancel_flags = cleaned_df["booking_status"].apply(encode_cancel)
cleaned_df["booking_status"] = cancel_flags
cleaned_df.head()

In [None]:
# Move the booking identifier into the row index so it is excluded from the
# feature values extracted later.
cleaned_df = cleaned_df.set_index("Booking_ID")
cleaned_df.head()

In [None]:
# Import dependencies for Machine Learning Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [None]:
# Split our preprocessed data into our features and target arrays
y = cleaned_df['booking_status'].values
X = cleaned_df.drop(columns='booking_status').values

# Split the preprocessed data into a training and testing dataset.
# random_state pins the shuffle so the split (and every metric computed from it) is
# reproducible across kernel restarts; 78 matches the seed already used by the
# later train_test_split call in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Report the shape of each split; the feature count guides the network's input size
shape_report = [
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
]
print("\n".join(shape_report))

In [None]:
# TensorFlow

In [None]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied to both the training and testing splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Network architecture: the input width equals the number of feature columns,
# followed by one hidden ReLU layer and a single sigmoid output unit.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 7
#hidden_nodes_layer2 = 3

nn_model = tf.keras.models.Sequential(
    [
        # First hidden layer
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer (currently disabled)
        #tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer structure of the model
nn_model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training split for two epochs; the object
# returned by fit() is kept in fit_model.
fit_model = nn_model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=2,
)

In [None]:
# Score the trained network on the scaled test split and report loss and accuracy
evaluation = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Logistic Regression

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data.
# The original fitted on the raw, unscaled features while the other models in this
# notebook train on standardized features; unscaled inputs also make the default
# solver prone to convergence problems. Scaling is placed inside a pipeline so that
# lr_model.predict(X_test) applies the same standardization to raw test features
# without any change to the cells that call it.
lr_model = make_pipeline(StandardScaler(), logistic_regression_model).fit(X_train, y_train)
lr_model

In [None]:
# Predict a label for every row of the test features with the fitted model.
# The bare name on the last line makes the notebook display the predictions.
testing_predictions = lr_model.predict(X_test)
testing_predictions

In [None]:
# Compute and report the balanced accuracy of the logistic regression predictions
balanced_accuracy = balanced_accuracy_score(
    y_true=y_test,
    y_pred=testing_predictions,
)
print(f"Balanced Accuracy Score : {balanced_accuracy}")

In [None]:
# Build the confusion matrix for the logistic regression predictions and wrap it
# in a labelled DataFrame (rows = actual class, columns = predicted class).
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, testing_predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

print("Confusion Matrix")
display(cm_df)

In [None]:
# Build the classification report for the logistic regression predictions, then print it
lr_report = classification_report(y_test, testing_predictions)
print("Classification Report")
print(lr_report)

In [None]:
# Random Forest ML model

In [None]:
# Import dependencies for Machine Learning Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [None]:
# Separate the target array from the feature frame. X stays a DataFrame here so
# its column names remain available for the feature-importance cells.
y = cleaned_df['booking_status'].values
X = cleaned_df.drop("booking_status", axis=1)
# Not the last expression of the cell, so this preview is evaluated but not displayed
X.head()

# Seeded split of features and target into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Standardize the Random Forest inputs: fit the scaler on the training split,
# then transform the training and testing splits with it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [None]:
# Build a 500-tree random forest with a fixed seed and fit it on the scaled training split
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict labels for the scaled test split
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Confusion matrix for the random forest predictions, wrapped in a labelled
# DataFrame (rows = actual class, columns = predicted class).
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

# Accuracy of the random forest predictions on the test split
acc_score = accuracy_score(y_test, predictions)

In [None]:
# The fitted forest exposes one importance score per feature column
importances = rf_model.feature_importances_
# Pair each score with its column name and list them from most to least important
ranked_features = sorted(zip(importances, X.columns), reverse=True)
ranked_features

In [None]:
# Build a frame of (importance, feature name) pairs, index it by feature name
# (column 1) and label the score column (column 0).
importances_df = (
    pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)
# Sort ascending so the most important feature is drawn at the top of the horizontal bar chart
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(
    kind='barh',
    color='lightgreen',
    title= 'Features Importances',
    legend=False,
)

In [None]:
# Summarize the random forest evaluation: confusion matrix, accuracy, classification report
rf_report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(rf_report)