# Step 5 - PySpark & MLlib for Step 3

### Installation and Setup

Spark requires Java (version 8 or 11) to run.

In [None]:
%%capture 
# Install OpenJDK 11 from the conda-forge channel; %%capture hides the installer output.
%conda install -c conda-forge openjdk=11 -y

Configure Spark to use the Java version (11) installed in our environment.

In [1]:
import os
import shutil
from pathlib import Path


def find_java_home(java_executable=None):
    """Return the Java installation root for a `java` executable.

    The installation root is taken to be two directory levels above the
    executable (``<home>/bin/java`` -> ``<home>``).

    Args:
        java_executable: Path to a `java` binary. When None, the first
            `java` found on PATH is used.

    Returns:
        The installation root as a string, or None when no executable is
        given or found.
    """
    java_path = shutil.which("java") if java_executable is None else java_executable
    if not java_path:
        return None
    return str(Path(java_path).parent.parent)


java_home_path = find_java_home()
if java_home_path is None:
    # Do not overwrite JAVA_HOME with a meaningless value when java is missing.
    print("WARNING: no `java` executable found on PATH; JAVA_HOME was left unchanged.")
else:
    os.environ["JAVA_HOME"] = java_home_path
    print(f"JAVA_HOME is set to: {os.environ['JAVA_HOME']}")


JAVA_HOME is set to: /opt/anaconda3/envs/DS-101-Final


In [None]:
%%capture 
# Install the Python packages this notebook imports (pyspark, pandas, plotly).
# %pip installs into the running kernel's environment; %%capture hides the output.
%pip install pyspark pandas plotly

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lit, concat, first, array, udf, min as spark_min
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.feature import VectorAssembler, PCA
from pyspark.ml.clustering import BisectingKMeans
import plotly.express as px
import pandas as pd
import os

### Create Spark session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lit, concat, first, array, udf
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import BisectingKMeans
from pyspark.sql.functions import min as spark_min

# Stop a Spark session left over from a previous run of this cell, if any,
# before creating a new one.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` is not defined yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Stopping is best-effort, but report the problem instead of hiding it.
    print(f"Ignoring error while stopping the previous Spark session: {stop_error}")

spark = SparkSession.builder.appName("HotelClustering").getOrCreate()
print("Create Spark session")

25/03/12 13:00:01 WARN Utils: Your hostname, Yoavs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.122 instead (on interface en0)
25/03/12 13:00:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/12 13:00:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/12 13:00:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Create Spark session


### Data Loading and Filtering

- Load the CSV file.
- Select the top 150 hotels (by number of records).
- Select the top 40 checkin dates.

In [2]:
file_path = "../data/hotels_data_changed.csv"
TOP_N_HOTELS = 150  # hotels kept, ranked by number of records
TOP_N_DATES = 40    # check-in dates kept, ranked by number of records

df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

# Rank hotels by record count. The hotel name is a secondary sort key so that
# ties at the cut-off are resolved the same way on every evaluation.
top150_hotels = (
    df.groupBy("Hotel Name")
    .agg(count("*").alias("cnt"))
    .orderBy(col("cnt").desc(), col("Hotel Name").asc())
    .limit(TOP_N_HOTELS)
)

# Keep only the records of the selected hotels.
df_top150 = df.join(top150_hotels.select("Hotel Name"), on="Hotel Name", how="inner")

# Rank check-in dates by record count among the selected hotels. This frame is
# evaluated more than once (the join below and a later collect()), so the
# tie-breaker keeps both evaluations consistent.
top40_dates = (
    df_top150.groupBy("Checkin Date")
    .agg(count("*").alias("cnt"))
    .orderBy(col("cnt").desc(), col("Checkin Date").asc())
    .limit(TOP_N_DATES)
)

# Keep only the records of the selected check-in dates.
df_top150_dates = df_top150.join(top40_dates.select("Checkin Date"), on="Checkin Date", how="inner")

df_top150_dates.head()


Row(Checkin Date=datetime.date(2015, 8, 13), Hotel Name='The Peninsula New York', Snapshot ID=1, Snapshot Date=datetime.date(2015, 7, 17), Days=5, Original Price=4370, Discount Price=4240, Discount Code=1, Available Rooms=3, Hotel Stars=5, DayDiff=27, WeekDay='Thursday', DiscountDiff=130, DiscountPerc=2.9748283752860414)

### Convert to a 160-dimensional vector

In [3]:
# Lowest discount price for every (hotel, check-in date, discount code) triple.
min_price_per_key = (
    df_top150_dates
    .groupBy("Hotel Name", "Checkin Date", "Discount Code")
    .agg(spark_min("Discount Price").alias("minDiscountPrice"))
)

# Pivot key in the form "<check-in date>_<discount code>".
date_code_key = concat(col("Checkin Date"), lit("_"), col("Discount Code"))
df_grouped = min_price_per_key.withColumn("date_code", date_code_key)

# One row per hotel, one column per distinct date_code value.
pivoted_prices = (
    df_grouped
    .groupBy("Hotel Name")
    .pivot("date_code")
    .agg(first("minDiscountPrice"))
)

# Combinations with no price for a hotel are marked with -1.
df_pivot = pivoted_prices.fillna(-1)

df_pivot.head(3)

25/03/12 13:00:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(Hotel Name='Bentley Hotel', 2015-08-12_1=-1, 2015-08-12_2=-1, 2015-08-12_3=-1, 2015-08-12_4=-1, 2015-08-13_1=-1, 2015-08-13_2=-1, 2015-08-13_3=-1, 2015-08-13_4=-1, 2015-08-19_1=-1, 2015-08-19_2=-1, 2015-08-19_3=-1, 2015-08-19_4=-1, 2015-08-26_1=-1, 2015-08-26_2=-1, 2015-08-26_3=-1, 2015-08-26_4=-1, 2015-08-27_1=-1, 2015-08-27_2=-1, 2015-08-27_3=980, 2015-08-27_4=-1, 2015-08-28_1=-1, 2015-08-28_2=-1, 2015-08-28_3=-1, 2015-08-28_4=-1, 2015-09-09_1=-1, 2015-09-09_2=-1, 2015-09-09_3=-1, 2015-09-09_4=-1, 2015-09-10_1=-1, 2015-09-10_2=-1, 2015-09-10_3=-1, 2015-09-10_4=-1, 2015-09-11_1=-1, 2015-09-11_2=-1, 2015-09-11_3=-1, 2015-09-11_4=-1, 2015-09-16_1=-1, 2015-09-16_2=-1, 2015-09-16_3=-1, 2015-09-16_4=-1, 2015-09-17_1=-1, 2015-09-17_2=-1, 2015-09-17_3=-1, 2015-09-17_4=-1, 2015-09-18_1=-1, 2015-09-18_2=-1, 2015-09-18_3=-1, 2015-09-18_4=-1, 2015-09-30_1=-1, 2015-09-30_2=-1, 2015-09-30_3=-1, 2015-09-30_4=-1, 2015-10-01_1=-1, 2015-10-01_2=-1, 2015-10-01_3=-1, 2015-10-01_4=-1, 2015-10-02_1=-

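The pivot only creates a column for a date + discount-code combination that actually occurs in the data, so some of the 160 expected columns (40 dates × 4 discount codes) may be missing.
We therefore make sure that all 160 columns exist, filling any missing one with -1.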

In [4]:
# Selected check-in dates, collected to the driver.
top40_list = [row["Checkin Date"] for row in top40_dates.collect()]
discount_codes = [1, 2, 3, 4]

# Build the expected column names (format: "YYYY-MM-DD_1", etc.)
expected_cols = [f"{date}_{code}" for date in top40_list for code in discount_codes]

# Add any missing expected column (default -1) and put the columns in the
# desired order with a single select. Calling withColumn once per column in a
# loop adds one projection per call and can produce very large query plans.
existing_cols = df_pivot.columns
existing_col_names = set(existing_cols)
df_pivot = df_pivot.select(
    "Hotel Name",
    *[
        col_name if col_name in existing_col_names else lit(-1).alias(col_name)
        for col_name in expected_cols
    ],
)
df_pivot.head(3)

[Row(Hotel Name='Bentley Hotel', 2015-11-11_1=-1, 2015-11-11_2=-1, 2015-11-11_3=-1, 2015-11-11_4=-1, 2015-10-14_1=-1, 2015-10-14_2=-1, 2015-10-14_3=-1, 2015-10-14_4=-1, 2015-11-04_1=-1, 2015-11-04_2=-1, 2015-11-04_3=-1, 2015-11-04_4=-1, 2015-08-19_1=-1, 2015-08-19_2=-1, 2015-08-19_3=-1, 2015-08-19_4=-1, 2015-10-28_1=-1, 2015-10-28_2=-1, 2015-10-28_3=-1, 2015-10-28_4=-1, 2015-10-21_1=1405, 2015-10-21_2=1403, 2015-10-21_3=1394, 2015-10-21_4=1389, 2015-11-06_1=-1, 2015-11-06_2=-1, 2015-11-06_3=-1, 2015-11-06_4=-1, 2015-08-12_1=-1, 2015-08-12_2=-1, 2015-08-12_3=-1, 2015-08-12_4=-1, 2015-11-05_1=1184, 2015-11-05_2=1138, 2015-11-05_3=1133, 2015-11-05_4=1179, 2015-10-22_1=-1, 2015-10-22_2=-1, 2015-10-22_3=-1, 2015-10-22_4=-1, 2015-11-12_1=-1, 2015-11-12_2=-1, 2015-11-12_3=-1, 2015-11-12_4=-1, 2015-09-10_1=-1, 2015-09-10_2=-1, 2015-09-10_3=-1, 2015-09-10_4=-1, 2015-10-29_1=-1, 2015-10-29_2=-1, 2015-10-29_3=2076, 2015-10-29_4=-1, 2015-09-09_1=-1, 2015-09-09_2=-1, 2015-09-09_3=-1, 2015-09-09_4=-

### Normalization and Save to CSV

Normalize the 160 price columns row-by-row (scaling valid prices to a 0–100 range, leaving missing values as -1).

In [5]:
# Pack the expected price columns into one array column, so that each hotel's
# prices can be processed together as a single value.
price_columns = [col(name) for name in expected_cols]
df_pivot = df_pivot.withColumn("prices_array", array(*price_columns))

# Row-wise price normalization, later wrapped as a Spark UDF (-1 marks a missing price).
def normalize_prices(prices):
    """Scale one hotel's prices to integers in the range 0..100.

    Entries equal to -1 are treated as missing: they are ignored when the
    minimum and maximum are computed and are returned unchanged.

    Args:
        prices: Sequence of numeric prices, with -1 marking a missing price.

    Returns:
        A list the same length as `prices`. If no valid price exists, the
        input itself is returned. If all valid prices are equal, each of them
        maps to 0.
    """
    present = [value for value in prices if value != -1]
    if len(present) == 0:
        return prices

    lowest = min(present)
    highest = max(present)
    if lowest == highest:
        # No spread to scale over: every valid price becomes 0.
        return [-1 if value == -1 else 0 for value in prices]

    spread = highest - lowest
    return [
        -1 if value == -1 else int(round(((value - lowest) / spread) * 100))
        for value in prices
    ]

# Wrap the Python normalization function as a Spark UDF returning an array of integers.
normalize_udf = udf(normalize_prices, ArrayType(IntegerType()))

# Apply the normalization UDF to create a new column with normalized prices.
df_pivot = df_pivot.withColumn("norm_prices_array", normalize_udf("prices_array"))

# Replace the original price columns with the normalized values in one select.
# Calling withColumn once per price column in a loop adds one projection per
# call and can produce very large query plans. Column order is unchanged.
norm_index_by_col = {col_name: i for i, col_name in enumerate(expected_cols)}
df_pivot = df_pivot.select(
    *[
        col("norm_prices_array")[norm_index_by_col[name]].alias(name)
        if name in norm_index_by_col
        else name
        for name in df_pivot.columns
    ]
)

# Drop the helper array columns; only the hotel name and the price columns remain.
df_final = df_pivot.drop("prices_array", "norm_prices_array")


**Save to CSV**

In [6]:
# Output location for the normalized data; an existing output is overwritten.
pyspark_hotels_clustering_data = "../data/pyspark_hotels_clustering_data.csv"
(
    df_final.write
    .mode("overwrite")
    .option("header", "true")
    .csv(pyspark_hotels_clustering_data)
)


                                                                                

### Clustering

#### Read CSV and assemble features

In [7]:
from pyspark.ml.feature import VectorAssembler

# Location of the CSV folder written by the "Save to CSV" step.
pyspark_hotels_clustering_data = "../data/pyspark_hotels_clustering_data.csv"

# Load the data back; Spark reads every part file inside the folder.
csv_reader = spark.read.option("inferSchema", "true").option("header", "true")
df_loaded = csv_reader.csv(pyspark_hotels_clustering_data)

# Every column other than the hotel name is a feature.
feature_cols = [name for name in df_loaded.columns if name != "Hotel Name"]

# Pack the normalized price columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_features = assembler.transform(df_loaded)


#### Clustering with MLlib (BisectingKMeans)

In [8]:
from pyspark.ml.clustering import BisectingKMeans

# Number of clusters (adjust as needed).
N_CLUSTERS = 4
# Fixed seed so the algorithm's random choices repeat across kernel restarts;
# without it the default seed can change from one Python session to the next.
CLUSTERING_SEED = 42

bkmeans = BisectingKMeans(
    featuresCol="features",
    predictionCol="cluster",
    k=N_CLUSTERS,
    seed=CLUSTERING_SEED,
)

# Train the model
model = bkmeans.fit(df_features)

# Add the cluster assignments to the DataFrame
df_clustered = model.transform(df_features)

# Show the hotel names along with their cluster assignments
df_clustered.select("Hotel Name", "cluster").show(truncate=False)


25/03/12 13:00:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+-----------------------------------------------------+-------+
|Hotel Name                                           |cluster|
+-----------------------------------------------------+-------+
|Bentley Hotel                                        |0      |
|Westin New York at Times Square                      |3      |
|The Westin New York Grand Central                    |2      |
|Super 8 Brooklyn   Park Slope Hotel                  |0      |
|Four Seasons Hotel New York                          |3      |
|Omni Berkshire Place                                 |2      |
|DoubleTree by Hilton Metropolitan - New York City    |3      |
|Dumont NYC-an Affinia hotel                          |1      |
|Hampton Inn Manhattan Downtown-Financial District    |3      |
|Eventi Hotel a Kimpton Hotel                         |2      |
|Magnuson Convention Center Hotel                     |3      |
|Courtyard Newark Elizabeth                           |0      |
|Park Hyatt New York                    

#### Visualization with PCA and Plotly

Since the features are 160-dimensional, we use PCA to reduce them to 2 dimensions for visualization. Then, we convert the Spark DataFrame to a Pandas DataFrame and use Plotly Express to create a scatter plot.

In [9]:
from pyspark.ml.feature import PCA
# Project the feature vectors onto two principal components.
pca = PCA(inputCol="features", outputCol="pcaFeatures", k=2)
pca_model = pca.fit(df_clustered)
df_pca = pca_model.transform(df_clustered)

# Bring only the columns needed for plotting into pandas.
plot_columns = df_pca.select("Hotel Name", "cluster", "pcaFeatures")
pandas_df = plot_columns.toPandas()

# Split the two-element PCA vector into one plain column per component.
for component_index, component_name in enumerate(["pca1", "pca2"]):
    pandas_df[component_name] = pandas_df["pcaFeatures"].apply(
        lambda vector, i=component_index: vector[i]
    )


25/03/12 13:00:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [10]:
import plotly.express as px

# Plotly Express draws a numeric `color` column on a continuous colour scale.
# Cluster ids are labels, not quantities, so plot them as strings to get one
# discrete colour per cluster. `pandas_df` itself is left untouched.
plot_df = pandas_df.assign(cluster=pandas_df["cluster"].astype(str))

fig = px.scatter(
    plot_df,
    x="pca1",
    y="pca2",
    color="cluster",
    # Keep the legend in numeric cluster order.
    category_orders={"cluster": sorted(plot_df["cluster"].unique(), key=int)},
    hover_data=["Hotel Name"],
    title="Hotel Clusters Visualization (PCA Reduced)"
)
fig.show()


#### Stop the spark session

In [11]:
# Stop the Spark session created earlier in this notebook.
spark.stop()