In [6]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.1'
spark_version = 'spark-3.0.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BlueSkyHotelReviews").getOrCreate()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn0% [Release.gpg gpgv 697 B] [Connecting to archive.ubuntu.com] [Connecting to s                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/comp

In [11]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.ml.feature import StopWordsRemover

In [8]:
# Load the hotel review CSV into a Spark DataFrame, using the first row as column names
dataframe = (
    spark.read
    .option("header", True)
    .csv("/content/hotel_reviews.csv")
)

In [9]:
# Preview the raw reviews with full (untruncated) cell contents
dataframe.show(truncate=False)

+------------------------------------+------+---------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|hotel_name                          |rating|title                                        |content                                                                                                                                                                                                                                                                                                                                                  

In [19]:
# Build a tokenizer that reads the "content" column and writes "tokenized_reviews"
tokenizer = Tokenizer(
    inputCol="content",
    outputCol="tokenized_reviews",
)
tokenizer

Tokenizer_07c7dff61e30

In [20]:
# Apply the tokenizer to the reviews and preview the added token column
tokenized = tokenizer.transform(dataframe)
tokenized.show(truncate=True)

+--------------------+------+--------------------+--------------------+--------------------+
|          hotel_name|rating|               title|             content|   tokenized_reviews|
+--------------------+------+--------------------+--------------------+--------------------+
|Rancho Valencia R...|     5|Best romantic vac...|Our experience at...|[our, experience,...|
|Rancho Valencia R...|     5|Sweet sweet serenity|Amazing place. Ev...|[amazing, place.,...|
|Rancho Valencia R...|     5|Amazing Property ...|We booked a 3 nig...|[we, booked, a, 3...|
| Aloft Arundel Mills|     2|Never again...bew...|Currently in bed ...|[currently, in, b...|
| Aloft Arundel Mills|     5|ALWAYS GREAT STAY...|I live in Md and ...|[i, live, in, md,...|
| Aloft Arundel Mills|     5|      Wonderful stay|I stayed here wit...|[i, stayed, here,...|
| Aloft Arundel Mills|     5|     Worth the money|Beautiful rooms a...|[beautiful, rooms...|
| Aloft Arundel Mills|     5|Great Hotel Exper...|We stayed here wh...

In [26]:
# Create review_length() UDF
def review_length(review):
  """Return the number of items in `review` (e.g. the token count of a tokenized review)."""
  token_count = len(review)
  return token_count

def unique_items(review):
  """Return how many distinct items `review` contains (duplicates are counted once)."""
  distinct_tokens = {token for token in review}
  return len(distinct_tokens)

In [27]:
# Wrap the plain-Python helpers as Spark UDFs that return integers.
# Each UDF deliberately reuses its helper's name, so re-running this cell used to
# wrap an already-wrapped UDF, which then fails when Spark executes it. PySpark's UDF
# wrapper keeps the original callable on its `func` attribute; unwrapping through it
# (and falling back to the plain function on the first run) makes the cell idempotent.
review_length = udf(getattr(review_length, "func", review_length), IntegerType())

unique_items = udf(getattr(unique_items, "func", unique_items), IntegerType())

In [29]:
# Keep the review text columns, add the total and distinct token counts per review,
# and print the rows without truncating cell contents
(
    tokenized
    .select("title", "content", "tokenized_reviews")
    .withColumn("tokens", review_length(col("tokenized_reviews")))
    .withColumn("unique_tokens", unique_items(col("tokenized_reviews")))
    .show(truncate=False)
)

+---------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [35]:
# Count tokens per review, then average that count over all reviews
(
    tokenized
    .select("title", "content", "tokenized_reviews")
    .withColumn("tokens", review_length(col("tokenized_reviews")))
    .agg({"tokens": "avg"})
    .show(truncate=False)
)

+-----------+
|avg(tokens)|
+-----------+
|59.0689    |
+-----------+



In [36]:
# Average rating per hotel
# NOTE(review): the CSV is read without schema inference, so "rating" is presumably a
# string column that Spark casts for avg — confirm, or cast explicitly when loading.
(
    dataframe
    .groupBy("hotel_name")
    .agg({"rating": "avg"})
    .show()
)

+--------------------+------------------+
|          hotel_name|       avg(rating)|
+--------------------+------------------+
|Fairfield Inn Phi...|               4.2|
|    Beach Comber Inn|               5.0|
|Atlantis Oceanfro...|               4.5|
|Hampton Inn Suite...|               5.0|
|Extended Stay Ame...|               3.0|
|Comfort Suites Co...|               4.4|
|         Sound Board|             4.625|
|    Super 8 Anderson|               5.0|
|Chart House Suite...|               4.5|
|Budget Inn of Ame...|               4.0|
|Holiday Inn Expre...|               3.4|
|          Hotel Rose| 4.333333333333333|
|     Motel 6 Ardmore|               4.0|
|Magnuson Hotel Na...|               1.0|
|Hampton Inn Suite...|               4.0|
|Holiday Inn Expre...|2.3333333333333335|
|Alicia's Eagle Ro...|               3.0|
|Hampton Inn Roano...|               4.0|
|The Siena Hotel, ...|               5.0|
|Best Western Mt. ...|               5.0|
+--------------------+------------

In [38]:
# Stop-word filter: reads "tokenized_reviews" and writes "stopword_filtered"
remover = StopWordsRemover(
    inputCol="tokenized_reviews",
    outputCol="stopword_filtered",
)

# Preview the filtered tokens (the result is only displayed here, not stored)
remover.transform(tokenized).show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|          hotel_name|rating|               title|             content|   tokenized_reviews|   stopword_filtered|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|Rancho Valencia R...|     5|Best romantic vac...|Our experience at...|[our, experience,...|[experience, ranc...|
|Rancho Valencia R...|     5|Sweet sweet serenity|Amazing place. Ev...|[amazing, place.,...|[amazing, place.,...|
|Rancho Valencia R...|     5|Amazing Property ...|We booked a 3 nig...|[we, booked, a, 3...|[booked, 3, night...|
| Aloft Arundel Mills|     2|Never again...bew...|Currently in bed ...|[currently, in, b...|[currently, bed, ...|
| Aloft Arundel Mills|     5|ALWAYS GREAT STAY...|I live in Md and ...|[i, live, in, md,...|[live, md, aloft,...|
| Aloft Arundel Mills|     5|      Wonderful stay|I stayed here wit...|[i, stayed, here,

In [39]:
# Hash each review's tokens into a term-frequency vector with 2**4 = 16 buckets.
# NOTE(review): this reads "tokenized_reviews", not the stop-word-filtered column — confirm intent.
hashing = HashingTF(
    inputCol="tokenized_reviews",
    outputCol="hashedValues",
    numFeatures=2 ** 4,
)

# Add the "hashedValues" column to the tokenized reviews
hashed_df = hashing.transform(tokenized)

In [40]:
# Show the hashed term-frequency vectors with full (untruncated) cell contents
hashed_df.show(truncate=False)

+------------------------------------+------+---------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [42]:
# Fit an IDF model on the hashed term frequencies, then rescale them into "features"
idf = IDF(
    inputCol="hashedValues",
    outputCol="features",
)
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)


In [43]:
# Show each review's text next to its TF-IDF feature vector, untruncated
rescaledData.select(["content", "features"]).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|content                                                                                                                                                              