In [0]:
## PySpark Jupyter notebook (Python 3) on Azure Databricks
## Extract the most frequently used words in the reviews of each property listing

In [0]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
# Schema applied when loading the reviews data: every declared column is a
# nullable string. The `id` and `reviewer_id` (IntegerType) fields are not
# part of this schema.
reviews_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'location',
        'listing_id',
        'date',
        'reviewer_name',
        'comments',
    )
])

In [0]:
# Load the reviews parquet data into `reviews_df` using the explicit schema.
# Local alternative path: 'c:/sb/strfacts/cleandata'
cleandata = '/mnt/blob1/cleandata'
reviews_df = spark.read.schema(reviews_schema).parquet(cleandata)

In [0]:
# Preview the first five review rows.
reviews_df.show(n=5)

In [0]:
# Build one row per listing_id: all of its comments joined with single spaces
# into `all_comments`, plus one location value for the listing exposed as `area`.
query = """
    SELECT listing_id, CONCAT_WS(' ', collect_list(comments)) as all_comments, first(location) as area
    FROM reviews
    GROUP BY listing_id
    """

# The SQL above refers to the data frame through the temp view name "reviews".
reviews_df.createOrReplaceTempView("reviews")

tmpdf = spark.sql(query)
tmpdf.show(n=2)

In [0]:
# Tokenise the concatenated comments into one row per word.
# 1. Remove every character that is not a letter, digit, space or hyphen.
# 2. Split the remaining text on single spaces and explode to one token per row.
# 3. Drop empty tokens: consecutive spaces (for example where a punctuation
#    mark between two spaces was removed) make split() emit "" entries, which
#    would otherwise be counted as a very frequent "word".
cleaned_comments = F.regexp_replace(tmpdf.all_comments, "[^a-zA-Z0-9 -]", "")
df1 = (
    tmpdf
    .select(
        tmpdf.listing_id,
        tmpdf.area,
        F.explode(F.split(cleaned_comments, " ")).alias("words"),
    )
    .where(F.col("words") != "")
)

In [0]:
# Preview the first ten exploded word rows.
df1.show(n=10)

In [0]:
# Count how many times each word occurs per listing (and area); the count is
# stored in the `cnt` column.
df2 = (
    df1
    .groupBy("listing_id", "area", "words")
    .agg(F.count("words").alias("cnt"))
)

In [0]:
# Preview five of the per-listing word counts.
df2.show(n=5)

In [0]:
# Rank the words of each listing from most to least frequent.
# The count must be sorted in DESCENDING order so that row_number 1 is the
# most used word; with an ascending sort, keeping the lowest row numbers would
# select the least frequent words instead. `words` is a secondary sort key so
# that words with equal counts always receive the same rank on every run.
windowSpec = Window.partitionBy(df2.listing_id).orderBy(df2.cnt.desc(), df2.words)
df3 = df2.withColumn("row_number", F.row_number().over(windowSpec))
df3.show(3)

In [0]:
# Keep the five lowest-ranked rows (row_number 1..5) of every listing and
# collapse them into a single comma-separated string per listing.
#
# collect_list() gives no guarantee about element order after a shuffle, so the
# rank is collected together with the word and the array is sorted by
# row_number before the words are joined. This keeps the output string in rank
# order and identical between runs.
df4 = (
    df3
    .where(df3.row_number <= 5)
    .groupBy("listing_id", "area")
    .agg(
        # Structs sort field by field, i.e. by row_number first.
        F.sort_array(F.collect_list(F.struct("row_number", "words"))).alias("ranked")
    )
    .select(
        "listing_id",
        "area",
        # "ranked.words" extracts the `words` field of every struct in the array.
        F.array_join(F.col("ranked.words"), ",").alias("most_frequent_used_words"),
    )
)

In [0]:
# Preview ten listings with their joined word lists.
df4.show(n=10)

In [0]:
# Write the per-listing word summary as parquet, one partition directory per `area`.
# Append mode adds to whatever is already stored at the destination, so
# re-running this cell writes the rows again instead of replacing them.
# Local alternative path: 'c:/sb/strfacts/analyticsdata'
analyticsdata = '/mnt/blob1/analyticsdata'
(
    df4.write
    .mode("append")
    .partitionBy("area")
    .parquet(analyticsdata)
)