In [0]:
## PySpark Jupyter Notebook with Python 3 on Azure Databricks
## Reads data from local storage, cleans / transforms it, then persists it in Parquet format, partitioned by location

In [0]:
import csv
import datetime
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Storage configuration for this notebook.
storageAccountName = 'strfactsblob'
# SECURITY: never commit the storage access key in the notebook. It is read from the
# STORAGE_ACCOUNT_ACCESS_KEY environment variable (None when the variable is not set).
# The key that was previously hard-coded here has been exposed and should be rotated.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is the usual
# alternative to an environment variable - confirm which one this workspace uses.
storageAccountAccessKey = os.environ.get('STORAGE_ACCOUNT_ACCESS_KEY')
ContainerName = 'bcontainer1'
# Alternative input directory (Windows-style path), kept for reference:
#indata = 'c:/sb/strfacts/indata'
indata = '/mnt/blob1/indata'

In [0]:
## Read all CSV files from the last quarter; folders are named in yyyy-mm format

In [0]:
# Take the current timestamp once and show its year and month, one per line.
today = datetime.datetime.now()
print(today.year, today.month, sep='\n')

In [0]:
# Free data is published quarterly (monthly data is a paid option); this notebook
# uses the free data, so it works out the folder names of the previous quarter.
# quarter_index is 0..3 for the current quarter Q1..Q4.
quarter_index = (today.month - 1) // 3
# Months of the quarter *before* the current one, indexed by the current quarter.
previous_quarter_months = [
    ['10', '11', '12'],
    ['01', '02', '03'],
    ['04', '05', '06'],
    ['07', '08', '09'],
][quarter_index]
# During Q1 the previous quarter belongs to the previous calendar year.
target_year = today.year - 1 if quarter_index == 0 else today.year
dirnames = ["{}-{}".format(target_year, month) for month in previous_quarter_months]

In [0]:
# Schema for the rows built from reviews.csv.gz: five nullable string columns.
# The 'id' and 'reviewer_id' fields (previously listed here as commented-out
# IntegerType fields) are not part of this schema.
review_columns = ('location', 'listing_id', 'date', 'reviewer_name', 'comments')
reviews_schema = StructType(
    [StructField(column, StringType(), True) for column in review_columns]
)

In [0]:
# Parse one line of a reviews CSV file.
def parse_csv(line:str, location):
    """Turn one text line of a reviews CSV file into a 5-tuple row.

    The line is parsed with the standard ``csv`` module, so a quoted field that
    contains commas (for example a reviewer name or a comment) stays in one
    column instead of shifting the columns that follow it.

    Args:
        line: One raw text line of the CSV file (without the header).
        location: Value stored in the first element of the returned tuple.

    Returns:
        ``(location, field 0, field 2, field 4, comments)`` when the line has
        at least six fields; these positions are fed into the ``listing_id``,
        ``date`` and ``reviewer_name`` schema columns. Otherwise
        ``("NONE", None, None, None, None)``, so invalid lines end up under
        the "NONE" location.
    """
    try:
        # The default of next() covers an empty line, for which the reader yields no row.
        fields = next(csv.reader([line]), [])
    except csv.Error:
        # A line the csv module refuses to parse is treated like any other invalid line.
        fields = []
    if len(fields) < 6:
        # "NONE" partition holds the invalid data
        return ("NONE", None, None, None, None)
    # More than six fields means the comment text contained unquoted commas;
    # put the commas back so the comment is stored as it was written.
    comments = ','.join(fields[5:])
    return (location, fields[0], fields[2], fields[4], comments)
    

In [0]:
# Build a single RDD with the cleaned rows of every "<location>.reviews.csv.gz"
# file found in the folders of last quarter (see dirnames).
reviews_suffix = "reviews.csv.gz"


def _load_reviews_rdd(path, location):
    """Return the parse_csv() rows of one reviews file, header line removed.

    The lambdas are created inside this function on purpose: each call gets its
    own `header` and `location` bindings. When the lambdas were created directly
    in the loop they referred to the loop's global variables, which are rebound
    for every file before Spark serializes the functions (that happens lazily,
    on union() or on an action), so rows of one file could be tagged with the
    location of another file.
    """
    raw_rdd = spark.sparkContext.textFile(path)
    header = raw_rdd.first()  # the first line of the file is the CSV header
    return raw_rdd.filter(lambda row: row != header) \
                  .map(lambda row: parse_csv(row, location))


# reviews_rdd stays None when no matching file is found.
reviews_rdd = None
# Directory listings go through the '/dbfs' prefix; Spark itself reads from the
# path without that prefix.
for dn in os.listdir('/dbfs' + indata):
    # only folders that belong to last quarter are read
    if dn not in dirnames:
        continue
    for fn in os.listdir('/dbfs' + indata + '/' + dn):
        # for this project, only the reviews.csv.gz files are of interest;
        # the name must have a prefix in front of the suffix
        if not (len(fn) > len(reviews_suffix) and fn.endswith(reviews_suffix)):
            continue
        # hawaii.reviews.csv.gz -> location = hawaii (the suffix and the one
        # character in front of it are dropped)
        location = fn[:-len(reviews_suffix) - 1]
        clean_rdd = _load_reviews_rdd(indata + '/' + dn + '/' + fn, location)
        reviews_rdd = clean_rdd if reviews_rdd is None else reviews_rdd.union(clean_rdd)

In [0]:
# Target directory of the cleaned data, and the DataFrame that will be written to it.
# Alternative output directory (Windows-style path), kept for reference:
#cleandata = 'c:/sb/strfacts/cleandata'
cleandata = '/mnt/blob1/cleandata'
# reviews_rdd is still None when no reviews file matched; stop with a clear
# message instead of letting createDataFrame fail on a None input.
if reviews_rdd is None:
    raise ValueError("No reviews.csv.gz files were found for last quarter; nothing to convert")
reviews_df = spark.createDataFrame(reviews_rdd, schema=reviews_schema)

In [0]:
# Append the rows to the Parquet data set under cleandata, one partition
# directory per location value. Because the mode is "append", running this
# cell again adds the same rows a second time.
reviews_writer = reviews_df.write.mode("append").partitionBy("location")
reviews_writer.parquet(cleandata)