## PySpark Jupyter Notebook with Python 3 on Windows 11
## Read data from local storage, Cleaning / Transforming the data, then persist data in parquet format with partition

In [11]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
import os
from pyspark.sql import SparkSession
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [13]:
indata = 'c:/sb/strfacts/indata'

In [14]:
spark = SparkSession.builder.appName("ingestion").getOrCreate()

## Read all csv files of last quarter, folders in format yyyy-mm

In [15]:
today = datetime.datetime.now()
print(today.year)
print(today.month)

2022
6


In [16]:
# Free data is available quaterly, but you can pay to get monthly data. For now we will use free data
# Calculate date for the last quater data.
if today.month in [1, 2, 3]:
    dirnames = ["{}-{}".format(today.year-1, i) for i in ['10','11','12']]
elif today.month in [4, 5, 6]:
    dirnames = ["{}-{}".format(today.year, i) for i in ['01','02','03']]
elif today.month in [7, 8, 9]:
    dirnames = ["{}-{}".format(today.year, i) for i in ['04','05','06']]
else:
    dirnames = ["{}-{}".format(today.year, i) for i in ['07','08','09']]

In [17]:

# define schema for reviews.csv.gz
reviews_schema = StructType([
    StructField('location', StringType(), True),
    StructField('listing_id', StringType(), True),
    #StructField('id',         IntegerType(),True),
    StructField('date',       StringType(),True),
    #StructField('reviewer_id', IntegerType(),True),
    StructField('reviewer_name', StringType(),True),
    StructField('comments', StringType(), True)])


In [18]:
print (dirnames)

['2022-01', '2022-02', '2022-03']


In [19]:
#parse csv file
def parse_csv(line:str, location):
    f =line.split(',')
    if len(f) < 6:
        # "NONE" partition holds the invalid data
        #return ("NONE", None, None, None, None, None, None)
        return ("NONE", None, None, None, None)
    else:
        # join the comments together into one string
        comments = ''
        for words in f[5:]:
            comments = comments +' '+words
        return(location,f[0],f[2],f[4],comments)
    

In [20]:
# rdd
reviews_rdd = None
for dn in os.listdir(indata):
    # read the files in this folder if dn is last quarter
    if dn in dirnames:
        # for this project, we are only interested in the reviews.csv.gz file
        filenames = os.listdir(indata+'/'+dn)
        for fn in filenames:
            a = len(fn)
            b = len("reviews.csv.gz")
            # hawaii.reviews.csv.gz; location = hawaii
            if a > b and fn[a-b:] == "reviews.csv.gz":
                location = fn[:a-b-1]
                raw_rdd=spark.sparkContext.textFile(indata+'/'+dn+'/'+fn)
                header = raw_rdd.first() #get the first row to a variable
                #remove the header, then clean the rest rows with parse_csv()
                clean_rdd = raw_rdd.filter(lambda row:row != header)\
                                   .map(lambda row: parse_csv(row, location))
                if reviews_rdd is None:
                    reviews_rdd = clean_rdd
                else:
                    union_rdd = reviews_rdd.union(clean_rdd)
                    reviews_rdd = union_rdd

## Only keep a small data set in the local indata folder to make the next line a bit faster, for easy development and testing.

In [22]:
# Append to the parquet files with place name as partition
cleandata = 'c:/sb/strfacts/cleandata'
reviews_df = spark.createDataFrame(reviews_rdd, schema=reviews_schema)

In [23]:
reviews_df.show(5)

+--------+----------+----------+-------------+--------------------+
|location|listing_id|      date|reviewer_name|            comments|
+--------+----------+----------+-------------+--------------------+
|  hawaii|   2797791|2014-05-03|        Evita| "We spent a week...|
|  hawaii|   2797791|2014-05-28|      Michael| The reservation ...|
|  hawaii|   2797791|2014-06-02|          Tim| "It was an epic ...|
|    NONE|      null|      null|         null|                null|
|    NONE|      null|      null|         null|                null|
+--------+----------+----------+-------------+--------------------+
only showing top 5 rows



In [24]:
reviews_df.write.partitionBy("location").mode("append").parquet(cleandata)