In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark import SparkConf

# Reuse the SparkContext that is already running, or create one if there is none.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; all catalog reads and S3 sinks below go through it.
glueContext = GlueContext(sc)
# SparkSession handle exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to this context (constructed here; not used again in the visible cells).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.38.1 
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::170663929988:role/GlueS3_ETL
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: f8830531-921d-4f04-9507-c19fa33713e7
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.38.1
--enable-glue-datacatalog true
Waiting for session f8830531-921d-4f04-9507-c19fa33713e7 to get into ready status...
Session f8830531-921d-4f04-9507-c19fa33713e7 has been created.



# Load the catalog tables as DynamicFrames

In [2]:
def _read_catalog_table(table_name):
    """Return the DynamicFrame for `table_name` in the `stores_and_yelp` catalog database."""
    return glueContext.create_dynamic_frame.from_catalog(
        database="stores_and_yelp",
        table_name=table_name,
    )


# Google store metadata
google_stores_dynamic = _read_catalog_table("google_raw_metadata_sitios")

# Google reviews, one catalog table per state
reviews_california_dynamic = _read_catalog_table("california_raw_review_california")
reviews_florida_dynamic = _read_catalog_table("florida_raw_review_florida")
# NOTE(review): "nevada_rawreview_nevada" lacks the underscore its sibling tables have -
# confirm it matches the table name in the catalog.
reviews_nevada_dynamic = _read_catalog_table("nevada_rawreview_nevada")
reviews_newyork_dynamic = _read_catalog_table("ny_raw_review_new_york")
reviews_texas_dynamic = _read_catalog_table("texas_raw_review_texas")

# Yelp businesses, reviews and users
yelp_stores_dynamic = _read_catalog_table("yelp_raw_business_parquet")
yelp_reviews_dynamic = _read_catalog_table("yelp_raw_review_001_json")
yelp_clientes_dynamic = _read_catalog_table("yelp_raw_user_003_parquet")

# State lookup table (joined on `state` further down to attach the state ID)
states_dynamic = _read_catalog_table("yelp_raw_states_parquet")




# Convert the dynamic frames into spark dataframes

In [3]:
# Turn every DynamicFrame into a Spark DataFrame so the regular DataFrame API can be used below.
(
    google_stores_spark,
    reviews_california_spark,
    reviews_florida_spark,
    reviews_nevada_spark,
    reviews_newyork_spark,
    reviews_texas_spark,
    yelp_stores_spark,
    yelp_reviews_spark,
    yelp_clientes_spark,
    states_spark,
) = (
    dynamic_frame.toDF()
    for dynamic_frame in (
        google_stores_dynamic,
        reviews_california_dynamic,
        reviews_florida_dynamic,
        reviews_nevada_dynamic,
        reviews_newyork_dynamic,
        reviews_texas_dynamic,
        yelp_stores_dynamic,
        yelp_reviews_dynamic,
        yelp_clientes_dynamic,
        states_dynamic,
    )
)




In [31]:
# The state key is called `stateID` in the rest of the notebook; rename it on the lookup table.
states_spark = states_spark.withColumnRenamed(existing='state_id', new='stateID')




In [5]:
from pyspark.sql.functions import concat, concat_ws, col, row_number, lower, udf, split, array, create_map, array_contains, when, lit, monotonically_increasing_id, from_unixtime, explode
from functools import reduce
from pyspark.sql.types import StringType, TimestampType, ArrayType, IntegerType




# Yelp

In [32]:
# Shape the Yelp stores dataframe for the machine learning and data analysis stages.


states = ['CA', 'FL', 'NV', 'NY', 'TX']
categories = ['restaurant', 'coffee', 'bar', 'deli', 'sandwich', 'grocery', 'bakery', 'restaurants', 'coffees', 'bars', 'delis', 'sandwiches', 'groceries', 'bakeries']

# Drop duplicates, build the full address column and keep only the stores in the target states.
# concat_ws (not concat) is used so the address parts are separated by ', ' and a single null
# part no longer turns the whole address into null.

yelp_stores_spark = yelp_stores_spark.dropDuplicates(['business_id'])
yelp_stores_spark = yelp_stores_spark.withColumn('address_total', concat_ws(', ', col('address'), col('city'), col('state'), col('postal_code')))
yelp_stores_spark = yelp_stores_spark.drop('address', 'city', 'postal_code', 'is_open', 'review_count', 'hours', '__index_level_0__', 'stars')
yelp_stores_spark = yelp_stores_spark.withColumnRenamed('address_total', 'address')
yelp_stores_spark = yelp_stores_spark.filter(col('state').isin(states))
# Replace the two-letter code with the full state name, which is the key used to join states_spark below.
yelp_stores_spark = yelp_stores_spark.withColumn('state', when(yelp_stores_spark['state'] == 'CA', 'California')
                                                         .when(yelp_stores_spark['state'] == 'FL', 'Florida')
                                                         .when(yelp_stores_spark['state'] == 'NV', 'Nevada')
                                                         .when(yelp_stores_spark['state'] == 'NY', 'New York')
                                                         .when(yelp_stores_spark['state'] == 'TX', 'Texas'))

# Filter the stores by category and join with the states df to have the correct stateID.
# The split pattern swallows the whitespace around each comma: splitting "a, b" on a bare ','
# yields ['a', ' b'], and array_contains(' b' vs 'b') would then only ever match the first entry.

yelp_stores_spark = yelp_stores_spark.withColumn('categories', lower(col('categories').cast('string')))
yelp_stores_spark = yelp_stores_spark.withColumn('category_array', split(col('categories'), r'\s*,\s*'))
categories_expr = [array_contains(col('category_array'), cat) for cat in categories]
# OR all the per-category checks together: a store is kept if any of its categories is in the list.
yelp_stores_spark = yelp_stores_spark.withColumn('category_match', reduce(lambda x, y: x | y, categories_expr))
yelp_stores_spark = yelp_stores_spark.filter(col('category_match'))
yelp_stores_spark = yelp_stores_spark.join(states_spark, 'state', 'inner')

# Create the column delivery ('Y' only when attributes.RestaurantsDelivery equals 'True',
# 'N' otherwise, including null) and drop the unnecessary columns.

yelp_stores_spark = yelp_stores_spark.withColumn('delivery', col('attributes.RestaurantsDelivery'))
yelp_stores_spark = yelp_stores_spark.withColumn('delivery', when(yelp_stores_spark['delivery'] == 'True', 'Y').otherwise('N'))
yelp_stores_spark = yelp_stores_spark.drop('attributes', 'category_array', 'category_match', 'state')




In [33]:
# Prepare the Yelp reviews in one pass:
#   1. keep a single row per review_id,
#   2. inner-join with the filtered stores so reviews of other stores are discarded,
#   3. drop the vote counters and the store columns brought in by the join,
#   4. rename stars -> rating,
#   5. convert `date` to a timestamp (the value is divided by 1000 before from_unixtime,
#      presumably because it is stored as epoch milliseconds - TODO confirm).

yelp_reviews_spark = (
    yelp_reviews_spark
    .dropDuplicates(['review_id'])
    .join(yelp_stores_spark, 'business_id', 'inner')
    .drop('funny', 'cool', 'useful', 'name', 'address', 'latitude', 'longitude', 'stateID', 'delivery', 'categories')
    .withColumnRenamed('stars', 'rating')
    .withColumn("date", from_unixtime(col("date") / 1000).cast(TimestampType()))
)




In [8]:
# Finally, transform the Yelp users (clientes) df.
# Drop the unnecessary columns and duplicate users, then keep only the users that wrote at
# least one review for the stores being considered.

drop_columns = ['review_count', 'yelping_since', 'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos']
yelp_clientes_spark = yelp_clientes_spark.drop(*drop_columns)
yelp_clientes_spark = yelp_clientes_spark.dropDuplicates(['user_id'])

# Join against the DISTINCT reviewer ids rather than the full reviews table: joining the
# reviews directly emits one row per review, so a user with several reviews would appear
# several times in clientes even though duplicates were dropped just above. Selecting only
# `user_id` also means no review column has to be dropped afterwards.
reviewer_ids = yelp_reviews_spark.select('user_id').distinct()
yelp_clientes_spark = yelp_clientes_spark.join(reviewer_ids, 'user_id', 'inner')




In [9]:
# Give the Yelp key columns the ID names shared with the Google tables (storeID / reviewID / userID)

yelp_stores_spark = yelp_stores_spark.withColumnRenamed('business_id', 'storeID')

for old_name, new_name in (('business_id', 'storeID'), ('review_id', 'reviewID'), ('user_id', 'userID')):
    yelp_reviews_spark = yelp_reviews_spark.withColumnRenamed(old_name, new_name)

yelp_clientes_spark = yelp_clientes_spark.withColumnRenamed('user_id', 'userID')




# Google

In [10]:
# Now the Google stores df.
# Drop the unnecessary columns, duplicates and null values, and lower-case the category column.

categories = ['restaurant', 'coffee', 'bar', 'deli', 'sandwich', 'grocery', 'bakery', 'restaurants', 'coffees', 'bars', 'delis', 'sandwiches', 'groceries', 'bakeries']
drop_columns = ['description', 'state', 'price', 'hours', 'avg_rating', 'num_of_reviews', 'relative_results', 'url']
delivery = ['Delivery', 'delivery']

google_stores_spark = google_stores_spark.drop(*drop_columns)
google_stores_spark = google_stores_spark.dropDuplicates(['gmap_id'])
google_stores_spark = google_stores_spark.na.drop(subset = ['category', 'gmap_id'])
# Flatten the category values into one ', '-separated, lower-cased string.
google_stores_spark = google_stores_spark.withColumn('category', concat_ws(', ', google_stores_spark['category']))
google_stores_spark = google_stores_spark.withColumn('category', lower(col('category').cast('string')))

# Filter the categories.
# The string was joined with ', ' above, so it must be split on the comma AND the surrounding
# whitespace; splitting on a bare ',' leaves ' bar', ' deli', ... which never equal the
# entries of `categories`, so only the first category of each store could match.

google_stores_spark = google_stores_spark.withColumn('category_array', split(col('category'), r'\s*,\s*'))
categories_expr = [array_contains(col('category_array'), cat) for cat in categories]
# OR all the per-category checks together: a store is kept if any of its categories is in the list.
google_stores_spark = google_stores_spark.withColumn('category_match', reduce(lambda x, y: x | y, categories_expr))
google_stores_spark = google_stores_spark.filter(col('category_match'))

# Convert the latitude and longitude columns to double by taking the `double` field of each column

google_stores_spark = google_stores_spark.withColumn('latitude', col('latitude').getField('double'))
google_stores_spark = google_stores_spark.withColumn('longitude', col('longitude').getField('double'))

# Create the delivery column ('Y' when "MISC.Service options" contains 'Delivery' or 'delivery',
# 'N' otherwise) and drop the helper columns

google_stores_spark = google_stores_spark.withColumn("service_options", col("MISC.Service options"))
google_stores_spark = google_stores_spark.withColumn("delivery", when(array_contains(col("service_options"), delivery[0]) | array_contains(col("service_options"), delivery[1]), "Y").otherwise("N"))
google_stores_spark = google_stores_spark.drop('category_array', 'category_match', 'MISC', 'service_options')

# Rename the category column to categories

google_stores_spark = google_stores_spark.withColumnRenamed("category", "categories")




In [11]:
def _prepare_state_reviews(state_reviews, state_name):
    """Normalise one state's Google reviews dataframe.

    Renames `name` -> `user_name` and `time` -> `date`, adds a literal `state` column
    holding `state_name`, and drops the rows whose `text` or `gmap_id` is null.
    """
    return (
        state_reviews
        .withColumnRenamed("name", "user_name")
        .withColumnRenamed("time", "date")
        .withColumn("state", lit(state_name))
        .na.drop(subset=['text', 'gmap_id'])
    )


reviews_california_spark = _prepare_state_reviews(reviews_california_spark, "California")
reviews_florida_spark = _prepare_state_reviews(reviews_florida_spark, "Florida")
reviews_nevada_spark = _prepare_state_reviews(reviews_nevada_spark, "Nevada")
reviews_texas_spark = _prepare_state_reviews(reviews_texas_spark, "Texas")
reviews_newyork_spark = _prepare_state_reviews(reviews_newyork_spark, "New York")




In [12]:
# Attach the store columns to each state's reviews; the inner join on gmap_id also discards
# the reviews of stores that did not survive the category filter.

california, florida, nevada, newyork, texas = (
    google_stores_spark.join(state_reviews, "gmap_id", "inner")
    for state_reviews in (
        reviews_california_spark,
        reviews_florida_spark,
        reviews_nevada_spark,
        reviews_newyork_spark,
        reviews_texas_spark,
    )
)




In [13]:
# Stack the five per-state dataframes into a single Google reviews dataframe.
# unionByName matches columns by name; a plain union() matches them by position, so if one
# state's source table listed its columns in a different order the values would silently end
# up under the wrong column. With unionByName a mismatch in column names raises instead.

reviews_google = (
    california
    .unionByName(florida)
    .unionByName(nevada)
    .unionByName(newyork)
    .unionByName(texas)
)




In [14]:
# Split the combined frame in two: the store attributes go to stores_google, and
# reviews_google keeps only the review-level columns.
# Note that stores_google still holds one row per review at this point.

store_columns = ['gmap_id', 'name', 'latitude', 'longitude', 'categories', 'address', 'delivery', 'state']
non_review_columns = ['name', 'address', 'latitude', 'longitude', 'categories', 'delivery', 'pics', 'state']

stores_google = reviews_google.select(*store_columns)
reviews_google = reviews_google.drop(*non_review_columns)




In [15]:
# Rename user_name / gmap_id / user_id, add a generated reviewID and convert `date` to a timestamp.

reviews_google = (
    reviews_google
    .withColumnRenamed("user_name", "name")
    .withColumnRenamed('gmap_id', 'storeID')
    .withColumnRenamed('user_id', 'userID')
)

# monotonically_increasing_id() returns a 64-bit value with the partition id in the upper bits.
# Casting it to "int" keeps only the low 32 bits, so rows from different partitions collapse
# onto the same ids. The id ends up as a string anyway, so cast the full value straight to string.
# NOTE(review): the id is regenerated whenever this plan is re-evaluated; consider persisting
# reviews_google if `reviews` and `comentarios` must be guaranteed to agree on it - confirm.
reviews_google = reviews_google.withColumn("reviewID", (monotonically_increasing_id() + 1).cast("string"))

# `date` is divided by 1000 before from_unixtime (presumably epoch milliseconds - TODO confirm).
reviews_google = reviews_google.withColumn("date", from_unixtime(col("date") / 1000).cast(TimestampType()))




In [16]:
# Build the Google `clientes` (users) and `comentarios` (owner responses) tables from
# reviews_google, then drop the columns `name` and `resp` from reviews_google.

# One row per reviewer.
clientes_google = reviews_google.select("name", "userID")
clientes_google = clientes_google.dropDuplicates(['userID'])

# One row per review that has a response (`resp` not null), keyed back to the review by reviewID.
comentarios = reviews_google.select("resp", "reviewID")
comentarios = comentarios.na.drop(subset = ['resp'])
# Keep the id as a 64-bit long: monotonically_increasing_id() stores the partition id in the
# upper bits, and casting to "int" would truncate them and produce duplicate commentIDs
# across partitions.
comentarios = comentarios.withColumn("commentID", (monotonically_increasing_id() + 1).cast("long"))
comentarios = comentarios.withColumn("date", col("resp.time"))
comentarios = comentarios.withColumn("response", col("resp.text"))
# resp.time is divided by 1000 before from_unixtime (presumably epoch milliseconds - TODO confirm).
comentarios = comentarios.withColumn("date", from_unixtime(col("date") / 1000).cast(TimestampType()))
comentarios = comentarios.drop('resp')

reviews_google = reviews_google.drop("name", "resp")




In [17]:
# Rename gmap_id to storeID, keep a single row per store and join with states_spark to get the stateID.

stores_google = stores_google.withColumnRenamed("gmap_id", "storeID")
# stores_google was selected from the review-level dataframe, so it carries one row per
# review; without this de-duplication every store would be written once per review.
stores_google = stores_google.dropDuplicates(['storeID'])
stores_google = stores_google.join(states_spark, "state", "inner")
stores_google = stores_google.drop("state")




# Union of the Yelp and Google dataframes

In [18]:
# Unify the Yelp dataframes with the Google dataframes.
# The two sides are built by different pipelines and do not list their columns in the same
# order (e.g. the Google clientes frame is selected as (name, userID), and its reviewID is
# appended last). DataFrame.union() pairs columns by POSITION, which would put names under
# userID, dates under reviewID, and so on. unionByName pairs them by column name instead and
# raises if the two sides do not have the same set of columns.

stores = yelp_stores_spark.unionByName(stores_google)
reviews = yelp_reviews_spark.unionByName(reviews_google)
clientes = yelp_clientes_spark.unionByName(clientes_google)




In [19]:
# Create the table category: one row per distinct category with a generated categoryID.

# Split on the comma together with the whitespace around it. The category strings are
# ', '-separated, so splitting on a bare ',' would keep a leading space and make ' bars' and
# 'bars' two different categories.
stores = stores.withColumn('categories', split(col('categories'), r'\s*,\s*'))
category = stores.select(explode("categories").alias("categories"))
category = category.dropDuplicates(['categories'])
# Keep the id as a 64-bit long: monotonically_increasing_id() stores the partition id in the
# upper bits, and casting to "int" would truncate them, giving the same categoryID to rows
# that sit in different partitions.
category = category.withColumn("categoryID", (monotonically_increasing_id() + 1).cast("long"))
category = category.withColumnRenamed("categories", "category")




In [20]:
# Build the store <-> category link table: explode each store's category array into one row
# per (storeID, category), look up the categoryID, and keep only the two ID columns.

store_category = (
    stores
    .select('storeID', explode('categories').alias('category'))
    .join(category, "category", "inner")
    .drop('category')
)




In [21]:
# The categories now live in `category` / `store_category`, so remove the column from stores

redundant_columns = ('categories',)
stores = stores.drop(*redundant_columns)




# Upload the dataframes to S3

In [22]:
# Wrap each final Spark DataFrame in a DynamicFrame so it can be written through a Glue sink

from awsglue.dynamicframe import DynamicFrame

(
    stores_dynamic,
    reviews_dynamic,
    clientes_dynamic,
    comentarios_dynamic,
    states_dynamic,
    category_dynamic,
    store_category_dynamic,
) = (
    DynamicFrame.fromDF(dataframe, glueContext, frame_name)
    for dataframe, frame_name in (
        (stores, 'stores'),
        (reviews, 'reviews'),
        (clientes, 'clientes'),
        (comentarios, 'comentarios'),
        (states_spark, 'states'),
        (category, 'category'),
        (store_category, 'store_category'),
    )
)




### Load the data

In [26]:
# Write the `stores` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.stores` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/stores",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_stores",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="stores"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(stores_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb1f0a650>


In [27]:
# Write the `reviews` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.reviews` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/reviews",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_reviews",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="reviews"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(reviews_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb1f0ab50>


In [None]:
# Write the `clientes` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.clientes` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/clientes",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_clientes",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="clientes"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(clientes_dynamic)

In [29]:
# Write the `comentarios` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.comentarios` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/comentarios",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_comentarios",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="comentarios"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(comentarios_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb1f0afd0>


In [30]:
# Write the `states` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.states` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/states",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_states",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="states"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(states_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb351c450>


In [31]:
# Write the `category` table to S3 as snappy-compressed Glue parquet and create/update the
# `clean_data.category` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/category",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_category",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="category"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(category_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb351c890>


In [32]:
# Write the `store_category` table to S3 as snappy-compressed Glue parquet and create/update
# the `clean_data.store_category` table in the Data Catalog.
s3output = glueContext.getSink(
  path="s3://googleyelpproject/clean_data/store_category",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  # transformation_ctx identifies this sink's state information; it is kept unique per sink
  # so the state of different tables can never be mixed up.
  transformation_ctx="s3output_store_category",
)

s3output.setCatalogInfo(
  catalogDatabase="clean_data", catalogTableName="store_category"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(store_category_dynamic)

<awsglue.dynamicframe.DynamicFrame object at 0x7f6fb351cc10>
