In [3]:
# Spark init. / Data Path
from pyspark.sql import SparkSession

# Reuse the session the kernel already provides, or create one when the
# notebook is run on a fresh kernel that does not pre-define `spark`.
spark = SparkSession.builder.getOrCreate()

# The bucket name is read from the Hadoop configuration of the running session.
bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket")
if not bucket:
    # Without this guard the string concatenation below would fail with an
    # unhelpful TypeError when the key is not configured.
    raise RuntimeError(
        "Hadoop configuration key 'fs.gs.system.bucket' is not set; "
        "cannot build the GCS data paths."
    )

# Locations of the cleaned ("silver") review and metadata datasets.
silver_root = "gs://" + bucket + "/amazon_reviews_2023/silver"
review_path = silver_root + "/reviews_combined_compact"
meta_path = silver_root + "/meta_combined_compact"
print(review_path)
print(meta_path)

gs://qst843-project/amazon_reviews_2023/silver/reviews_combined_compact
gs://qst843-project/amazon_reviews_2023/silver/meta_combined_compact


In [4]:
# Load Cleaned Data (Review & Meta)
# Both datasets are read as Parquet with the "recursiveFileLookup" option
# enabled, so files in nested directories under each path are picked up.
df_review, df_meta = (
    spark.read.option("recursiveFileLookup", "true").parquet(dataset_path)
    for dataset_path in (review_path, meta_path)
)

In [5]:
# Rename / Drop Duplicate Field Names ("title", "category_name")
# Both frames carry a "title" column, so each gets a distinct name before the
# join; "category_name" is removed from the metadata side only.
df_review = df_review.withColumnRenamed("title", "review_title")
df_meta = (
    df_meta
    .withColumnRenamed("title", "product_name")
    .drop("category_name")
)

In [6]:
# Join Review & Meta Data into a Single df
# Left join keyed on "parent_asin": every review row is kept, and metadata
# columns are null where no matching product record exists.
df = df_review.join(df_meta, "parent_asin", "left")

In [8]:
# Take a small random subset of the joined data: no replacement,
# sampling fraction 0.0002, seeded with 42.
df_sampled = df.sample(False, 0.0002, 42)

In [9]:
# Print the column names, types and nullability of the sampled frame.
df_sampled.printSchema()

root
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- helpful_vote: integer (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- review_image: boolean (nullable = true)
 |-- category_name: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- price: double (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- store: string (nullable = true)
 |-- produ

product_image
product_video
verified_purchase
review_image
price
helpful_vote

rating

In [15]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the feature vector; the vector keeps this order.
feature_cols = [
    "product_image",
    "product_video",
    "verified_purchase",
    "review_image",
    "price",
    "helpful_vote",
]

# Discard rows with a missing value in any feature column, so the assembler's
# default handleInvalid="error" setting has nothing to reject.
df_clean = df_sampled.dropna(subset=feature_cols)

va = VectorAssembler(inputCols=feature_cols, outputCol="features_ml")

# Keep only the assembled vector column.
df_vec = va.transform(df_clean).select("features_ml")

In [17]:
# Display the first 20 assembled feature vectors without truncating them.
df_vec.show(n=20, truncate=False)

[Stage 25:>                                                         (0 + 1) / 1]

+----------------------------+
|features_ml                 |
+----------------------------+
|[1.0,1.0,0.0,0.0,72.85,0.0] |
|[1.0,1.0,1.0,0.0,8.49,0.0]  |
|[1.0,1.0,1.0,0.0,13.99,0.0] |
|[1.0,1.0,0.0,0.0,18.94,0.0] |
|[1.0,1.0,1.0,0.0,23.56,0.0] |
|[1.0,1.0,1.0,0.0,8.56,0.0]  |
|[1.0,0.0,1.0,0.0,29.99,0.0] |
|[1.0,1.0,1.0,0.0,54.0,0.0]  |
|[1.0,1.0,1.0,0.0,54.0,0.0]  |
|[1.0,0.0,1.0,1.0,94.3,0.0]  |
|[1.0,1.0,1.0,0.0,17.99,0.0] |
|[1.0,1.0,1.0,0.0,17.99,31.0]|
|[1.0,1.0,1.0,0.0,17.99,0.0] |
|[1.0,0.0,1.0,0.0,8.49,0.0]  |
|[1.0,0.0,1.0,0.0,44.83,0.0] |
|[1.0,1.0,1.0,0.0,5.29,0.0]  |
|[1.0,1.0,1.0,0.0,55.25,0.0] |
|[1.0,1.0,1.0,0.0,16.0,0.0]  |
|[1.0,1.0,1.0,0.0,5.1,6.0]   |
|[1.0,0.0,1.0,0.0,4.38,1.0]  |
+----------------------------+
only showing top 20 rows



                                                                                