In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: local mode on all
# cores, with the memory / shuffle / GC settings listed below.
spark = (
    SparkSession.builder
    .appName("Prepare Dataset")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.executor.memoryOverhead", "512m")
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/23 16:18:27 WARN Utils: Your hostname, MacBook-Pro-cua-Quach.local, resolves to a loopback address: 127.0.0.1; using 192.168.100.49 instead (on interface en0)
25/07/23 16:18:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/23 16:18:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Load all tab-separated files under the merged dataset directory into one
# DataFrame, taking column names from the header row and letting Spark infer
# the column types, then show the resulting schema.
df_product = (
    spark.read
    .options(header=True, inferSchema=True, delimiter="\t")
    .csv("data_source/merged_kaggle_datasets/")
)

df_product.printSchema()




root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



                                                                                

In [13]:
# Count the loaded rows; the recorded run returned 22941811.
df_product.count()

                                                                                

22941811

In [None]:
# Rename the `review_id` column to `id`.
# NOTE(review): the schema printed after loading already lists `id` and no
# `review_id`, so on that input this rename presumably changes nothing —
# confirm whether this cell is still needed.
df_product = df_product.withColumnRenamed("review_id", "id")

In [11]:
import os
import shutil
from pyspark.sql import SparkSession


# Rename the `review_id` header to `id` in every .tsv file of `input_dir`,
# rewriting each affected file in place. Spark can only write a directory of
# part files, so each file is exported to a temporary directory as a single
# part and that part is then moved over the original.
input_dir = "data_source/merged_kaggle_datasets/"
temp_output_dir = "data_source/merged_kaggle_datasets/temp_output/"

for filename in os.listdir(input_dir):
    if not filename.endswith(".tsv"):
        continue

    input_path = os.path.join(input_dir, filename)

    # Read every column as a plain string (no schema inference): only the
    # header changes, so the values should be written back as they were read,
    # and inference would cost an extra pass over the file.
    df = spark.read \
        .option("header", True) \
        .option("delimiter", "\t") \
        .csv(input_path)

    # Nothing to rename: leave the file untouched so re-running this cell is cheap.
    if "review_id" not in df.columns:
        continue

    try:
        # Write to the temporary directory as TSV, as a single part file.
        # The write fully consumes `input_path` before the move below replaces it.
        df.withColumnRenamed("review_id", "id") \
            .coalesce(1) \
            .write \
            .option("header", True) \
            .option("delimiter", "\t") \
            .mode("overwrite") \
            .csv(temp_output_dir)

        # Locate the actual data file among Spark's outputs (_SUCCESS, .crc, ...).
        part_files = [name for name in os.listdir(temp_output_dir) if name.endswith(".csv")]
        if not part_files:
            raise FileNotFoundError(
                f"Spark wrote no .csv part file to {temp_output_dir} for {input_path}"
            )

        # Overwrite the original file with the rewritten one.
        shutil.move(os.path.join(temp_output_dir, part_files[0]), input_path)
    finally:
        # Always remove the temporary directory, even when the write or move failed,
        # so nothing is left behind inside the input directory.
        shutil.rmtree(temp_output_dir, ignore_errors=True)


                                                                                

In [None]:
import os
import shutil

# Export `df_product` as one CSV file. Spark writes a directory of part files,
# so the data is written to a temporary directory as a single part, which is
# then moved to the final path.
# NOTE(review): the target is named customer_location.csv although the frame
# written is `df_product` — confirm the file name is intended.
output_temp_dir = "data_source/processed_datasets/df_location_temp"
final_output_path = "data_source/processed_datasets/customer_location.csv"

df_product \
    .coalesce(1) \
    .write \
    .option("header", True) \
    .mode("overwrite") \
    .csv(output_temp_dir)

# Find the actual CSV part file (skips _SUCCESS and checksum files).
part_files = [name for name in os.listdir(output_temp_dir) if name.endswith(".csv")]
if not part_files:
    # Fail loudly and keep the temporary directory for inspection instead of
    # deleting it and finishing without having produced any output.
    raise FileNotFoundError(f"Spark wrote no .csv part file to {output_temp_dir}")
shutil.move(os.path.join(output_temp_dir, part_files[0]), final_output_path)

# Remove the temporary directory.
shutil.rmtree(output_temp_dir)


In [None]:
from datetime import date

from pyspark.sql.functions import col, rand, sequence, explode, lit, round, date_add, to_date
from pyspark.sql.types import IntegerType

# Build a synthetic price-history table: every input row is repeated 1-3 times,
# each copy getting a price within +/-10% of the original and a random
# `updated_at` date.
# NOTE: `round` imported above is Spark's column function; it shadows Python's
# built-in `round` for the rest of the kernel session.
# NOTE(review): the schema printed when `df_product` is loaded lists no `price`,
# `product_title` or `currency` columns — confirm `df_product` is the intended
# input before running this cell.

# Seed the random columns so repeated actions (write, count, ...) see the same
# generated values; each column gets its own seed so they are not correlated.
random_seed = 42

# Randomly assign the number of price updates per row (1-3), i.e. about twice
# the input row count on average after exploding.
df_with_changes = df_product.withColumn("change_times", (rand(random_seed) * 3 + 1).cast("int"))

# One output row per update index 1..change_times.
df_exploded = df_with_changes.withColumn("change_index", explode(sequence(lit(1), col("change_times"))))

# Date window for `updated_at`: 1999-12-13 through 2015-08-31, both inclusive.
# The day count is derived from the two bounds instead of being hard-coded
# (the inclusive span is 5741 days, so offsets run 0..5740).
first_update_day = date(1999, 12, 13)
last_update_day = date(2015, 8, 31)
start_date = to_date(lit(first_update_day.isoformat()))
date_range_days = (last_update_day - first_update_day).days + 1

df_shadow_product = df_exploded \
    .withColumn("price", round(col("price") * (1 + rand(random_seed + 1) * 0.2 - 0.1), 2)) \
    .withColumn("updated_at", date_add(start_date, (rand(random_seed + 2) * date_range_days).cast(IntegerType()))) \
    .select(
        col("product_id"),
        col("product_title"),
        col("price"),
        col("currency"),
        col("updated_at")
    )


In [None]:
from pyspark.sql.functions import expr

# Add an `id` column filled by Spark SQL's uuid() (replacing any existing `id`).
row_uuid = expr("uuid()")
df_shadow_product = df_shadow_product.withColumn("id", row_uuid)

In [None]:
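# Reference only (nothing in this cell is executed): a leftover snippet for
# re-reading the raw review files, followed by a recorded date range —
# presumably the min/max `review_date` of the dataset (TODO confirm). The same
# two dates bound the synthetic `updated_at` values generated above.
# df_reviews = spark.read \
#     .option("header", True) \
#     .option("delimiter", "\t") \
#     .csv("data_source/merged_kaggle_datasets/")


# +-------------+-----------+
# |earliest_date|latest_date|
# +-------------+-----------+
# |   1999-12-13| 2015-08-31|
# +-------------+-----------+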

In [None]:
import os
import shutil

# Export `df_shadow_product` as one CSV file. Spark writes a directory of part
# files, so the data is written to a temporary directory as a single part,
# which is then moved to the final path.
# NOTE(review): the target is named category.csv and the temporary directory
# df_location_temp although the frame written is `df_shadow_product` — confirm
# these names are intended.
output_temp_dir = "data_source/processed_datasets/df_location_temp"
final_output_path = "data_source/processed_datasets/category.csv"

df_shadow_product \
    .coalesce(1) \
    .write \
    .option("header", True) \
    .mode("overwrite") \
    .csv(output_temp_dir)

# Find the actual CSV part file (skips _SUCCESS and checksum files).
part_files = [name for name in os.listdir(output_temp_dir) if name.endswith(".csv")]
if not part_files:
    # Fail loudly and keep the temporary directory for inspection instead of
    # deleting it and finishing without having produced any output.
    raise FileNotFoundError(f"Spark wrote no .csv part file to {output_temp_dir}")
shutil.move(os.path.join(output_temp_dir, part_files[0]), final_output_path)

# Remove the temporary directory.
shutil.rmtree(output_temp_dir)


In [None]:
# Count the rows of the generated DataFrame. No cache() is applied to it in
# this notebook, so this action evaluates the whole plan again.
df_shadow_product.count()

In [None]:
# Shut down the Spark session used by this notebook.
spark.stop()