### Import thư viện và khởi tạo SparkSession

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the notebook's SparkSession: it targets the master at
# spark://spark-master:7077 and requests 512 MB of memory per executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

### Đọc các files từ HDFS

In [None]:
# HDFS location of the activity log, stored as parquet.
log_path = "hdfs://namenode:9000/raw_zone/fact/activity"
# HDFS location of the student list file danh_sach_sv_de.csv.
list_path = "hdfs://namenode:9000/raw_zone/vdt2024/data_engineering/danh_sach_sv_de.csv"

# Load the activity log.
try:
    logDF = spark.read.format("parquet").load(log_path)
except Exception as e:
    print(f"Error reading parquet files: {e}")
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # `logDF` undefined (or bound to a stale value from an earlier run) and
    # the failure would only surface later, far from its cause.
    raise

# Load the student list. The file is read without a header row, so Spark
# assigns positional column names (_c0, _c1, ...) and infers the types.
try:
    listDF = (
        spark.read
        .format("csv")
        .option("header", "false")
        .option("inferSchema", "true")
        .load(list_path)
    )
except Exception as e:
    print(f"Error reading csv file: {e}")
    # Same reasoning as above: fail fast instead of continuing without data.
    raise

In [None]:
# Show the schema of both inputs: the activity log first, then the student list.
for frame in (logDF, listDF):
    frame.printSchema()

### Đổi tên cột

In [23]:
# Replace the positional column names of the student list with meaningful
# ones, then print the resulting schema.
column_names = {"_c0": "student_code", "_c1": "student_name"}
for old_name, new_name in column_names.items():
    listDF = listDF.withColumnRenamed(old_name, new_name)
listDF.printSchema()

### Chuyển cột timestamp thành dạng 'yyyyMMdd' và chuyển tên cột thành date

In [26]:
# Parse `timestamp` using the "M/d/yyyy" pattern, render it as "yyyyMMdd",
# store the result back in place, and finally rename the column to `date`.
parsed_day = to_date(col("timestamp"), "M/d/yyyy")
day_key = date_format(parsed_day, "yyyyMMdd")
logDF = logDF.withColumn("timestamp", day_key)
logDF = logDF.withColumnRenamed("timestamp", "date")

### Join hai DF đã được xử lý

In [None]:
# Inner join on student_code: only log rows whose student appears in the
# student list are kept. Preview the first 5 rows.
joinedDF = logDF.join(listDF, on="student_code", how="inner")
joinedDF.show(n=5)

### Xem schema

In [None]:
# Print the column names and types of the joined DataFrame.
joinedDF.printSchema()

### Giải bài toán bằng Spark SQL

In [None]:
# Expose the joined data to Spark SQL under the view name `student_activity`.
joinedDF.createOrReplaceTempView("student_activity")

# Total number of files per (date, student, activity), ordered by student,
# then date, then activity. A triple-quoted literal is used instead of
# backslash-continued lines, which break if any whitespace follows a backslash.
result_sql = spark.sql(
    """
    SELECT
        date
        , student_code
        , student_name
        , activity
        , SUM(numberOfFile) AS totalFile
    FROM student_activity
    GROUP BY date, activity, student_code, student_name
    ORDER BY student_code, date, activity ASC
    """
)
result_sql.show()

### Giải bài toán bằng Spark Dataframe

In [None]:
# Same aggregation as the SQL version, expressed with the DataFrame API:
# sum numberOfFile per (date, student, activity) and sort the result.
group_keys = ["date", "student_code", "student_name", "activity"]
# NOTE: `sum` is the pyspark.sql.functions aggregate pulled in by the star
# import at the top of the notebook, not Python's builtin.
total_files = sum("numberOfFile").alias("totalFile")

result_df = (
    joinedDF.groupBy(*group_keys)
    .agg(total_files)
    .orderBy("student_code", "date", "activity")
)

result_df.show(5)

### Lưu lại file vào HDFS

In [None]:
# Write the result to HDFS as a single CSV part file with a header row,
# replacing any previous output at that location.
#
# repartition(1) shuffles all rows into one partition, and a shuffle does not
# guarantee that the ordering produced by the earlier orderBy survives. The
# rows are therefore re-sorted inside that single partition so the written
# file keeps the student_code / date / activity order.
#
# NOTE(review): "asignments" in the path looks like a misspelling of
# "assignments"; it is left untouched because it is the runtime output
# location - confirm against the expected HDFS layout.
(
    result_df.repartition(1)
    .sortWithinPartitions("student_code", "date", "activity")
    .write
    .csv(
        "hdfs://namenode:9000/gold_zone/asignments/result/38_Vu_Huu_Sy",
        header=True,
        mode="overwrite",
    )
)