In [1]:
import os
# Tell spark-submit which extra packages to pull (the Hadoop S3A connector and
# the AWS SDK bundle). Set here, ahead of the SparkSession creation further down.
os.environ.update(
    PYSPARK_SUBMIT_ARGS=" ".join(
        [
            "--packages",
            "org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.12.180",
            "pyspark-shell",
        ]
    )
)


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) the Spark session configured for the S3A filesystem.
#
# The endpoint and credentials are read from the environment so they can be
# changed without editing the notebook (the container IP in particular is not
# guaranteed to be stable). The previous literals are kept only as fallbacks so
# an existing lab setup keeps working unchanged.
# NOTE(review): the fallback key/secret are still present in source — drop them
# once MINIO_ACCESS_KEY / MINIO_SECRET_KEY are set wherever this notebook runs.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://172.18.0.3:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "labdata1")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "labdata1")

spark = (
    SparkSession.builder
    .appName("SparkMinIOExample")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [14]:
# Load the raw CSV from object storage: the first row supplies the column
# names and column types are inferred from the data.
bronze_csv_path = "s3a://raw/csv/movies/year=2025/month=10/day=06/netflix_titles_20251006_201457.csv"
csv_read_options = {"header": True, "inferSchema": True}
df_bronze = spark.read.csv(bronze_csv_path, **csv_read_options)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                null|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [33]:
# Keep one copy of each row; rows count as duplicates only when every column matches.
df_bronze = df_bronze.distinct()
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|    s33|TV Show|       Sex Education|                null|Asa Butterfield, ...|      United Kingdom|September 17, 2021|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|     July 15, 2021|        2007|    PG|   86 min|Children & Family...|This Oscar-nomina...|
|   s695|  Movie|               Azi

In [35]:
# Replace nulls in these three columns with the placeholder "Unknown".
placeholder_by_column = dict.fromkeys(["director", "cast", "country"], "Unknown")
df_bronze = df_bronze.fillna(placeholder_by_column)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|September 17, 2021|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|     July 15, 2021|        2007|    PG|   86 min|Children & Family...|This Oscar-nomina...|
|   s695|  Movie|               Azi

In [37]:
from pyspark.sql.functions import to_date, col

# Convert the textual date (e.g. "September 17, 2021") into a date column and
# cast release_year to an integer.
parsed_date_added = to_date(col("date_added"), "MMMM d, yyyy")
release_year_as_int = col("release_year").cast("int")

df_bronze = (
    df_bronze
    .withColumn("date_added", parsed_date_added)
    .withColumn("release_year", release_year_as_int)
)

df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|2021-07-15|        2007|    PG|   86 min|Children & Family...|This Oscar-nomina...|
|   s695|  Movie|               Aziza|      Soudade Kaadan|Caress Bashar, A

In [38]:
# Normalise column names: lower-case, with spaces replaced by underscores.
# The names are taken from df_bronze itself. This cell previously read
# `df.columns`, but no `df` is defined in the cells leading up to this one, so
# on a fresh kernel it raised NameError (or picked up a stale frame).
normalised_names = [name.lower().replace(" ", "_") for name in df_bronze.columns]
df_bronze = df_bronze.toDF(*normalised_names)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|2021-07-15|        2007|    PG|   86 min|Children & Family...|This Oscar-nomina...|
|   s695|  Movie|               Aziza|      Soudade Kaadan|Caress Bashar, A

In [39]:
from pyspark.sql.functions import regexp_extract

# Break values such as "86 min" / "3 Seasons" into the first run of digits
# (cast to int) and the first run of letters.
duration_col = col("duration")
duration_number = regexp_extract(duration_col, r"(\d+)", 1).cast("int")
duration_unit = regexp_extract(duration_col, r"([A-Za-z]+)", 1)

df_bronze = (
    df_bronze
    .withColumn("duration_int", duration_number)
    .withColumn("duration_type", duration_unit)
)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|           3|      Seasons|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|2021-07-15|        2007|    PG|   86 min|Children &

In [40]:
from pyspark.sql.functions import year, month

# Derive the calendar year and month from date_added as separate columns.
date_column = "date_added"
df_bronze = (
    df_bronze
    .withColumn("added_year", year(date_column))
    .withColumn("added_month", month(date_column))
)

df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|           3|      Seasons|      2021|          9|
|   s461|  Movie|           Surf's Up|Ash Brannon, Chri...|S

In [42]:
# Boolean flag comparing the type column against the literal "Movie".
is_movie_flag = col("type") == "Movie"
df_bronze = df_bronze.withColumn("is_movie", is_movie_flag)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|is_movie|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|           3|      Seasons|      2021|          9|   false|
|   s461|  Movie|       

In [45]:
from pyspark.sql.functions import split

# Turn the comma-separated listed_in string into an array column; the pattern
# also consumes any whitespace that follows each comma.
genre_separator = ",\\s*"
genre_array = split(col("listed_in"), genre_separator)
df_bronze = df_bronze.withColumn("genres", genre_array)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|is_movie|              genres|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,...|Insecure Otis has...|           3|      Se

In [46]:
from pyspark.sql.functions import split

# main_country is whatever precedes the first comma in country.
country_parts = split(col("country"), ",")
df_bronze = df_bronze.withColumn("main_country", country_parts[0])
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+--------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|is_movie|              genres|  main_country|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+--------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,.

In [48]:
from pyspark.sql.functions import regexp_replace

# Remove a double-quote sitting at the very start and/or very end of description.
quote_at_edges = r'^"|"$'
cleaned_description = regexp_replace(col("description"), quote_at_edges, '')
df_bronze = df_bronze.withColumn("description", cleaned_description)
df_bronze.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+--------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|is_movie|              genres|  main_country|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+--------------+
|    s33|TV Show|       Sex Education|             Unknown|Asa Butterfield, ...|      United Kingdom|2021-09-17|        2020| TV-MA|3 Seasons|British TV Shows,.

In [49]:
# Persist the transformed frame as Parquet, replacing anything already at the target path.
silver_writer = df_bronze.write.mode("overwrite")
silver_writer.parquet("s3a://trusted/silver")

In [50]:
# Read the Parquet output back and preview five rows as a sanity check.
df_check = spark.read.parquet("s3a://trusted/silver/")
df_check.show(5)

+-------+-------+--------------------+--------------------+--------------------+--------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+------------+
|show_id|   type|               title|            director|                cast| country|date_added|release_year|rating| duration|           listed_in|         description|duration_int|duration_type|added_year|added_month|is_movie|              genres|main_country|
+-------+-------+--------------------+--------------------+--------------------+--------+----------+------------+------+---------+--------------------+--------------------+------------+-------------+----------+-----------+--------+--------------------+------------+
|   s289|TV Show|Las muñecas de la...|             Unknown|Amparo Grisales, ...|Colombia|2021-08-10|        2018| TV-MA| 1 Season|Crime TV Shows, I...|Based on the book...|           1|       Season|   