In [1]:
from pyspark.sql import SparkSession
# Reuse the already-active SparkSession if there is one; otherwise build a new
# session with whatever defaults the environment provides.
spark = SparkSession.builder.getOrCreate()

# S3/MinIO config

In [10]:
import os

# S3A connector settings that point Spark at the MinIO endpoint.
#
# Keys written straight into the Hadoop Configuration must use their plain
# Hadoop names. The "spark.hadoop." prefix is only stripped when an option is
# passed through SparkConf (builder.config / spark-defaults.conf); set here
# with the prefix, S3A would never see the option.
#
# Credentials are taken from the environment when available so real secrets
# never have to be committed with the notebook; the fallbacks are the local
# demo values this notebook was originally written against.
s3a_settings = {
    "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "minio"),
    "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "minio123"),
    "fs.s3a.endpoint": "http://minio1:9000",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Address buckets as http://host/bucket rather than http://bucket.host.
    "fs.s3a.path.style.access": "true",
    # Multipart upload part size in bytes (100 MiB).
    "fs.s3a.multipart.size": "104857600",
}

# _jsc is a private PySpark handle, but it is the only way to change the
# Hadoop configuration of a session that has already been created.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for key, value in s3a_settings.items():
    hadoop_conf.set(key, value)

# Read from MinIO

In [12]:
# Load the MovieLens ratings CSV from MinIO. The first row is treated as the
# header and column types are inferred from the data.
ratings = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://bucket1/movielens/ratings.csv")
)

In [13]:
# Preview the ratings DataFrame (show() prints the first 20 rows by default).
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
|     2|   1235|   3.0|1192913585|
+------+-------+------+----------+
only showing top 20 rows



In [17]:
# Expose the DataFrame to Spark SQL under the name "ratings".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; it is also safe to re-run.
ratings.createOrReplaceTempView("ratings")

In [19]:
# Load the MovieLens movies CSV from MinIO. The first row is treated as the
# header and column types are inferred from the data.
movies = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://bucket1/movielens/movies.csv")
)

In [20]:
# Preview the movies DataFrame (long string values are truncated in the output).
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [21]:
# Expose the DataFrame to Spark SQL under the name "movies".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; it is also safe to re-run.
movies.createOrReplaceTempView("movies")

# Save to MinIO

In [29]:
# Top 100 movies by mean rating, restricted to movies with more than 100 ratings.
#
# - Grouping includes movieId so that two different movies that happen to
#   share a title are not averaged together.
# - An inner join with COUNT(r.rating) counts actual ratings only; a movie
#   without ratings can never pass the threshold, so the outer join added
#   nothing but extra rows.
# - The secondary sort key makes the LIMIT cut-off deterministic when several
#   movies have the same average.
# The result keeps the same two columns as before: title, avg_rating.
top_100_movies = spark.sql("""
    SELECT m.title, AVG(r.rating) AS avg_rating
    FROM movies m
    JOIN ratings r ON m.movieId = r.movieId
    GROUP BY m.movieId, m.title
    HAVING COUNT(r.rating) > 100
    ORDER BY avg_rating DESC, title
    LIMIT 100
""")

In [30]:
# Preview the first 20 rows of the ranking (this triggers the actual query execution).
top_100_movies.show()

+--------------------+------------------+
|               title|        avg_rating|
+--------------------+------------------+
|Planet Earth II (...|4.4865181711606095|
| Planet Earth (2006)| 4.458092485549133|
|Shawshank Redempt...| 4.424188001918387|
|Band of Brothers ...| 4.399898373983739|
|Black Mirror: Whi...| 4.350558659217877|
|              Cosmos| 4.343949044585988|
|The Godfather Tri...| 4.339667458432304|
|Godfather, The (1...| 4.332892749244713|
|Usual Suspects, T...| 4.291958829205532|
|        Black Mirror| 4.263888888888889|
|Godfather: Part I...|4.2630353697749195|
|Last Year's Snow ...| 4.261904761904762|
|Schindler's List ...| 4.257501817775044|
|Seven Samurai (Sh...|4.2541157909178215|
|Over the Garden W...| 4.244031830238727|
|Sherlock - A Stud...|  4.23943661971831|
| 12 Angry Men (1957)| 4.237075455914338|
|Blue Planet II (2...| 4.236389684813753|
|  Rear Window (1954)| 4.230798598634567|
|   Fight Club (1999)| 4.230663235786717|
+--------------------+------------

In [36]:
# Persist the ranking to MinIO as Parquet.
# The default save mode raises an error when the target path already exists,
# which breaks any re-run of the notebook; overwrite makes the cell idempotent.
top_100_movies.write.mode("overwrite").parquet("s3a://bucket1/movielens/results/top_100_movies")

# Read Parquet

In [38]:
# Read the Parquet result back from MinIO to confirm the write succeeded.
# NOTE(review): row order of a Parquet read is not guaranteed in general; add an
# explicit orderBy if the ranking order matters here.
spark.read.parquet("s3a://bucket1/movielens/results/top_100_movies").show()

+--------------------+------------------+
|               title|        avg_rating|
+--------------------+------------------+
|Planet Earth II (...|4.4865181711606095|
| Planet Earth (2006)| 4.458092485549133|
|Shawshank Redempt...| 4.424188001918387|
|Band of Brothers ...| 4.399898373983739|
|Black Mirror: Whi...| 4.350558659217877|
|              Cosmos| 4.343949044585988|
|The Godfather Tri...| 4.339667458432304|
|Godfather, The (1...| 4.332892749244713|
|Usual Suspects, T...| 4.291958829205532|
|        Black Mirror| 4.263888888888889|
|Godfather: Part I...|4.2630353697749195|
|Last Year's Snow ...| 4.261904761904762|
|Schindler's List ...| 4.257501817775044|
|Seven Samurai (Sh...|4.2541157909178215|
|Over the Garden W...| 4.244031830238727|
|Sherlock - A Stud...|  4.23943661971831|
| 12 Angry Men (1957)| 4.237075455914338|
|Blue Planet II (2...| 4.236389684813753|
|  Rear Window (1954)| 4.230798598634567|
|   Fight Club (1999)| 4.230663235786717|
+--------------------+------------

# Kill test

In [42]:
# Full scan of the ratings CSV; presumably the baseline count before anything is killed.
ratings.count()

27753444

## One killed

In [43]:
# Same full scan repeated with one node/process down (presumably a MinIO node - TODO confirm).
ratings.count()

27753444

## Two killed

In [44]:
# Same full scan repeated with two nodes/processes down (presumably MinIO nodes - TODO confirm).
ratings.count()

27753444

## Three killed

In [None]:
# Same full scan with three nodes/processes down (presumably MinIO nodes - TODO confirm).
# No output is recorded for this cell in the saved notebook.
ratings.count()