In [1]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session against the local master; getOrCreate()
# hands back an existing session if one is already running in this kernel.
spark = (
    SparkSession.builder
    .appName('test')
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/26 12:34:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

In [4]:
# Movies table: one (movie_id, title) row per film, both columns nullable.
movies_schema = (
    StructType()
    .add("movie_id", IntegerType(), True)
    .add("title", StringType(), True)
)
movies_data = [(1, "Avengers"), (2, "Frozen 2"), (3, "Joker")]
movies_df = spark.createDataFrame(data=movies_data, schema=movies_schema)

# Show the rows, then the (column, type) pairs to confirm the schema was applied.
movies_df.show()
print(movies_df.dtypes)

                                                                                

+--------+--------+
|movie_id|   title|
+--------+--------+
|       1|Avengers|
|       2|Frozen 2|
|       3|   Joker|
+--------+--------+

[('movie_id', 'int'), ('title', 'string')]


In [5]:
# Users table: one (user_id, name) row per reviewer, both columns nullable.
users_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("name", StringType(), True)
)
users_data = [(1, "Daniel"), (2, "Monica"), (3, "Maria"), (4, "James")]
users_df = spark.createDataFrame(data=users_data, schema=users_schema)

# Show the rows, then the (column, type) pairs to confirm the schema was applied.
users_df.show()
print(users_df.dtypes)

+-------+------+
|user_id|  name|
+-------+------+
|      1|Daniel|
|      2|Monica|
|      3| Maria|
|      4| James|
+-------+------+

[('user_id', 'int'), ('name', 'string')]


In [8]:
# Ratings table: (movie_id, user_id, rating, created_at).
# created_at is kept as an ISO "YYYY-MM-DD" string rather than a date column.
movie_rating_schema = (
    StructType()
    .add("movie_id", IntegerType(), True)
    .add("user_id", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("created_at", StringType(), True)
)

movie_rating_data = [
    # movie 1
    (1, 1, 3, "2020-01-12"), (1, 2, 4, "2020-02-11"),
    (1, 3, 2, "2020-02-12"), (1, 4, 1, "2020-01-01"),
    # movie 2
    (2, 1, 5, "2020-02-17"), (2, 2, 2, "2020-02-01"), (2, 3, 2, "2020-03-01"),
    # movie 3
    (3, 1, 3, "2020-02-22"), (3, 2, 4, "2020-02-25"),
]

movie_rating_df = spark.createDataFrame(
    data=movie_rating_data,
    schema=movie_rating_schema,
)

# Show the rows, then the (column, type) pairs to confirm the schema was applied.
movie_rating_df.show()
print(movie_rating_df.dtypes)

+--------+-------+------+----------+
|movie_id|user_id|rating|created_at|
+--------+-------+------+----------+
|       1|      1|     3|2020-01-12|
|       1|      2|     4|2020-02-11|
|       1|      3|     2|2020-02-12|
|       1|      4|     1|2020-01-01|
|       2|      1|     5|2020-02-17|
|       2|      2|     2|2020-02-01|
|       2|      3|     2|2020-03-01|
|       3|      1|     3|2020-02-22|
|       3|      2|     4|2020-02-25|
+--------+-------+------+----------+

[('movie_id', 'int'), ('user_id', 'int'), ('rating', 'int'), ('created_at', 'string')]


In [9]:
# Expose each DataFrame to spark.sql() under the table name the queries use.
for view_name, frame in (
    ("Movies", movies_df),
    ("Users", users_df),
    ("MovieRating", movie_rating_df),
):
    frame.createOrReplaceTempView(view_name)

In [19]:
# Find the user who rated the most movies.
# The inner query counts ratings per (user_id, name) and, via an unpartitioned
# window (OVER ()), attaches the largest of those counts to every group.
# The outer query keeps only the groups whose count equals that maximum and
# returns min(name), so ties resolve to the smallest name in sort order.
# The empty OVER () is what triggers Spark's "No Partition Defined" warnings.
result = spark.sql(
    """
    SELECT min(name)
    FROM (SELECT name, count(MovieRating.movie_id) as count, max(count(MovieRating.movie_id)) OVER () as max_count
        FROM MovieRating LEFT JOIN Users 
            ON MovieRating.user_id = Users.user_id
        GROUP BY MovieRating.user_id, name) movie_count
    WHERE count = max_count
    """
)
result.show()

23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 1

+---------+
|min(name)|
+---------+
|   Daniel|
+---------+



23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 12:53:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [31]:
# Find the movie with the highest average rating in February 2020.
# The inner query averages ratings per (movie_id, title) and, via an
# unpartitioned window (OVER ()), attaches the best average to every group.
# The outer query keeps the groups that reach that best average and returns
# min(title), so ties resolve to the smallest title in sort order.
#
# Date filter: DATEDIFF(end, start) is the number of days from start to end, so
# BETWEEN 0 AND 28 keeps 2020-02-01 through 2020-02-29 (2020 is a leap year).
# A month-granularity test such as DATEDIFF(MONTH, '2020-02-01', created_at) < 1
# must not be used here: it has no lower bound, so every rating made before
# February would also be averaged in.
result = spark.sql(
    """
    SELECT min(title) as title
    FROM (SELECT title, avg(rating) as rate, max(avg(rating)) over () as max_rate
            FROM MovieRating LEFT JOIN Movies
                ON MovieRating.movie_id = Movies.movie_id
            WHERE DATEDIFF(MovieRating.created_at, '2020-02-01') BETWEEN 0 AND 28
            GROUP BY MovieRating.movie_id, title
            ) as movie_rating
    WHERE rate = max_rate
    """
)
result.show()

23/11/26 13:01:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 1

+--------+
|   title|
+--------+
|Frozen 2|
+--------+



23/11/26 13:01:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 13:01:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [48]:
# Combined answer in a single column named `results`, built from two
# subqueries stacked with UNION ALL:
#   1. the user with the most ratings: groups are ordered by rating count
#      descending, then name ascending, and LIMIT 1 keeps the top one;
#   2. the movie with the highest average rating among ratings dated 0..28 days
#      after 2020-02-01: groups are ordered by avg(rating) descending, then
#      title ascending, and LIMIT 1 keeps the top one.
# This variant uses ORDER BY ... LIMIT 1 instead of a window function.
result = spark.sql(
    """
SELECT name AS results FROM (
    SELECT name
    FROM MovieRating LEFT JOIN Users 
            ON MovieRating.user_id = Users.user_id    
    GROUP BY MovieRating.user_id, name
    ORDER BY count(MovieRating.movie_id) DESC, name
    LIMIT 1
) AS rating_counts
UNION ALL
SELECT title FROM(
    SELECT title
    FROM MovieRating LEFT JOIN Movies ON MovieRating.movie_id = Movies.movie_id
    WHERE DATEDIFF(created_at, '2020-02-01') BETWEEN 0 and 28
    GROUP BY MovieRating.movie_id, title
    ORDER BY avg(rating) DESC, title
    LIMIT 1
) AS max_rate

    """
)
result.show()

                                                                                

+--------+
| results|
+--------+
|  Daniel|
|Frozen 2|
+--------+



In [45]:
# Combined answer in a single column named `results`, this time using window
# functions. Two aggregates are stacked with UNION ALL:
#   1. min(name) over the users whose rating count equals the maximum count
#      (the maximum is computed with max(count(...)) OVER ());
#   2. min(title) over the movies whose average rating equals the maximum
#      average, considering only ratings dated 0..28 days after 2020-02-01.
# min() resolves ties to the smallest name / title in sort order.
# The empty OVER () clauses are what trigger Spark's "No Partition Defined"
# warnings.
result = spark.sql(
    """
SELECT min(name) AS results
FROM (SELECT name, count(MovieRating.movie_id) as count, max(count(MovieRating.movie_id)) OVER () as max_count
    FROM MovieRating LEFT JOIN Users 
        ON MovieRating.user_id = Users.user_id
    GROUP BY MovieRating.user_id, name) movie_count
WHERE count = max_count
UNION ALL
SELECT min(title) as title
FROM (SELECT title, avg(rating) as rate, max(avg(rating)) over () as max_rate
        FROM MovieRating LEFT JOIN Movies
            ON MovieRating.movie_id = Movies.movie_id
        WHERE DATEDIFF(MovieRating.created_at, '2020-02-01') BETWEEN 0 AND 28
        GROUP BY MovieRating.movie_id, title
        ) as movie_rating
WHERE rate = max_rate
    """
)
result.show()

23/11/26 14:13:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 1

+--------+
| results|
+--------+
|  Daniel|
|Frozen 2|
+--------+



23/11/26 14:13:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/26 14:13:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
