# Hybrid Recommender System
# Data Preparation

In [1]:
import pandas as pd

In [2]:
movie = pd.read_csv("datasets/movie.csv")
rating = pd.read_csv("datasets/rating.csv")

Read datasets from csv files.

In [3]:
df = movie.merge(rating, how="left", on="movieId")

In [4]:
df.to_parquet("datasets/dataset.parquet")

Merged movie data to rating data on movieId. Saved file to parquet for continue with Apache Spark and compression. Finaly delete csv files.

In [5]:
from pyspark.sql import SparkSession, functions as F

import findspark

findspark.init("C:\Program Files\Spark\spark-3.3.1-bin-hadoop3")

In [6]:
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("Hybrid Recommender System") \
    .config("spark.sql.adaptive.enabled", True) \
    .getOrCreate()

In [12]:
path = "file:///Users/talha/OneDrive/Masaüstü/Talha Nebi Kumru/Data Science/Miuul/Recommendation Systems/Hybrid_Recommender_System/datasets/dataset.parquet"

df = spark.read \
    .format("parquet") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load(path)

In [13]:
df.show(5)

+-------+----------------+--------------------+------+------+-------------------+-----------------+
|movieId|           title|              genres|userId|rating|          timestamp|__index_level_0__|
+-------+----------------+--------------------+------+------+-------------------+-----------------+
|      1|Toy Story (1995)|Adventure|Animati...|   3.0|   4.0|1999-12-11 13:36:47|                0|
|      1|Toy Story (1995)|Adventure|Animati...|   6.0|   5.0|1997-03-13 17:50:52|                1|
|      1|Toy Story (1995)|Adventure|Animati...|   8.0|   4.0|1996-06-05 13:37:51|                2|
|      1|Toy Story (1995)|Adventure|Animati...|  10.0|   4.0|1999-11-25 02:44:47|                3|
|      1|Toy Story (1995)|Adventure|Animati...|  11.0|   4.5|2009-01-02 01:13:41|                4|
+-------+----------------+--------------------+------+------+-------------------+-----------------+
only showing top 5 rows



In [14]:
df.printSchema()

root
 |-- movieId: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- userId: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [15]:
df = df.drop("__index_level_0__")

In [20]:
df = df.withColumn("timestamp", F.to_timestamp(F.col("timestamp")))

In [25]:
df.groupby("title").count().show(5)

+--------------------+-----+
|               title|count|
+--------------------+-----+
|    Fair Game (1995)| 1295|
| If Lucy Fell (1996)| 1136|
| Three Wishes (1995)|  365|
|Heavenly Creature...| 7681|
|Paris, France (1993)|   47|
+--------------------+-----+
only showing top 5 rows



In [39]:
df.createOrReplaceTempView("Movies")

In [71]:
rare_movies = spark.sql("""
          SELECT title as rare FROM Movies
          GROUP BY title
          HAVING COUNT(title) >= 1000
""")

In [72]:
rare_movies.show(5)

+--------------------+
|                rare|
+--------------------+
|    Fair Game (1995)|
| If Lucy Fell (1996)|
|Heavenly Creature...|
|Snow White and th...|
|Night of the Livi...|
+--------------------+
only showing top 5 rows



In [73]:
rare_movies.createOrReplaceTempView("Rare_Movies")

In [74]:
common_movies = spark.sql("""
    SELECT * FROM Movies
    INNER JOIN Rare_Movies ON
    Movies.title = Rare_Movies.rare
    ORDER BY RAND()
""")

In [75]:
common_movies.show(5)

+-------+--------------------+--------------------+--------+------+-------------------+--------------------+
|movieId|               title|              genres|  userId|rating|          timestamp|                rare|
+-------+--------------------+--------------------+--------+------+-------------------+--------------------+
|   3252|Scent of a Woman ...|               Drama| 24884.0|   4.0|2009-12-27 20:01:59|Scent of a Woman ...|
|   2324|Life Is Beautiful...|Comedy|Drama|Roma...| 42789.0|   4.0|2000-11-20 08:18:32|Life Is Beautiful...|
|  82459|    True Grit (2010)|             Western| 22695.0|   5.0|2012-01-14 07:18:15|    True Grit (2010)|
|   3033|   Spaceballs (1987)|       Comedy|Sci-Fi|131275.0|   3.5|2009-05-20 07:20:20|   Spaceballs (1987)|
|   1690|Alien: Resurrecti...|Action|Horror|Sci-Fi| 63080.0|   1.5|2007-12-13 02:40:00|Alien: Resurrecti...|
+-------+--------------------+--------------------+--------+------+-------------------+--------------------+
only showing top 5 

In [77]:
common_movies = common_movies.drop("rare")

In [80]:
common_movies.select("userId", "title", "rating").groupby("userId").pivot("title").limit(5).toPandas()

AttributeError: 'GroupedData' object has no attribute 'limit'