In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 76.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=025398714f24f425700acb406c4ca206f1bb264a1a2a35559e2c75dc2b8b4461
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
import codecs

In [12]:
def loadMovieNames(itemPath="/content/drive/MyDrive/SparkCourse/ml-100k/u.item"):
    """Build a movie-ID -> title dictionary from a pipe-delimited item file.

    Every non-blank line is split on '|'; the first field is parsed as the
    integer movie ID and the second field is stored as its title. Any further
    fields are ignored.

    Args:
        itemPath: Path to the u.item file. Defaults to the Google Drive
            location this notebook was written against; pass your own path
            instead of editing the function.

    Returns:
        dict mapping int movie ID to title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not start with an integer ID.
        IndexError: if a non-blank line has no '|' separated title field.
    """
    movieNames = {}
    # Built-in open() only breaks lines on \n, \r and \r\n. codecs.open()
    # iterates with str.splitlines() semantics, so ISO-8859-1 bytes such as
    # 0x85 or 0x1C inside a title would be treated as line ends and split a
    # record in two.
    with open(itemPath, "r", encoding="ISO-8859-1", errors="ignore") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [13]:
# Start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("PopularMovies")
    .getOrCreate()
)

# Load the ID -> title dictionary on the driver, then broadcast it so the
# lookup is available wherever the UDF runs
movieNamesById = loadMovieNames()
nameDict = spark.sparkContext.broadcast(movieNamesById)

In [14]:
# Create schema when reading u.data
# Explicit schema for u.data: column names and types in file order, all nullable
ratingColumns = [
    ("userID", IntegerType()),
    ("movieID", IntegerType()),
    ("rating", IntegerType()),
    ("timestamp", LongType()),
]
schema = StructType(
    [StructField(columnName, columnType, True) for columnName, columnType in ratingColumns]
)

In [15]:
# Load up movie data as dataframe
# Read the tab-separated ratings file into a DataFrame using the explicit schema
ratingsReader = spark.read.option("sep", "\t").schema(schema)
moviesDF = ratingsReader.csv("/content/drive/MyDrive/SparkCourse/ml-100k/u.data")

# One row per movieID with the number of rating rows it has
movieCounts = moviesDF.groupBy("movieID").count()

In [16]:
# Create a user-defined function to look up movie names from our broadcasted dictionary
def lookupName(movieID):
    """Return the title for ``movieID`` from the broadcast ``nameDict``.

    Args:
        movieID: Key to look up in the broadcast dictionary.

    Returns:
        The stored title, or None when ``movieID`` has no entry (including a
        None ID). Returning None instead of raising KeyError keeps a single
        unmatched ID from failing the whole Spark job when this runs as a UDF.
    """
    return nameDict.value.get(movieID)

In [17]:
# Wrap the plain-Python lookup so it can be applied to a DataFrame column
lookupNameUDF = func.udf(lookupName)

# Derive the title from each row's movieID and attach it as movieTitle
movieTitleCol = lookupNameUDF(func.col("movieID"))
moviesWithNames = movieCounts.withColumn("movieTitle", movieTitleCol)

In [18]:
# Sort the results
# Sort by rating count, most-rated movies first
sortedMoviesWithNames = moviesWithNames.orderBy(func.desc("count"))

# Show the top 15 rows without truncating long titles
sortedMoviesWithNames.show(15, truncate=False)

# Stop the session
spark.stop()

+-------+-----+--------------------------------+
|movieID|count|movieTitle                      |
+-------+-----+--------------------------------+
|50     |583  |Star Wars (1977)                |
|258    |509  |Contact (1997)                  |
|100    |508  |Fargo (1996)                    |
|181    |507  |Return of the Jedi (1983)       |
|294    |485  |Liar Liar (1997)                |
|286    |481  |English Patient, The (1996)     |
|288    |478  |Scream (1996)                   |
|1      |452  |Toy Story (1995)                |
|300    |431  |Air Force One (1997)            |
|121    |429  |Independence Day (ID4) (1996)   |
|174    |420  |Raiders of the Lost Ark (1981)  |
|127    |413  |Godfather, The (1972)           |
|56     |394  |Pulp Fiction (1994)             |
|7      |392  |Twelve Monkeys (1995)           |
|98     |390  |Silence of the Lambs, The (1991)|
+-------+-----+--------------------------------+
only showing top 15 rows

