In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

In [2]:
# Configure the application name first, then reuse an existing session or start a new one.
session_builder = SparkSession.builder.appName('movies')
spark = session_builder.getOrCreate()

In [3]:
# Read the raw ratings file as an RDD with one text line per element.
ratings_path = '/home/ubuntu/spark_project/01-basics_andexampl/ml-100k/u.data'
lines = spark.sparkContext.textFile(ratings_path)

In [4]:
def parse_rating_line(line):
    """Return a Row whose movieID is the second whitespace-separated field of the line."""
    fields = line.split()
    return Row(movieID=int(fields[1]))


# Turn every raw text line into a Row object.
movies = lines.map(parse_rating_line)

In [5]:
# Build a DataFrame from the RDD of Row objects.
movieDataset = spark.createDataFrame(data=movies)

In [6]:
# Count the rows per movie, then rank the movies from most to least rated.
ratings_per_movie = movieDataset.groupBy("movieID").count()
# The ranked result is cached because later cells read it more than once.
topMovieIDs = ratings_per_movie.orderBy("count", ascending=False).cache()

In [7]:
# Preview the first 20 rows of the ranking (the defaults of show(), spelled out).
topMovieIDs.show(n=20, truncate=True)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
|    174|  420|
|    127|  413|
|     56|  394|
|      7|  392|
|     98|  390|
|    237|  384|
|    117|  378|
|    172|  367|
|    222|  365|
|    313|  350|
+-------+-----+
only showing top 20 rows



In [39]:
# Bring the ten highest-ranked rows back to the driver as a list of Row objects.
top10 = topMovieIDs.take(num=10)

In [9]:
# Read the raw movie metadata file as an RDD with one text line per element.
# NOTE: this rebinds `movies`, which earlier held the RDD of rating rows.
items_path = '/home/ubuntu/spark_project/01-basics_andexampl/ml-100k/u.item'
movies = spark.sparkContext.textFile(items_path)

In [10]:
def parse_item_line(line):
    """Return a Row with the movie ID (first '|' field) and movie name (second '|' field)."""
    fields = line.split('|')
    return Row(movie_id=int(fields[0]), movie_name=str(fields[1]))


# Convert the raw lines to an RDD of Row objects.
movie_id_names = movies.map(parse_item_line)

In [13]:
# Build a DataFrame from the RDD of (movie_id, movie_name) rows.
movie_id_datafram = spark.createDataFrame(data=movie_id_names)

In [14]:
# Print the inferred column names and types to check the conversion.
movie_id_datafram.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)



In [17]:
# Show the first two rows without truncating long movie names.
movie_id_datafram.show(n=2, truncate=False)

+--------+----------------+
|movie_id|movie_name      |
+--------+----------------+
|1       |Toy Story (1995)|
|2       |GoldenEye (1995)|
+--------+----------------+
only showing top 2 rows



In [21]:
# Build a {movie_id: movie_name} lookup from the collected rows (presumably the
# intended result; confirm if a different shape was wanted).
# Passing a sequence of row dicts to dict() does not work here: each two-key
# dict is consumed as a (key, value) pair, so everything collapses to
# {'movie_id': 'movie_name'}. The mapping is therefore built explicitly.
final_data = {row['movie_id']: row['movie_name'] for row in movie_id_datafram.collect()}

In [35]:
# Pull the DataFrame to the driver as a pandas frame, then index it by movie ID.
# Transposing before to_dict() yields {movie_id: {'movie_name': <name>}}.
movie_names_pdf = movie_id_datafram.toPandas()
movie_name = movie_names_pdf.set_index('movie_id').T.to_dict()


In [37]:
# Flatten {movie_id: {'movie_name': name}} into {movie_id: name}, in place.
# Entries that are already plain names are skipped, so re-running this cell
# no longer fails with a TypeError from indexing a string.
for key, value in movie_name.items():
    if isinstance(value, dict):
        movie_name[key] = value['movie_name']

In [40]:
# Print each of the ten most-rated movies as "<name>: <number of ratings>".
for movie_id, rating_count in top10:
    print("%s: %d" % (movie_name[movie_id], rating_count))

Star Wars (1977): 583
Contact (1997): 509
Fargo (1996): 508
Return of the Jedi (1983): 507
Liar Liar (1997): 485
English Patient, The (1996): 481
Scream (1996): 478
Toy Story (1995): 452
Air Force One (1997): 431
Independence Day (ID4) (1996): 429


In [41]:
# Stop the Spark session now that the analysis is finished.
spark.stop()