In [1]:
import findspark
findspark.init()

In [2]:

"""
Since Spark 2.x, Spark unified Spark APIs, DF, Datasets, & SQL.
SparkSession uses SparkContext internally.
"""

from pyspark.sql import SparkSession
ss = SparkSession.builder.master("local").appName("sparkDataFrame").getOrCreate()

ss

In [3]:
!dir .\ml-latest-small

 Volume in drive C has no label.
 Volume Serial Number is A88C-3222

 Directory of C:\Users\Administrator\veena\bigdata\ml-latest-small

09/27/2018  02:20 AM    <DIR>          .
09/27/2018  02:20 AM    <DIR>          ..
09/27/2018  02:20 AM           197,979 links.csv
09/27/2018  02:19 AM           494,431 movies.csv
09/27/2018  02:19 AM         2,483,723 ratings.csv
09/27/2018  02:20 AM             8,342 README.txt
09/27/2018  02:19 AM           118,660 tags.csv
               5 File(s)      3,303,135 bytes
               2 Dir(s)  449,109,102,592 bytes free


---

In [24]:

"""
Read data from CSV
"""

from pyspark.sql.types import StructType, IntegerType, DoubleType, StringType, LongType

schema_movies = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)
schema_ratings = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", LongType(), True)
)

df_movies_full = (
    ss.read
    .format("csv")
    .option("header", True)
    .schema(schema_movies)
    .load("./ml-latest-small/movies.csv")
)

df_ratings_full = (
    ss.read
    .format("csv")
    .option("header", True)
    .schema(schema_ratings)
    .load("./ml-latest-small/ratings.csv")
)

In [25]:
print("-- Movies --")

df_movies_full.printSchema(), df_movies_full.show(4)

-- Movies --
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
+-------+--------------------+--------------------+
only showing top 4 rows



(None, None)

In [26]:
print("-- Ratings --")

df_ratings_full.printSchema(), df_ratings_full.show(4)

-- Ratings --
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
+------+-------+------+---------+
only showing top 4 rows



(None, None)

In [31]:

"""
Alternative: df_ratings.select("movieId", "rating")
"""

df_ratings = df_ratings_full[["movieId", "rating"]]
df_ratings.show(2)

+-------+------+
|movieId|rating|
+-------+------+
|      1|   4.0|
|      3|   4.0|
+-------+------+
only showing top 2 rows



In [91]:
df_movies_full.count(), df_ratings_full.count()

(9742, 100836)

In [84]:
df_ratings.groupBy("movieId").mean().first()

Row(movieId=1580, avg(movieId)=1580.0, avg(rating)=3.487878787878788)

In [46]:
"""
Sorted averaged ratings
"""

df_ratings_avg = \
df_ratings.groupBy("movieId").mean().sort("avg(rating)", ascending=False)[["movieId", "avg(rating)"]]

df_ratings_avg.show(2)

+-------+-----------+
|movieId|avg(rating)|
+-------+-----------+
|    496|        5.0|
|    148|        5.0|
+-------+-----------+
only showing top 2 rows



In [49]:
help(df_ratings_avg.join)

Help on method join in module pyspark.sql.dataframe:

join(other, on=None, how=None) method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    :param other: Right side of the join
    :param on: a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    :param how: str, default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``full_outer``, ``left``, ``left_outer``, ``right``, ``right_outer``,
        ``left_semi``, and ``left_anti``.
    
    The following performs a full outer join between ``df1`` and ``df2``.
    
    >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect()
    [Row(name=None, height=80), Row(name='Bob'

In [100]:
"""
Fetch top 10 first, then join with movies
"""

top = 10

df_ratings_top = df_ratings_avg.limit(top)

df_top10 = df_ratings_top.join(df_movies_full, on="movieId", how="inner")
df_top10 = df_top10.withColumnRenamed("avg(rating)", "rating_average")
df_top10.show()

+-------+--------------+--------------------+--------------------+
|movieId|rating_average|               title|              genres|
+-------+--------------+--------------------+--------------------+
|     53|           5.0|     Lamerica (1994)|     Adventure|Drama|
|    496|           5.0|What Happened Was...|Comedy|Drama|Roma...|
|   5513|           5.0|Martin Lawrence L...|  Comedy|Documentary|
|  84273|           5.0|Zeitgeist: Moving...|         Documentary|
| 113829|           5.0|One I Love, The (...|Comedy|Drama|Romance|
| 142444|           5.0|   The Editor (2015)|Comedy|Horror|Mys...|
| 147300|           5.0|Adventures Of She...|       Crime|Mystery|
| 150554|           5.0| The Love Bug (1997)|Adventure|Childre...|
| 152711|           5.0|Who Killed Chea V...|         Documentary|
| 173963|           5.0|      Empties (2007)|              Comedy|
+-------+--------------+--------------------+--------------------+



In [101]:
df_top10.toPandas()

Unnamed: 0,movieId,rating_average,title,genres
0,53,5.0,Lamerica (1994),Adventure|Drama
1,496,5.0,What Happened Was... (1994),Comedy|Drama|Romance|Thriller
2,5513,5.0,Martin Lawrence Live: Runteldat (2002),Comedy|Documentary
3,84273,5.0,Zeitgeist: Moving Forward (2011),Documentary
4,113829,5.0,"One I Love, The (2014)",Comedy|Drama|Romance
5,142444,5.0,The Editor (2015),Comedy|Horror|Mystery
6,147300,5.0,Adventures Of Sherlock Holmes And Dr. Watson: ...,Crime|Mystery
7,150554,5.0,The Love Bug (1997),Adventure|Children|Comedy|Fantasy
8,152711,5.0,Who Killed Chea Vichea? (2010),Documentary
9,173963,5.0,Empties (2007),Comedy


---

In [99]:
"""
A way to create a new column and add some computation..
"""

df_top10.withColumn("rating_plus_10", df_top10["avg(rating)"] + 10).show()

+-------+-----------+--------------------+--------------------+--------------+
|movieId|avg(rating)|               title|              genres|rating_plus_10|
+-------+-----------+--------------------+--------------------+--------------+
|     53|        5.0|     Lamerica (1994)|     Adventure|Drama|          15.0|
|    496|        5.0|What Happened Was...|Comedy|Drama|Roma...|          15.0|
|   5513|        5.0|Martin Lawrence L...|  Comedy|Documentary|          15.0|
|  84273|        5.0|Zeitgeist: Moving...|         Documentary|          15.0|
| 113829|        5.0|One I Love, The (...|Comedy|Drama|Romance|          15.0|
| 142444|        5.0|   The Editor (2015)|Comedy|Horror|Mys...|          15.0|
| 147300|        5.0|Adventures Of She...|       Crime|Mystery|          15.0|
| 150554|        5.0| The Love Bug (1997)|Adventure|Childre...|          15.0|
| 152711|        5.0|Who Killed Chea V...|         Documentary|          15.0|
| 173963|        5.0|      Empties (2007)|          

In [93]:
"""
Rename column
"""

df_top10.withColumnRenamed("avg(rating)", "rating_average").show()

+-------+--------------+--------------------+--------------------+
|movieId|rating_average|               title|              genres|
+-------+--------------+--------------------+--------------------+
|     53|           5.0|     Lamerica (1994)|     Adventure|Drama|
|    496|           5.0|What Happened Was...|Comedy|Drama|Roma...|
|   5513|           5.0|Martin Lawrence L...|  Comedy|Documentary|
|  84273|           5.0|Zeitgeist: Moving...|         Documentary|
| 113829|           5.0|One I Love, The (...|Comedy|Drama|Romance|
| 142444|           5.0|   The Editor (2015)|Comedy|Horror|Mys...|
| 147300|           5.0|Adventures Of She...|       Crime|Mystery|
| 150554|           5.0| The Love Bug (1997)|Adventure|Childre...|
| 152711|           5.0|Who Killed Chea V...|         Documentary|
| 173963|           5.0|      Empties (2007)|              Comedy|
+-------+--------------+--------------------+--------------------+



In [110]:
"""
Select all(*) columns
"""
df_ratings_full.select("*").show(2)


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [111]:
"""
Aliasing a column in .select()
"""

df_ratings_full.select((df_ratings_full.rating + 10).alias("rating_plus_10")).show(10)

+--------------+
|rating_plus_10|
+--------------+
|          14.0|
|          14.0|
|          14.0|
|          15.0|
|          15.0|
|          13.0|
|          15.0|
|          14.0|
|          15.0|
|          15.0|
+--------------+
only showing top 10 rows



In [None]:
df_ratings_full.select(df_ratings_full.rating < 4.0).show(10)

In [113]:
"""
Filtering with..
.filter(), .where()

.where(): An alias for .filter()
"""

df_ratings_full.filter(df_ratings_full.rating > 4).show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 2 rows



In [114]:
"""
Filtering with..
.filter(), .where()

.where(): An alias for .filter()
"""

df_ratings_full.where(df_ratings_full.rating > 4).show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 2 rows



In [122]:
"""
Filtering with..
.filter(), .where()

.where(): An alias for .filter()
"""

df_ratings_full.filter((df_ratings_full.rating > 3) & (df_ratings_full.rating < 4)).show(2)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     2|   8798|   3.5|1445714960|
|     2|  91529|   3.5|1445714891|
+------+-------+------+----------+
only showing top 2 rows



In [123]:
"""
Filtering with..
Column construct

Why? 
"""

from pyspark.sql.functions import col

df_ratings_full.filter((col("rating") > 3) & (col("rating") < 4)).show(2)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     2|   8798|   3.5|1445714960|
|     2|  91529|   3.5|1445714891|
+------+-------+------+----------+
only showing top 2 rows



In [126]:
"""
SQL Functions
Sorting
"""

from pyspark.sql.functions import col, asc, desc

df_ratings_full.sort("rating").show(10)  # Default: Ascending
df_ratings_full.sort(col("rating")).show(10)  # Default: Ascending

df_ratings_full.sort(desc("rating")).show(10)  # Default: Ascending


+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     3|    720|   0.5|1306463595|
|     3|   2424|   0.5|1306464293|
|     3|    914|   0.5|1306463567|
|     3|   1263|   0.5|1306463569|
|     3|   2090|   0.5|1306464261|
|     3|   1272|   0.5|1306463624|
|     3|    647|   0.5|1306463619|
|     3|   1302|   0.5|1306464189|
|     3|     31|   0.5|1306463578|
|     3|   2018|   0.5|1306464175|
+------+-------+------+----------+
only showing top 10 rows

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     3|    720|   0.5|1306463595|
|     3|   2424|   0.5|1306464293|
|     3|    914|   0.5|1306463567|
|     3|   1263|   0.5|1306463569|
|     3|   2090|   0.5|1306464261|
|     3|   1272|   0.5|1306463624|
|     3|    647|   0.5|1306463619|
|     3|   1302|   0.5|1306464189|
|     3|     31|   0.5|1306463578|
|     3|   2018|   0.5|1306464175|
+------+-------+------+------

In [136]:
"""
SQL Functions
Avg
"""

from pyspark.sql.functions import col, avg, count

df_ratings_full.groupby("movieId").agg(count("userId")).show(2)

(
    df_ratings_full
    .groupby("movieId")
    .agg(count("userId")).withColumnRenamed("count(userId)", "total_ratings")
    .show(2)
)

(
    df_ratings_full
    .groupby("movieId")
    .agg(avg("rating")).withColumnRenamed("avg(rating)", "rating_average")
    .show(2)
)

+-------+-------------+
|movieId|count(userId)|
+-------+-------------+
|   1580|          165|
|   2366|           25|
+-------+-------------+
only showing top 2 rows

+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|   1580|          165|
|   2366|           25|
+-------+-------------+
only showing top 2 rows

+-------+-----------------+
|movieId|   rating_average|
+-------+-----------------+
|   1580|3.487878787878788|
|   2366|             3.64|
+-------+-----------------+
only showing top 2 rows



In [145]:
(
    df_ratings_full[["movieId", "rating"]]
    .groupby("movieId")
    .agg(count("rating")).withColumnRenamed("count(rating)", "rating_count")
).show(2)

+-------+------------+
|movieId|rating_count|
+-------+------------+
|   1580|         165|
|   2366|          25|
+-------+------------+
only showing top 2 rows



In [149]:
df_ratings_4up_top10 = \
(
    df_ratings_full[["movieId", "rating"]]
    .groupby("movieId")
#     .agg(count("rating")).withColumnRenamed("count(rating)", "rating_count")
    .agg(avg("rating")).withColumnRenamed("avg(rating)", "rating_average")
    .filter(col("rating_average") >= 4)
    .sort(desc("rating_average"))
    .limit(10)
)

df_ratings_4up_top10.show(2)

+-------+--------------+
|movieId|rating_average|
+-------+--------------+
| 173963|           5.0|
|  84273|           5.0|
+-------+--------------+
only showing top 2 rows



In [151]:
(
    df_ratings_4up_top10
    .join(
        df_movies_full,
        df_ratings_4up_top10.movieId == df_movies_full.movieId
    ).select(df_ratings_4up_top10.movieId, "rating_average", "title", "genres")  # Select only one of movieID columns
).show()

+-------+--------------+--------------------+--------------------+
|movieId|rating_average|               title|              genres|
+-------+--------------+--------------------+--------------------+
|     53|           5.0|     Lamerica (1994)|     Adventure|Drama|
|    496|           5.0|What Happened Was...|Comedy|Drama|Roma...|
|   5513|           5.0|Martin Lawrence L...|  Comedy|Documentary|
|  84273|           5.0|Zeitgeist: Moving...|         Documentary|
| 113829|           5.0|One I Love, The (...|Comedy|Drama|Romance|
| 147300|           5.0|Adventures Of She...|       Crime|Mystery|
| 150554|           5.0| The Love Bug (1997)|Adventure|Childre...|
| 152711|           5.0|Who Killed Chea V...|         Documentary|
| 157775|           5.0|Tenchi Muyô! In L...|    Animation|Comedy|
| 173963|           5.0|      Empties (2007)|              Comedy|
+-------+--------------+--------------------+--------------------+



In [154]:
df_ratings_full.columns

['userId', 'movieId', 'rating', 'timestamp']

In [158]:
"""
SQL Functions
Multiple aggregates
"""

df_top_movie_ratings = \
(
    df_ratings_full[["movieId", "userId", "rating"]]
    .groupby("movieId")
    .agg(count("userId").alias("total_ratings"),
         avg("rating").alias("ratings_average")
        )
    .filter(col("ratings_average") >= 4)
    .sort(desc("ratings_average"))
    .limit(10)
)
df_top_movie_ratings.show(2)

+-------+-------------+---------------+
|movieId|total_ratings|ratings_average|
+-------+-------------+---------------+
| 173963|            1|            5.0|
|  84273|            1|            5.0|
+-------+-------------+---------------+
only showing top 2 rows



In [166]:
df_top_movies = \
(
    df_top_movie_ratings
    .join(
        df_movies_full,
        df_top_movie_ratings.movieId == df_movies_full.movieId
    ).select(df_ratings_4up_top10.movieId, "total_ratings", "ratings_average", "title", "genres")
)
df_top_movies.show()

+-------+-------------+---------------+--------------------+--------------------+
|movieId|total_ratings|ratings_average|               title|              genres|
+-------+-------------+---------------+--------------------+--------------------+
|     53|            2|            5.0|     Lamerica (1994)|     Adventure|Drama|
|    496|            1|            5.0|What Happened Was...|Comedy|Drama|Roma...|
|   5513|            1|            5.0|Martin Lawrence L...|  Comedy|Documentary|
|  84273|            1|            5.0|Zeitgeist: Moving...|         Documentary|
| 113829|            1|            5.0|One I Love, The (...|Comedy|Drama|Romance|
| 147300|            1|            5.0|Adventures Of She...|       Crime|Mystery|
| 150554|            1|            5.0| The Love Bug (1997)|Adventure|Childre...|
| 152711|            1|            5.0|Who Killed Chea V...|         Documentary|
| 157775|            1|            5.0|Tenchi Muyô! In L...|    Animation|Comedy|
| 173963|       

In [164]:
!mkdir output

In [165]:
!dir out*

 Volume in drive C has no label.
 Volume Serial Number is A88C-3222

 Directory of C:\Users\Administrator\veena\bigdata

05/12/2021  02:20 AM    <DIR>          output
               0 File(s)              0 bytes
               1 Dir(s)  449,088,106,496 bytes free


In [169]:
"""
Writing to a file (locally)

Create 'output' folder
Opend cmd-line
Run: winutils.exe chmod -R 777 C:\users\Administrator\veena\bigdata\output
"""

df_top_movies.write.mode("overwrite").option("encoding", "utf-16").csv("output/Top_10_Movies_by_UserRatings.csv")

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 101-102: truncated \uXXXX escape (<ipython-input-169-2721d8c2b5cf>, line 7)

In [171]:
df_top_movies.toPandas().to_csv("output/Top_10_Movies_by_UserRatings.csv", index=False)