Spark SQL and DataFrames are APIs.
<br>They talk to Spark's core, which is written in Scala and runs on the JVM.


In [1]:
# Initialise findspark ahead of the pyspark imports in the cells below.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [3]:
"""
Configure before creating SparkSession
"""

from pyspark.conf import SparkConf

# Cluster connection and resource limits, applied one setting at a time.
# Each SparkConf setter updates the object in place, so no chaining is needed.
config = SparkConf()
config.setMaster("spark://192.168.11.77:7077")
config.setAppName("SparkSQL:CLUSTER")
config.set("spark.executor.memory", "2g")
config.set("spark.executor.cores", 4)
config.set("spark.cores.max", 4)
config.set("spark.driver.memory", "2g")

# `conf` is the name the SparkSession cell consumes; it is the same object as `config`.
conf = config

In [4]:
from pyspark.sql import SparkSession

# Build the session from the SparkConf prepared in the configuration cell.
# `ss` is the handle every later cell reads data and runs SQL through.
ss = SparkSession.builder.config(conf=conf).getOrCreate()

---

In [7]:
from pyspark.sql.types import StructType, IntegerType, DoubleType, StringType, LongType


def _nullable_struct(*columns):
    """Return a StructType holding the given (name, data type) pairs, all nullable."""
    struct = StructType()
    for column_name, data_type in columns:
        struct.add(column_name, data_type, True)
    return struct


def _read_csv_with_header(path, schema):
    """Load the CSV at `path` through the session `ss`.

    The reader is told the file has a header line and is given the explicit
    `schema` rather than being left to infer one.
    """
    reader = ss.read.format("csv")
    reader = reader.option("header", True)
    reader = reader.schema(schema)
    return reader.load(path)


# Column layout of movies.csv.
schema_movies = _nullable_struct(
    ("movieId", IntegerType()),
    ("title", StringType()),
    ("genres", StringType()),
)

# Column layout of ratings.csv.
schema_ratings = _nullable_struct(
    ("userId", IntegerType()),
    ("movieId", IntegerType()),
    ("rating", DoubleType()),
    ("timestamp", LongType()),
)

df_movies_full = _read_csv_with_header(
    "hdfs://192.168.93.128:9000/input/movie_lens/movies.csv",
    schema_movies,
)

df_ratings_full = _read_csv_with_header(
    "hdfs://192.168.93.128:9000/input/movie_lens/ratings.csv",
    schema_ratings,
)

In [10]:
"""
Each DataFrame should be registered as a Spark SQL view before it is queried by name.

Register:
Create a temporary view called "movies"
Create a temporary view called "ratings"
"""

# Expose both DataFrames to ss.sql() under the table names used by the queries below.
df_movies_full.createOrReplaceTempView("movies")
df_ratings_full.createOrReplaceTempView("ratings")


In [12]:
"""
Querying via ss.sql returns a DF

"""

# Select every column of the "movies" view, keep the result as a DataFrame,
# and preview its first 4 rows.
df_movies = ss.sql("SELECT * FROM movies")
df_movies.show(4)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
+-------+--------------------+--------------------+
only showing top 4 rows



In [14]:
# printSchema() prints the column tree itself and evaluates to None, which is why
# the displayed tuple starts with None; the second element is the StructType object.
df_movies.printSchema(), df_movies.schema

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



(None,
 StructType(List(StructField(movieId,IntegerType,true),StructField(title,StringType,true),StructField(genres,StringType,true))))

In [19]:
# DataFrame-API counterpart of "SELECT *": keep every column, preview 2 rows.
df_movies.select("*").show(2)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



In [24]:
# Number of rows each preview prints; later cells reuse this value.
show_n = 2
# Every column of the ratings view.
ss.sql("SELECT * FROM ratings").show(show_n)
# Column subset.
ss.sql("SELECT movieId, rating FROM ratings").show(show_n)
# Column alias plus arithmetic; the unaliased expression gets a generated header.
ss.sql("SELECT movieId AS id, rating + .3 FROM ratings").show(show_n)
# Row filter.
ss.sql("SELECT movieId, rating FROM ratings WHERE rating > 4").show(show_n)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows

+-------+------+
|movieId|rating|
+-------+------+
|      1|   4.0|
|      3|   4.0|
+-------+------+
only showing top 2 rows

+---+------------------------------+
| id|(rating + CAST(0.3 AS DOUBLE))|
+---+------------------------------+
|  1|                           4.3|
|  3|                           4.3|
+---+------------------------------+
only showing top 2 rows

+-------+------+
|movieId|rating|
+-------+------+
|     47|   5.0|
|     50|   5.0|
+-------+------+
only showing top 2 rows



In [25]:
# Count the ratings above 4 for each (movieId, rating) pair.
# "GROUP BY 1, 2" refers to the first two select-list columns by position.
ss.sql("""
SELECT movieId, rating, COUNT(rating)
  FROM ratings
 WHERE rating > 4
 GROUP BY 1, 2
""").show(show_n)


+-------+------+-------------+
|movieId|rating|count(rating)|
+-------+------+-------------+
|    457|   5.0|           46|
|   3052|   5.0|           11|
+-------+------+-------------+
only showing top 2 rows



In [30]:
"""
SQL string indexing begins at 1: SUBSTR(title, 1, 10) starts at the first character
"""

# String functions applied to the first 10 movie titles: case conversion,
# length, and a 10-character substring taken from position 1.
# title_len is aliased without the AS keyword; the output header shows the
# alias is applied all the same.
ss.sql("""
SELECT movieId,
       UPPER(title) AS title_upper,
       LOWER(title) AS title_lower,
       INITCAP(title) AS title_init,
       LENGTH(title) title_len,
       SUBSTR(title, 1, 10) AS title_sub
  FROM movies
 LIMIT 10
""").show()


+-------+--------------------+--------------------+--------------------+---------+----------+
|movieId|         title_upper|         title_lower|          title_init|title_len| title_sub|
+-------+--------------------+--------------------+--------------------+---------+----------+
|      1|    TOY STORY (1995)|    toy story (1995)|    Toy Story (1995)|       16|Toy Story |
|      2|      JUMANJI (1995)|      jumanji (1995)|      Jumanji (1995)|       14|Jumanji (1|
|      3|GRUMPIER OLD MEN ...|grumpier old men ...|Grumpier Old Men ...|       23|Grumpier O|
|      4|WAITING TO EXHALE...|waiting to exhale...|Waiting To Exhale...|       24|Waiting to|
|      5|FATHER OF THE BRI...|father of the bri...|Father Of The Bri...|       34|Father of |
|      6|         HEAT (1995)|         heat (1995)|         Heat (1995)|       11|Heat (1995|
|      7|      SABRINA (1995)|      sabrina (1995)|      Sabrina (1995)|       14|Sabrina (1|
|      8| TOM AND HUCK (1995)| tom and huck (1995)| Tom And 

In [35]:
# Compare SUBSTR with LEFT / RIGHT on the first 10 titles.
# LEFT(title, 4) and RIGHT(title, 4) return the first and last four characters;
# for these titles the last four are "995)" (the tail of the year plus the bracket),
# not the full year.
ss.sql("""
SELECT movieId,
       title,
       LENGTH(title) AS title_len,
       SUBSTR(title, 1, 10) AS title_sub,
       LEFT(title, 4) AS title_sub_left,
       RIGHT(title, 4) AS title_sub_right
  FROM movies
 LIMIT 10
""").show()


+-------+--------------------+---------+----------+--------------+---------------+
|movieId|               title|title_len| title_sub|title_sub_left|title_sub_right|
+-------+--------------------+---------+----------+--------------+---------------+
|      1|    Toy Story (1995)|       16|Toy Story |          Toy |           995)|
|      2|      Jumanji (1995)|       14|Jumanji (1|          Juma|           995)|
|      3|Grumpier Old Men ...|       23|Grumpier O|          Grum|           995)|
|      4|Waiting to Exhale...|       24|Waiting to|          Wait|           995)|
|      5|Father of the Bri...|       34|Father of |          Fath|           995)|
|      6|         Heat (1995)|       11|Heat (1995|          Heat|           995)|
|      7|      Sabrina (1995)|       14|Sabrina (1|          Sabr|           995)|
|      8| Tom and Huck (1995)|       19|Tom and Hu|          Tom |           995)|
|      9| Sudden Death (1995)|       19|Sudden Dea|          Sudd|           995)|
|   

In [36]:
# Same grouping as the earlier count query, with the count named rating_total.
# HAVING filters the groups after aggregation and refers to the count by its alias,
# keeping only (movieId, rating) pairs that occur more than 100 times.
ss.sql("""
SELECT movieId, rating, COUNT(rating) AS rating_total
  FROM ratings
 WHERE rating > 4
 GROUP BY 1, 2
HAVING rating_total > 100
""").show(show_n)


+-------+------+------------+
|movieId|rating|rating_total|
+-------+------+------------+
|    318|   5.0|         153|
|    260|   5.0|         104|
+-------+------+------------+
only showing top 2 rows



In [39]:
# Per-movie average and count, highest average first ("ORDER BY 2" = rating_avg).
# The WHERE filter is applied before aggregation, so rating_avg and rating_total
# cover only the ratings above 4, not every rating the movie received.
ss.sql("""
SELECT movieId, AVG(rating) AS rating_avg, COUNT(rating) AS rating_total
  FROM ratings
 WHERE rating > 4
 GROUP BY 1
HAVING rating_total >= 100
 ORDER BY 2 DESC
""").show(show_n)

+-------+-----------------+------------+
|movieId|       rating_avg|rating_total|
+-------+-----------------+------------+
|    260|          4.90625|         128|
|    110|4.896039603960396|         101|
+-------+-----------------+------------+
only showing top 2 rows



In [40]:
# Same query as the previous cell, kept as a DataFrame instead of being shown,
# so that its query plan can be inspected in the next cell.
df_temp_movies = \
ss.sql("""
SELECT movieId, AVG(rating) AS rating_avg, COUNT(rating) AS rating_total
  FROM ratings
 WHERE rating > 4
 GROUP BY 1
HAVING rating_total >= 100
 ORDER BY 2 DESC
""")

In [41]:
# Print the query plans (parsed, analyzed and optimized logical plans appear in the
# output). The ordinals 1 and 2 in the parsed plan are resolved to movieId and
# rating_avg in the analyzed plan.
df_temp_movies.explain(True)

== Parsed Logical Plan ==
'Sort [2 DESC NULLS LAST], true
+- 'UnresolvedHaving ('rating_total >= 100)
   +- 'Aggregate [1], ['movieId, 'AVG('rating) AS rating_avg#513, 'COUNT('rating) AS rating_total#514]
      +- 'Filter ('rating > 4)
         +- 'UnresolvedRelation `ratings`

== Analyzed Logical Plan ==
movieId: int, rating_avg: double, rating_total: bigint
Sort [rating_avg#513 DESC NULLS LAST], true
+- Filter (rating_total#514L >= cast(100 as bigint))
   +- Aggregate [movieId#7], [movieId#7, avg(rating#8) AS rating_avg#513, count(rating#8) AS rating_total#514L]
      +- Filter (rating#8 > cast(4 as double))
         +- SubqueryAlias `ratings`
            +- Relation[userId#6,movieId#7,rating#8,timestamp#9L] csv

== Optimized Logical Plan ==
Sort [rating_avg#513 DESC NULLS LAST], true
+- Filter (rating_total#514L >= 100)
   +- Aggregate [movieId#7], [movieId#7, avg(rating#8) AS rating_avg#513, count(rating#8) AS rating_total#514L]
      +- Project [movieId#7, rating#8]
         +- Fi

---

In [42]:
# Variant of the previous query that refers to the ratings view through the table
# alias `r`. Note that this reassigns df_temp_movies.
df_temp_movies = \
ss.sql("""
SELECT r.movieId, AVG(r.rating) AS rating_avg, COUNT(r.rating) AS rating_total
  FROM ratings AS r
 WHERE r.rating > 4
 GROUP BY 1
HAVING rating_total >= 100
 ORDER BY 2 DESC
""")

In [43]:
# Print the plans for the aliased query. The table alias shows up as an extra
# SubqueryAlias `r` node in the parsed and analyzed plans.
df_temp_movies.explain(True)

== Parsed Logical Plan ==
'Sort [2 DESC NULLS LAST], true
+- 'UnresolvedHaving ('rating_total >= 100)
   +- 'Aggregate [1], ['r.movieId, 'AVG('r.rating) AS rating_avg#530, 'COUNT('r.rating) AS rating_total#531]
      +- 'Filter ('r.rating > 4)
         +- 'SubqueryAlias `r`
            +- 'UnresolvedRelation `ratings`

== Analyzed Logical Plan ==
movieId: int, rating_avg: double, rating_total: bigint
Sort [rating_avg#530 DESC NULLS LAST], true
+- Filter (rating_total#531L >= cast(100 as bigint))
   +- Aggregate [movieId#7], [movieId#7, avg(rating#8) AS rating_avg#530, count(rating#8) AS rating_total#531L]
      +- Filter (rating#8 > cast(4 as double))
         +- SubqueryAlias `r`
            +- SubqueryAlias `ratings`
               +- Relation[userId#6,movieId#7,rating#8,timestamp#9L] csv

== Optimized Logical Plan ==
Sort [rating_avg#530 DESC NULLS LAST], true
+- Filter (rating_total#531L >= 100)
   +- Aggregate [movieId#7], [movieId#7, avg(rating#8) AS rating_avg#530, count(rating#

----

Delete below

---

In [None]:
"""
Writing to a file (locally)

Create the 'output' folder
Open a command line
Run: winutils.exe chmod -R 777 C:\\users\\Administrator\\veena\\bigdata\\output
(backslashes are doubled because this is a regular, non-raw Python string)
"""

# df_top_movies.write.mode("overwrite").csv("output/Top_10_Movies_by_UserRatings.csv")

In [None]:
import os

# NOTE(review): df_top_movies is not assigned in any cell visible in this notebook;
# it has to exist in the kernel before this cell runs — confirm where it comes from,
# otherwise a fresh-kernel run stops here with a NameError.

# Create the target folder up front so the export does not fail when it is missing.
os.makedirs("output", exist_ok=True)

# Collect the Spark DataFrame to the driver as a pandas DataFrame and write it as a
# single local CSV file, without the pandas index column.
df_top_movies.toPandas().to_csv("output/Top_10_Movies_by_UserRatings.csv", index=False)