Spark SQL and DataFrames are APIs.
<br>They talk to Spark's core, which is written in Scala and runs on the JVM.


In [1]:
# findspark.init() is called before any pyspark import in this notebook.
# NOTE(review): presumably it locates the local Spark install and puts pyspark
# on sys.path — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
"""
Configure before creating SparkSession
"""

from pyspark.conf import SparkConf

# Every SparkConf setter returns the same object, so the settings can be
# applied one statement at a time instead of through one long chain.
config = SparkConf()

# Where to run and how the application is labelled.
config.setMaster("spark://192.168.11.77:7077")
config.setAppName("SparkSQL:CLUSTER")

# Resource settings for executors and the driver.
config.set("spark.executor.memory", "2g")
config.set("spark.executor.cores", 4)
config.set("spark.cores.max", 4)
config.set("spark.driver.memory", "2g")

# `conf` is the name the session-building cell reads; it is the same object
# as `config`.
conf = config

In [3]:
from pyspark.sql import SparkSession

# Build the session `ss` from the SparkConf object `conf` prepared in the previous cell.
ss = SparkSession.builder.config(conf=conf).getOrCreate()

---

In [4]:
from pyspark.sql.types import StructType, IntegerType, DoubleType, StringType, LongType

# HDFS directory that holds the MovieLens CSV files. Kept in one place so the
# namenode address is not repeated for every dataset.
MOVIELENS_DIR = "hdfs://192.168.93.128:9000/input/movie_lens"

# Explicit schemas; the third argument (True) marks each column as nullable.
schema_movies = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)
schema_ratings = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", LongType(), True)
)


def read_movielens_csv(file_name, schema):
    """Load one MovieLens CSV file from HDFS into a DataFrame.

    Args:
        file_name: Name of the file inside MOVIELENS_DIR, e.g. "movies.csv".
        schema: StructType applied to the file instead of inferring types.

    Returns:
        The DataFrame produced by the session `ss`, read with the "header"
        option set to True.
    """
    return (
        ss.read
        .format("csv")
        .option("header", True)
        .schema(schema)
        .load(MOVIELENS_DIR + "/" + file_name)
    )


df_movies_full = read_movielens_csv("movies.csv", schema_movies)
df_ratings_full = read_movielens_csv("ratings.csv", schema_ratings)

In [5]:
# show() prints the table itself and returns None, so each preview is its own
# statement; joining them with a comma would make the cell's value a tuple of
# two None values that the notebook echoes after the tables.
df_movies_full.show(2)
df_ratings_full.show(2)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



(None, None)

In [None]:
"""
SparkContext
Spark core, RDD, DAG, Job, Task, Scheduler, Block Managers

SparkSession - at least 1 is required
Additional sessions can be created with newSession()
Entry point for DataFrame/Dataset/SQL
All views created with createOrReplaceTempView() are scoped to a session.
"""

"""
# SparkContext - Spark Core, RDD, DAG, Job, Stage, Task, Schedulers, Block Manager...
# Only one SparkContext per application
# SparkSession - entry point for DataFrame/Dataset/SQL; you need at least 1 Spark session
   # every session has its own temp views and UDFs (User Defined Functions)
# minimum 1 session needed for sql/df; we can also create additional sessions
# create a new session with newSession()
# all temp views are scoped to a Spark session
# df.createOrReplaceTempView("ratings")
# df.createOrReplaceTempView("movies")
# df.createOrReplaceTempView("links")
# df.createOrReplaceTempView("tags")
# df.createOrReplaceTempView("users")
# ...
"""

In [None]:
"""
DF should be registered as a Spark SQL View

Register..
Create a temporary view called "movies"
Create a temporary view called "ratings"
"""

# Register both DataFrames as temp views so they can be queried by name
# ("movies", "ratings") through ss.sql(...).
df_movies_full.createOrReplaceTempView("movies")
df_ratings_full.createOrReplaceTempView("ratings")


In [11]:
"Create a global temp view to share between sessions"
# Publish the ratings DataFrame as a global temp view named "ratings".
df_ratings_full.createOrReplaceGlobalTempView("ratings")

# Open a second session on the same SparkContext.
# The SparkContext is shared among sessions, so the table cache
# (GlobalTempView) is shared between them as well.
# Temp views and UDFs (User Defined Functions) are not shared.
ss1 = ss.newSession()

# Unqualified name: this reads the `ratings` temp view of session `ss`;
# other cells address the global view as global_temp.ratings.
ss.sql("SELECT * FROM ratings").show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [15]:
# Query the global temp view from the second session (ss1) using the global_temp prefix.
ss1.sql("SELECT * FROM global_temp.ratings").show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [18]:
# From the original session: first the unqualified `ratings` view,
# then the global view addressed as global_temp.ratings.
ss.sql("SELECT * FROM ratings").show(2)
ss.sql("SELECT * FROM global_temp.ratings").show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [20]:
"""
Destroy view from temp cache
via SQL API <--
via Python/Scala API
"""

# Drop the global view through SQL, issued from the second session (ss1).
# The statement yields an empty result, so show(1) prints an empty table.
ss1.sql("DROP VIEW global_temp.ratings").show(1)

++
||
++
++



In [21]:
# Imported inside the cell so it runs on its own.
from pyspark.sql.utils import AnalysisException

# global_temp.ratings was dropped through ss1 in the previous cell, so the
# lookup from ss is expected to fail. The error is the point of this cell:
# catch it and print it so that "Run All" keeps going.
try:
    ss.sql("SELECT * FROM global_temp.ratings").show(1)
except AnalysisException as err:
    print("AnalysisException: {}".format(err))

AnalysisException: "Table or view not found: `global_temp`.`ratings`; line 1 pos 14;\n'Project [*]\n+- 'UnresolvedRelation `global_temp`.`ratings`\n"

In [25]:
# Register the movies DataFrame as a global temp view and read ten rows back
# through the global_temp prefix.
df_movies_full.createOrReplaceGlobalTempView("movies")
ss.sql("SELECT * FROM global_temp.movies LIMIT 10").show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+



In [26]:
"""
Destroy view from temp cache
via SQL API
via Python/Scala API <--
"""

# Drop the global temp view "movies" through the catalog API, issued from the second session (ss1).
ss1.catalog.dropGlobalTempView("movies")

In [27]:
# Imported inside the cell so it runs on its own.
from pyspark.sql.utils import AnalysisException

# global_temp.movies was dropped in the previous cell, so this lookup is
# expected to fail. Catch and print the error instead of letting it abort
# "Run All".
try:
    ss1.sql("SELECT * FROM global_temp.movies LIMIT 10").show()
except AnalysisException as err:
    print("AnalysisException: {}".format(err))

AnalysisException: "Table or view not found: `global_temp`.`movies`; line 1 pos 14;\n'GlobalLimit 10\n+- 'LocalLimit 10\n   +- 'Project [*]\n      +- 'UnresolvedRelation `global_temp`.`movies`\n"

In [None]:
# Imported inside the cell so it runs on its own.
from pyspark.sql.utils import AnalysisException

# `ratings` is a temp view that resides in the Spark session `ss`.
ss.sql("SELECT * FROM ratings").show(1)

# Drop it either through the catalog API:
#   ss.catalog.dropTempView("ratings")
# or through SQL:
ss.sql("DROP VIEW ratings")

# The view is gone now, so this lookup is expected to fail with a
# "view not found" error. Catch and print it so "Run All" keeps going.
try:
    ss.sql("SELECT * FROM ratings").show(1)
except AnalysisException as err:
    print("AnalysisException: {}".format(err))

In [28]:
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE DATABASE fails when
# product_db already exists from an earlier run.
ss.sql("CREATE DATABASE IF NOT EXISTS product_db")

DataFrame[]

In [29]:
# List the tables of product_db. The session's temp views are listed as well
# (isTemporary = true, blank database column).
ss.sql("SHOW TABLES IN product_db").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |  ratings|       true|
+--------+---------+-----------+



In [30]:
# Same listing for the `default` database.
ss.sql("SHOW TABLES IN default").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |  ratings|       true|
+--------+---------+-----------+



In [31]:
# Register "movies" as a global temp view, then list `default` again.
# The recorded listing shows only `ratings`: the global view does not appear under `default`.
df_movies_full.createOrReplaceGlobalTempView("movies")
ss.sql("SHOW TABLES IN default").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |  ratings|       true|
+--------+---------+-----------+



In [32]:
# Register "movies" as a session temp view this time; it does show up in the listing.
df_movies_full.createOrReplaceTempView("movies")
ss.sql("SHOW TABLES IN default").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
|        |  ratings|       true|
+--------+---------+-----------+



In [33]:
# Register the same DataFrame once as a global temp view and once as a session
# temp view, under different names, to compare which one SHOW TABLES lists.
df_movies_full.createOrReplaceGlobalTempView("movies_global")
df_movies_full.createOrReplaceTempView("movies_temp")
ss.sql("SHOW TABLES IN default").show()

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
|        |     movies|       true|
|        |movies_temp|       true|
|        |    ratings|       true|
+--------+-----------+-----------+



In [38]:
# Create movies_db if it is missing, make it the current database, then list
# its tables (the session temp views are listed here too).
ss.sql("CREATE DATABASE IF NOT EXISTS movies_db")
ss.sql("USE movies_db")
ss.sql("SHOW TABLES IN movies_db").show()

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
|        |     movies|       true|
|        |movies_temp|       true|
|        |    ratings|       true|
+--------+-----------+-----------+



In [None]:
# List the tables of movies_db again.
ss.sql("SHOW TABLES IN movies_db").show()

----

Delete below

---

In [None]:
"""
Writing to a file (locally)

Create 'output' folder
Opend cmd-line
Run: winutils.exe chmod -R 777 C:\users\Administrator\veena\bigdata\output
"""

# df_top_movies.write.mode("overwrite").csv("output/Top_10_Movies_by_UserRatings.csv")

In [None]:
import os

# to_csv does not create missing parent folders, so make sure 'output' exists
# before writing into it.
if not os.path.isdir("output"):
    os.makedirs("output")

# NOTE(review): df_top_movies is not defined in any cell visible here — confirm
# it is built earlier in the notebook before running this cell.
# Convert the Spark DataFrame to pandas and write it as one local CSV file
# without the index column.
df_top_movies.toPandas().to_csv("output/Top_10_Movies_by_UserRatings.csv", index=False)