In [2]:
# Spark SQL and DataFrames are just APIs (interfaces).
# NOTE(review): findspark.init() is called before any pyspark import in this
# notebook; presumably it is what makes those imports resolve, so keep this
# cell first.
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Connection and resource settings for this notebook's Spark application.
# (For a local run, point the master at "local[2]" instead of the cluster URL.)
config = (
    SparkConf()
    .setMaster("spark://192.168.11.71:7077")
    .setAppName("SparkDataFrameGlobalTempView")
    .set("spark.executor.memory", "4g")
    .set("spark.executor.cores", 4)
    .set("spark.cores.max", 4)
    .set("spark.driver.memory", "4g")
)

# Build the session from the conf above, or reuse one that already exists.
session_builder = SparkSession.builder.config(conf=config)
spark = session_builder.getOrCreate()

In [4]:
from pyspark.sql.types import StructType, LongType,StringType, IntegerType, DoubleType

# Explicit schema for movies.csv: one nullable column per CSV field.
# The chain is wrapped in parentheses rather than using trailing backslashes,
# so the statement ends at the closing parenthesis and not at whatever line
# happens to follow the last backslash.
movieSchema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# Explicit schema for ratings.csv. "timestamp" is read as a plain string.
ratingSchema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", StringType(), True)
)

In [5]:
# Load the movies CSV from HDFS using the explicit movie schema; the first
# line of the file is treated as a header.
movieDf = spark.read.load(
    "hdfs://192.168.93.128:9000/ml-latest-small/movies.csv",
    format="csv",
    schema=movieSchema,
    header=True,
)

# Same for the ratings CSV, with the rating schema.
ratingDf = spark.read.load(
    "hdfs://192.168.93.128:9000/ml-latest-small/ratings.csv",
    format="csv",
    schema=ratingSchema,
    header=True,
)

In [6]:
# Print the first two rows of each DataFrame as a quick check that the
# files loaded with the expected columns.
movieDf.show(n=2)
ratingDf.show(n=2)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [1]:
# SparkContext - Spark Core: RDD, DAG, Job, Stage, Task, Schedulers, Block Manager...
# Only one SparkContext per application.

# SparkSession - entry point for DataFrame/Dataset/SQL; you need at least one SparkSession.
# Every session has its own temp views and UDFs (User Defined Functions).
# A minimum of one session is needed for SQL/DataFrames; additional sessions
# can be created from an existing one with newSession().
# All temp views are scoped to a SparkSession, for example:
# df.createOrReplaceTempView("ratings")
# df.createOrReplaceTempView("movies")
# df.createOrReplaceTempView("links")
# df.createOrReplaceTempView("tags")
# df.createOrReplaceTempView("users")
# ...

In [8]:
from pyspark.sql.utils import AnalysisException

# Register ratingDf as a temp view in the current session.
ratingDf.createOrReplaceTempView("ratings")
# works: the view is visible in the session that created it
spark.sql("select * from ratings").show(1)

# Create a new session on the same context.
# The SparkContext (and table caches) are shared among sessions;
# temp views and user-defined functions are not.
spark2 = spark.newSession()

# Expected failure: the "ratings" view is not available in the spark2 session.
# The error is caught and printed so the demonstration stays visible without
# aborting a Restart & Run All.
try:
    spark2.sql("select * from ratings")
except AnalysisException as err:
    print("AnalysisException:", err)


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
+------+-------+------+---------+
only showing top 1 row



AnalysisException: 'Table or view not found: ratings; line 1 pos 14'

In [16]:
# Publish movieDf as a global temp view, which any session can query.
movieDf.createOrReplaceGlobalTempView("movies")

# A freshly created session reads it through the global_temp qualifier ...
spark2 = spark.newSession()
(
    spark2
    .sql("select * from global_temp.movies")
    .show(1)
)

# ... and so does the original session.
(
    spark
    .sql("select * from global_temp.movies")
    .show(1)
)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
+-------+----------------+--------------------+
only showing top 1 row

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
+-------+----------------+--------------------+
only showing top 1 row



In [15]:
from pyspark.sql.utils import AnalysisException

# Expected failure: a global temp view has to be addressed as
# global_temp.movies; the bare name "movies" is not found.
# The error is caught and printed so later cells still run on Run All.
try:
    spark.sql("select * from movies").show(1)
except AnalysisException as err:
    print("AnalysisException:", err)

AnalysisException: 'Table or view not found: movies; line 1 pos 14'

In [17]:
from pyspark.sql.utils import AnalysisException

# Views can be dropped through the SQL API or through the Python/Scala
# catalog API; this cell uses the catalog API.

# This should work: the global temp view has not been dropped yet.
spark.sql("select * from global_temp.movies").show(1)

# To drop a global temp view, any Spark session can be used.
spark.catalog.dropGlobalTempView("movies")

# Expected failure: after the drop the view no longer resolves.
# The error is caught and printed instead of aborting the notebook run.
try:
    spark.sql("select * from global_temp.movies").show(1)
except AnalysisException as err:
    print("AnalysisException:", err)


+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
+-------+----------------+--------------------+
only showing top 1 row



AnalysisException: "Table or view not found: `global_temp`.`movies`; line 1 pos 14;\n'Project [*]\n+- 'UnresolvedRelation `global_temp`.`movies`\n"

In [19]:
from pyspark.sql.utils import AnalysisException

# "ratings" is a temp view that resides in this Spark session.
spark.sql("select * from ratings").show(1)

# Drop it, either through the catalog API:
#   spark.catalog.dropTempView("ratings")
# or through SQL:
spark.sql("drop view ratings")

# Expected failure: the view was dropped, so "ratings" is no longer found.
# The error is caught and printed instead of aborting the notebook run.
try:
    spark.sql("select * from ratings").show(1)
except AnalysisException as err:
    print("AnalysisException:", err)


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
+------+-------+------+---------+
only showing top 1 row



AnalysisException: 'Table or view not found: ratings; line 1 pos 14'

In [20]:
# Create the demo database. "if not exists" keeps the cell re-runnable:
# without it, a second run in the same application fails because the
# database is already there.
spark.sql("create database if not exists productdb")

DataFrame[]

In [22]:
# List the tables and views visible in the productdb database.
(
    spark
    .sql("show tables in productdb")
    .show()
)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [23]:
# List the tables and views visible in the default database.
(
    spark
    .sql("show tables in default")
    .show()
)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [25]:
# List every database known to this session's catalog.
(
    spark
    .sql("show databases")
    .show()
)

+------------+
|databaseName|
+------------+
|     default|
|   productdb|
+------------+



In [29]:

from pyspark.sql.utils import AnalysisException

# Register movieDf as a session-scoped temp view named "movies".
movieDf.createOrReplaceTempView("movies")

# The temp view shows up in the listing with an empty database column and
# isTemporary = true.
spark.sql("show tables in default").show()

# The unqualified name resolves to the temp view.
spark.sql("select * from movies").show(1)

# Expected failure: qualifying the name with a database does not find the
# temp view. This query reproduces the AnalysisException recorded in this
# cell's output; it is caught and printed so the notebook keeps running.
# NOTE(review): the statement was reconstructed from the recorded error
# message - confirm it matches the original demonstration.
try:
    spark.sql("select * from default.movies").show(1)
except AnalysisException as err:
    print("AnalysisException:", err)


+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
+--------+---------+-----------+

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
+-------+----------------+--------------------+
only showing top 1 row



AnalysisException: "Table or view not found: `default`.`movies`; line 1 pos 14;\n'Project [*]\n+- 'UnresolvedRelation `default`.`movies`\n"

In [32]:
# Switch the session's current database to productdb.
spark.sql("USE  productdb")

# Re-register the "movies" temp view from movieDf. The earlier version
# registered ratingDf under the name "movies", which silently swapped the
# view's contents to ratings data.
movieDf.createOrReplaceTempView("movies")

# The temp view is listed under productdb as well, with an empty database
# column and isTemporary = true.
spark.sql("show tables in productdb").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
+--------+---------+-----------+



In [33]:
# List the default database again, now that the current database is
# productdb, to compare with the productdb listing.
(
    spark
    .sql("show tables in default")
    .show()
)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
+--------+---------+-----------+

