In [None]:
import findspark
findspark.init()
#findspark.find()

from os.path import abspath
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
warehouse_location = abspath('spark-warehouse')

spark = SparkSession \
    .builder \
    .appName("Working with Hive") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

In [None]:
spark.conf.get("spark.sql.warehouse.dir")

In [None]:
spark.catalog.currentDatabase()

In [None]:
spark.catalog.listTables() 

In [None]:
spark.sql("drop database if exists sparkdemo cascade")
spark.sql("create database if not exists sparkdemo")
spark.sql("use sparkdemo")

In [None]:
spark.sql("DROP TABLE IF EXISTS movies")
spark.sql("DROP TABLE IF EXISTS ratings")
spark.sql("DROP TABLE IF EXISTS topRatedMovies")

### Managed tables using HiveQL

In [None]:
createMovies = """CREATE TABLE IF NOT EXISTS 
            movies (movieId INT, title STRING, genres STRING) 
            ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            WITH SERDEPROPERTIES (
               'separatorChar' = ',',
               'quoteChar'     = '\"'
            )  
            """
    
loadMovies = """LOAD DATA LOCAL INPATH 'E:/PySpark/data/movielens/moviesNoHeader.csv' 
         OVERWRITE INTO TABLE movies"""
    
createRatings = """CREATE TABLE IF NOT EXISTS 
             ratings (userId INT, movieId INT, rating DOUBLE, timestamp LONG) 
             ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
                WITH SERDEPROPERTIES (
                   'separatorChar' = ',',
                   'quoteChar'     = '\"'
                )  
                """
    
loadRatings = """LOAD DATA LOCAL INPATH 'E:/PySpark/data/movielens/ratingsNoHeader.csv' 
         OVERWRITE INTO TABLE ratings"""

In [None]:
spark.sql(createMovies)
spark.sql(loadMovies)
spark.sql(createRatings)
spark.sql(loadRatings)

In [None]:
spark.catalog.listTables()

In [None]:
moviesDF = spark.sql("SELECT * FROM movies")
ratingsDF = spark.sql("SELECT * FROM ratings")

In [None]:
moviesDF.show(5, False)

In [None]:
ratingsDF.show(5, False)

In [None]:
summaryDf = ratingsDF \
            .groupBy("movieId") \
            .agg(count("rating").alias("ratingCount"), avg("rating").alias("ratingAvg")) \
            .filter("ratingCount > 25") \
            .orderBy(desc("ratingAvg")) \
            .limit(10)
            
summaryDf.show(10, False)

In [None]:
summaryDf2 = summaryDf.join(moviesDF, summaryDf["movieId"] == moviesDF["movieId"]) \
                .drop(summaryDf["movieId"]) \
                .select("movieId", "title", "ratingCount", "ratingAvg") \
                .orderBy(desc("ratingAvg")) \
                .coalesce(1)
    
summaryDf2.show(10, False)

In [None]:
summaryDf2.write \
   .mode("overwrite") \
   .format("csv") \
   .option("header", True) \
   .saveAsTable("topRatedMovies")

In [None]:
spark.catalog.listTables()

In [None]:
dtopratedmoviesDf = spark.sql("SELECT * FROM topratedmovies")
dtopratedmoviesDf.show(10, False)

### External tables

In [None]:
summaryDf2.write \
    .mode("overwrite") \
    .format("csv") \
    .option("header", True) \
    .option("path", "E:\\PySpark\\external\\topRatedMoviesExt") \
    .saveAsTable("topratedmoviesext")

In [None]:
spark.catalog.listTables()

In [None]:
spark.sql("SELECT * FROM topratedmoviesext").show()

In [None]:
# Drop an external table
spark.sql("DROP TABLE IF EXISTS topratedmoviesext")

In [None]:
spark.catalog.listTables()

In [None]:
spark.sql("SELECT * FROM topratedmoviesext").show()

### Recreating table from exisitng data

In [49]:
createExternalTable = """CREATE EXTERNAL TABLE IF NOT EXISTS 
            topratedmoviesext (movieId INT,title STRING, ratingCount INT, ratingAvg DOUBLE) 
            ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            WITH SERDEPROPERTIES (
               'separatorChar' = ',',
               'quoteChar'     = '\"'
            )  
            LOCATION 'E:/PySpark/external/topRatedMoviesExt'"""

In [52]:
spark.sql(createExternalTable)

DataFrame[]

In [53]:
spark.catalog.listTables()

[Table(name='movies', database='sparkdemo', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='ratings', database='sparkdemo', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='topratedmovies', database='sparkdemo', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='topratedmoviesext', database='sparkdemo', description=None, tableType='EXTERNAL', isTemporary=False)]

In [54]:
spark.table("topratedmoviesext").show()

+-------+--------------------+-----------+-----------------+
|movieid|               title|ratingcount|        ratingavg|
+-------+--------------------+-----------+-----------------+
|movieId|               title|ratingCount|        ratingAvg|
|    858|Godfather, The (1...|        200|           4.4875|
|    318|Shawshank Redempt...|        311|4.487138263665595|
|   1945|On the Waterfront...|         29|4.448275862068965|
|    926|All About Eve (1950)|         38|4.434210526315789|
|   1217|          Ran (1985)|         26|4.423076923076923|
|    969|African Queen, Th...|         50|             4.42|
|   2064|   Roger & Me (1989)|         42|4.392857142857143|
|    913|Maltese Falcon, T...|         62|4.387096774193548|
|   1221|Godfather: Part I...|        135|4.385185185185185|
|     50|Usual Suspects, T...|        201|4.370646766169155|
+-------+--------------------+-----------+-----------------+



In [55]:
spark.sql("drop database if exists sparkdemo cascade")

DataFrame[]