In [11]:
from os.path import expanduser,join,abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row

In [12]:
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse') 

spark = SparkSession.builder.appName('hive_df_exp').config("spark.sql.warehouse.dir", warehouse_location).getOrCreate()

In [13]:
# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
spark.sql("LOAD DATA LOCAL INPATH '/dnbusr1/sambasivaraot/PySpark/input_data/kv1.txt' INTO TABLE src")

AnalysisException: "Hive support is required to CREATE Hive TABLE (AS SELECT);;\n'CreateTable `src`, Ignore\n"

In [7]:
# Queries are expressed in HiveQL
spark.sql("SELECT * FROM src").show()

In [None]:
# Aggregation queries are also supported.
spark.sql("SELECT COUNT(*) FROM src").show()

In [None]:
# The results of SQL queries are themselves DataFrames and support all normal functions.
sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")

In [None]:
# The items in DataFrames are of type Row, which allows you to access each column by ordinal.
stringsDS = sqlDF.rdd.map(lambda row: "Key: %d, Value: %s" % (row.key, row.value))
for record in stringsDS.collect():
    print(record)

In [None]:
# You can also use DataFrames to create temporary views within a SparkSession.
Record = Row("key", "value")
recordsDF = spark.createDataFrame([Record(i, "val_" + str(i)) for i in range(1, 101)])
recordsDF.createOrReplaceTempView("records")

In [None]:
# Queries can then join DataFrame data with data stored in Hive.
spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()

### Specifying storage format for Hive tables