In [None]:
Parquet is a columnar format that is supported by many other data processing systems. 
Spark SQL supports both reading and writing Parquet files, and it automatically preserves the schema of the original data.
When writing Parquet files, all columns are automatically converted to be nullable for compatibility reasons.

In [1]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext,SQLContext

In [2]:
# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-active session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('paquet_df_exp')
    .getOrCreate()
)

In [3]:
# Load the sample people records from JSON into a DataFrame.
jsonDF = (
    spark.read
    .format('json')
    .load('/dnbusr1/sambasivaraot/PySpark/input_data/people.json')
)

In [4]:
# Print the schema of the DataFrame loaded from people.json as a tree.
jsonDF.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Render the DataFrame's rows as a text table.
jsonDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# DataFrames can be saved as Parquet files.
# mode('overwrite') replaces the output of an earlier run: with the default save
# mode this cell raises an error as soon as 'people.parquet' already exists,
# which breaks re-running the notebook from a fresh kernel.
jsonDF.write.mode('overwrite').parquet('people.parquet')

In [9]:
# Load the Parquet output written above back into a DataFrame.
prqt = (
    spark.read
    .format('parquet')
    .load('people.parquet')
)

In [11]:
# Print the schema read back from Parquet; it shows the same age/name columns
# and types as the DataFrame that was written.
prqt.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [12]:
# Parquet files can also be used to create a temporary view and then used in SQL statements.
# Registers the Parquet-backed DataFrame under the view name 'people_prqt';
# an existing temporary view with that name is replaced.
prqt.createOrReplaceTempView('people_prqt')

In [15]:
# Select the people aged 13 to 19 (inclusive) from the temporary view.
# Rows with a null age do not satisfy the comparison and are excluded.
teenagers = spark.sql(
    "SELECT * FROM people_prqt "
    "WHERE age >= 13 AND age <= 19"
)

In [17]:
# Display the rows that passed the age filter.
teenagers.show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



### Schema Merging

In [26]:
from pyspark.sql import Row

# Reuses the `spark` session created earlier in the notebook.
sc = spark.sparkContext

# Build a DataFrame with one row per i in 1..5 holding `single` = i and
# `doublr` = i ** 2, then store it in the partition directory key=1.
# NOTE(review): 'doublr' looks like a typo for 'double'; the name is kept
# because it is the column name that appears in the merged output.
squaresDF = spark.createDataFrame(
    sc.range(1, 6).map(lambda i: Row(single=i, doublr=i ** 2))
)

# mode('overwrite') keeps the cell re-runnable: the default save mode raises an
# error when 'data/test_table/key=1' is already present from an earlier run.
squaresDF.write.mode('overwrite').parquet('data/test_table/key=1')


In [27]:
# Build a second DataFrame for i in 6..10 with `single` = i and `triple` = i ** 3.
# It drops the squares column and adds a new one, so the two partitions of the
# table end up with different schemas.
cubesDF = spark.createDataFrame(
    sc.range(6, 11).map(lambda i: Row(single=i, triple=i ** 3))
)

# mode('overwrite') keeps the cell re-runnable: the default save mode raises an
# error when 'data/test_table/key=2' is already present from an earlier run.
cubesDF.write.mode('overwrite').parquet('data/test_table/key=2')

In [28]:
# Read the whole table directory with schema merging switched on, so the
# columns of both partition directories are combined into one schema.
# The `key` column in the result comes from the key=1 / key=2 directory names.
mergeDF = (
    spark.read
    .options(mergeSchema='true')
    .parquet("data/test_table")
)
mergeDF.show()

+------+------+------+---+
|doublr|single|triple|key|
+------+------+------+---+
|  null|     8|   512|  2|
|  null|     7|   343|  2|
|  null|     6|   216|  2|
|  null|     9|   729|  2|
|  null|    10|  1000|  2|
|     1|     1|  null|  1|
|     4|     2|  null|  1|
|    16|     4|  null|  1|
|     9|     3|  null|  1|
|    25|     5|  null|  1|
+------+------+------+---+



### Metadata Refreshing

In [None]:
# spark is an existing SparkSession
# Ask the catalog to refresh its cached metadata for the table "my_table".
# NOTE(review): no table or view named "my_table" is created in the visible
# cells, so this call looks illustrative only; confirm the name before running.
spark.catalog.refreshTable("my_table")