# Create DataFrame

In [0]:
# Column names, listed in the same order as the fields of each record below.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Sample records. Values are kept verbatim: several first names carry a
# trailing space, and some text fields are empty strings.
data = [
    ("James ", "", "Smith", "36636", "M", 3000),
    ("Michael ", "Rose", "", "40288", "M", 4000),
    ("Robert ", "", "Williams", "42114", "M", 4000),
    ("Maria ", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Build the DataFrame from the records and print every column untruncated.
df = spark.createDataFrame(data, columns)
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|dob  |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



# Writing into Parquet File

In [0]:
# Write the DataFrame to a Parquet dataset.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises an error when the target path already exists (e.g. on a second
# "Run All"). This also matches the partitioned write later in the notebook.
df.write.mode("overwrite").parquet("/dbfs/temp/people.parquet")

In [0]:
# Load the Parquet dataset back into a DataFrame and expose it to Spark SQL
# under the temporary view name "people".
parDF = spark.read.parquet("/dbfs/temp/people.parquet")
parDF.createOrReplaceTempView("people")

# Select only the rows whose salary is exactly 4000 and render the result.
people_query = """
          SELECT * FROM people where salary = 4000
          """
p_sql = spark.sql(people_query)
p_sql.display()

firstname,middlename,lastname,dob,gender,salary
Robert,,Williams,42114,M,4000
Maria,Anne,Jones,39192,F,4000
Michael,Rose,,40288,M,4000


# Parquet Partition

In [0]:
# Write the DataFrame as a Parquet dataset partitioned by gender and salary,
# replacing any output already present at the target path.
(
    df.write
    .partitionBy("gender", "salary")
    .mode("overwrite")
    .parquet("/dbfs/temp/people2.parquet")
)

# Read back only the gender=M partition directory. Because the path points
# inside that partition, the result has no "gender" column (see output below).
df2 = spark.read.parquet("/dbfs/temp/people2.parquet/gender=M")
df2.show()

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|  dob|salary|
+---------+----------+--------+-----+------+
|  Robert |          |Williams|42114|  4000|
| Michael |      Rose|        |40288|  4000|
|   James |          |   Smith|36636|  3000|
+---------+----------+--------+-----+------+



# Create View

In [0]:
# Register a temporary view backed directly by the gender=F partition directory
# of the partitioned Parquet dataset.
# OR REPLACE keeps the cell re-runnable: a plain CREATE TEMPORARY VIEW fails
# once the view already exists in the session.
parSQL = spark.sql("""
                   CREATE OR REPLACE TEMPORARY VIEW person2
                   USING parquet OPTIONS (path '/dbfs/temp/people2.parquet/gender=F')
                   """)

# Query the view created above. The view name must match the one in the
# CREATE statement, otherwise the query depends on a view from some other cell.
spark.sql("SELECT * FROM person2").display()

firstname,middlename,lastname,dob,salary
Maria,Anne,Jones,39192.0,4000
Jen,Mary,Brown,,-1
