In [None]:
# Load/save with the default data source. No format is given, so Spark uses
# parquet unless spark.sql.sources.default has been configured otherwise.
df = spark.read.load("examples/src/main/resources/users.parquet")
(
    df.select("name", "favorite_color")
    .write
    .save("namesAndFavColors.parquet")
)

In [None]:
# Manually specifying options: name the source format explicitly on both the
# read side (JSON in) and the write side (Parquet out).
df = spark.read.format("json").load("examples/src/main/resources/people.json")
(
    df.select("name", "age")
    .write
    .format("parquet")
    .save("namesAndAges.parquet")
)

In [None]:
# To load a CSV file, pass format="csv" plus any CSV reader options.
# NOTE(review): the separator is ";" here because the Spark SQL guide reads this
# bundled people.csv sample with sep=";"; with ":" every row would be parsed as
# a single column. Confirm against the file if a custom people.csv is in use.
df = spark.read.load(
    "examples/src/main/resources/people.csv",
    format="csv",
    sep=";",
    inferSchema="true",  # let Spark infer column types instead of all-string
    header="true",       # first line holds the column names
)

In [None]:
# The extra options are also used during write operations.
# For example, you can control bloom filters and dictionary encodings for ORC data sources.
# The following ORC example will create a bloom filter on favorite_color and use
# dictionary encoding for name and favorite_color.
# For Parquet, there exists parquet.enable.dictionary, too.

# read df
df = spark.read.orc("examples/src/main/resources/users.orc")

# write df
# The chain is wrapped in parentheses so it can span several lines; without
# them Python rejects the indented `.option(...)` continuation lines.
(
    df.write.format("orc")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .save("users_with_options.orc")
)

In [None]:
# Run SQL on files directly: the file is queried in place by prefixing its
# backtick-quoted path with the format name, with no separate read call.
df = spark.sql(
    "SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"
)

In [None]:
# Save Modes
# "error" or "errorifexists" (default) - if data already exists, an exception is expected to be thrown.
# "append"    - if data/table already exists, contents of the DataFrame are expected to be appended to existing data.
# "overwrite" - if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame.
# "ignore"    - if data already exists, the save operation is expected not to save the contents of the DataFrame and not to change the existing data.
#               This is similar to a CREATE TABLE IF NOT EXISTS in SQL.

In [None]:
# Saving to Persistent Tables
#
# For file-based data sources (e.g. text, parquet, json), you can specify a custom table path via the path option,
# e.g. df.write.option("path", "/some/path").saveAsTable("t").
# When the table is dropped, the custom table path will not be removed and the table data is still there.
# If no custom table path is specified, Spark will write data to a default table path under the warehouse directory.
# When the table is dropped, the default table path will be removed too.


In [None]:
# Bucketing, Sorting and Partitioning

# For file-based data sources, it is also possible to bucket and sort or partition the output.
# Bucketing and sorting are applicable only to persistent tables:

# Load the people data explicitly so this cell does not depend on whichever
# `df` an earlier cell left behind: sortBy("age") needs an `age` column, and
# people.json is the file the "Manually Specifying Options" cell selects
# "name" and "age" from.
people_df = spark.read.load("examples/src/main/resources/people.json", format="json")
people_df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")

# while partitioning can be used with both save and saveAsTable when using the Dataset APIs.

df = spark.read.parquet("examples/src/main/resources/users.parquet")
df.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")

# It is possible to use both partitioning and bucketing for a single table.
# The chain is wrapped in parentheses so it can span several lines; without
# them Python rejects the indented continuation lines.
(
    df.write
    .partitionBy("favorite_color")
    .bucketBy(42, "name")
    .saveAsTable("people_partitioned_bucketed")
)

# partitionBy creates a directory structure as described in the Partition Discovery section.
# Thus, it has limited applicability to columns with high cardinality.
# In contrast, bucketBy distributes data across a fixed number of buckets and
# can be used when the number of unique values is unbounded.
    