In [None]:
# Execute the shared setup notebook inside this kernel.
# NOTE(review): presumably this is where the `spark` session used below is
# created — confirm in Setup.ipynb.
%run "./Setup.ipynb"

In [None]:
from pyspark.sql.functions import *

In [None]:
from pyspark.sql.types import *

### JSON format

In [None]:
# Location of the flight-summary JSON data. The active value points at the
# whole directory; the commented alternatives target a single file or a glob.
# Every backslash is escaped explicitly: the previous "\json" relied on Python
# passing the unrecognised escape "\j" through unchanged, which emits an
# invalid-escape warning on newer Python versions.
#json_path = "E:\\PySpark\\data\\flight-data\\json\\2015-summary.json"
#json_path = "E:\\PySpark\\data\\flight-data\\json\\*-summary.json"
json_path = "E:\\PySpark\\data\\flight-data\\json"

In [None]:
# Load the JSON data into a DataFrame via the generic format/load API
# (no schema supplied, so Spark infers it) and preview the first rows.
json_df = spark.read.format("json").load(json_path)
json_df.show()

In [None]:
# Number of partitions backing the DataFrame that was just read.
json_df.rdd.getNumPartitions()

In [None]:
# Total number of rows read from the JSON input.
json_df.count()

In [None]:
# Inspect the setting that caps how many bytes Spark packs into a single
# partition when reading files.
spark.conf.get('spark.sql.files.maxPartitionBytes')

#### show command

In [None]:
# show() with no arguments prints the first 20 rows and truncates long values.
# The commented variants illustrate the positional arguments
# (n, truncate, vertical): more rows, no truncation, and vertical layout.
json_df.show()
#json_df.show(30)
#json_df.show(30, False)
#json_df.show(3, False, True)

### Programmatic Schema

In [None]:
# Re-read the JSON data with an inferred schema so this section can be run
# on its own; the result is the starting point for the schema examples below.
json_df = spark.read.format("json").load(json_path)

In [None]:
# Print the schema Spark inferred for the JSON data.
json_df.printSchema()

In [None]:
# Show the Python type of the schema object attached to the DataFrame.
type(json_df.schema)

In [None]:
# Programmatic schema: three nullable fields, built from (name, type) pairs.
# NOTE(review): `count` is an IntegerType here but BIGINT in json_schema_2,
# and the first two columns are listed in the opposite order.
json_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("ORIGIN_COUNTRY_NAME", StringType()),
            ("DEST_COUNTRY_NAME", StringType()),
            ("count", IntegerType()),
        )
    ]
)

In [None]:
# The same kind of schema expressed as a DDL string: one "name TYPE"
# declaration per column, joined with ", ".
json_schema_2 = ", ".join(
    [
        "DEST_COUNTRY_NAME STRING",
        "ORIGIN_COUNTRY_NAME STRING",
        "count BIGINT",
    ]
)

In [None]:
# Read the JSON data with an explicit schema instead of inferring one.
# The DDL string is used here; the commented line passes the StructType.
#json_df_1 = spark.read.json(json_path, schema=json_schema)
json_df_1 = spark.read.json(json_path, schema=json_schema_2)

In [None]:
# Preview the rows read with the explicit schema.
json_df_1.show()

In [None]:
# Confirm that the DataFrame carries the schema that was supplied.
json_df_1.printSchema()

In [None]:
# Keep only the rows whose count is above 200 and preview them.
json_df_2 = json_df_1.filter("count > 200")
json_df_2.show()

In [None]:
# Number of rows that passed the count > 200 filter.
json_df_2.count()

In [None]:
# Persist the filtered rows as JSON. The default save mode raises an error
# when the target directory already exists, so "overwrite" is set explicitly
# to keep this cell re-runnable (the Parquet and CSV writes do the same).
json_df_2.write.mode("overwrite").json("E:\\PySpark\\output\\json")

In [None]:
# Number of partitions backing the filtered DataFrame.
json_df_2.rdd.getNumPartitions()

#### Using multi-line JSON files

In [None]:
# Path of the JSON file that is read with the multiLine option below.
multi_line_json_path = "E:\\PySpark\\data\\users_multiline.json"

In [None]:
# Enable the multiLine option so that a JSON record may span several lines
# of the file, then preview the result.
multi_line_json_df = spark.read.option("multiLine", True).json(multi_line_json_path)
multi_line_json_df.show()

#### Using nested JSON files

In [None]:
# Path of the JSON file used for the nested-field examples below.
nested_json_path = "E:\\PySpark\\data\\users-nested.json"

In [None]:
# Read the nested JSON file (schema inferred) and preview the rows.
nested_json_df = spark.read.format("json").load(nested_json_path)
nested_json_df.show()

In [None]:
# Print the inferred schema as a tree, including any nested fields.
nested_json_df.printSchema()

In [None]:
# Display the schema object itself rather than the printed tree.
nested_json_df.schema

In [None]:
# Nested fields are addressed with dot notation. The commented line picks two
# individual fields of `address`; "address.*" expands every field of it.
#nested_json_df.select("userid", "name", "address.city", "address.state").show()
nested_json_df.select("userid", "name", "address.*").show()

### Parquet format

In [None]:
# Parquet input used for the examples in this section.
parquet_path = "E:\\PySpark\\data\\flight-data\\parquet\\2010-summary.parquet"

In [None]:
# Read the Parquet data through the generic format/load API; the commented
# line is the equivalent format-specific shortcut.
#parquet_df = spark.read.parquet(parquet_path)
parquet_df = spark.read.format("parquet").load(parquet_path)
parquet_df.show()

In [None]:
# Keep rows with count above 100 whose destination is the United States,
# then preview them.
parquet_df_2 = parquet_df.filter("count > 100 and DEST_COUNTRY_NAME = 'United States'")
parquet_df_2.show()

In [None]:
# Write the filtered rows as Parquet, replacing any previous output.
parquet_df_2.write.parquet("E:\\PySpark\\output\\parquet", mode="overwrite")

In [None]:
# Same write as above, but with gzip compression; it replaces the output of
# the previous cell because it targets the same directory in overwrite mode.
(
    parquet_df_2.write
    .mode("overwrite")
    .option("compression", "gzip")
    .parquet("E:\\PySpark\\output\\parquet")
)

In [None]:
# Number of partitions backing the filtered Parquet DataFrame.
parquet_df_2.rdd.getNumPartitions()

### ORC format

In [None]:
# ORC input used for the examples in this section.
orc_path = "E:\\PySpark\\data\\flight-data\\orc\\2010-summary.orc"

In [None]:
# Read the ORC data through the generic format/load API; the commented line
# is the equivalent format-specific shortcut.
#orc_df = spark.read.orc(orc_path)
orc_df = spark.read.format("orc").load(orc_path)
orc_df.show()

In [None]:
# Keep only the rows whose count is above 100 and preview them.
# NOTE(review): despite its name, orc_path_2 holds a DataFrame, not a path;
# the name is kept because the write cell below refers to it.
orc_path_2 = orc_df.where("count > 100")
orc_path_2.show()

In [None]:
# Write the filtered rows as ORC. Without an explicit save mode the write
# raises once the output directory exists, so "overwrite" keeps the cell
# re-runnable. The commented line is the generic format/save equivalent.
#orc_path_2.write.format("orc").mode("overwrite").save("E:\\PySpark\\output\\orc")
orc_path_2.write.mode("overwrite").orc("E:\\PySpark\\output\\orc")

### CSV format

    - Represents any delimited text file format

In [None]:
# CSV input location: the active value is the whole directory, the commented
# one a single file.
#csv_path = "E:\\PySpark\\data\\flight-data\\csv\\2015-summary.csv"
csv_path = "E:\\PySpark\\data\\flight-data\\csv"

In [None]:
# DDL-style schema (column names and types) applied when reading the CSV data.
# The commented variant declares only the first two columns.
csv_schema = "destination STRING, origin STRING, count BIGINT"
#csv_schema = "destination STRING, origin STRING"

In [None]:
# Three ways of getting named, typed columns out of the CSV files:
#   1. read without a header and rename the columns with toDF
#   2. take names from the header row and let Spark infer the types
#   3. take the header row into account and apply an explicit schema (active)
#csv_df = spark.read.csv(csv_path).toDF("destination", "origin", "count")
#csv_df = spark.read.csv(csv_path, header=True, inferSchema=True)
csv_df = (
    spark.read
    .schema(csv_schema)          # programmatic schema
    .option("header", True)
    .csv(csv_path)
)

csv_df.show()

In [None]:
# Confirm the column names and types of the CSV DataFrame.
csv_df.printSchema()

In [None]:
# Keep only the rows whose count is above 1000 and preview them.
csv_df_2 = csv_df.filter("count > 1000")
csv_df_2.show()

In [None]:
# Write the filtered rows as delimited text with a header row, replacing any
# previous output. The commented lines show the default separator and a
# tab-separated variant; the active write uses ";".
#csv_df_2.write.csv("E:\\PySpark\\output\\csv", header=True, mode="overwrite")
#csv_df_2.write.csv("E:\\PySpark\\output\\csv", header=True, mode="overwrite", sep="\t")
(
    csv_df_2.write
    .mode("overwrite")
    .option("header", True)
    .option("sep", ";")
    .csv("E:\\PySpark\\output\\csv")
)

In [None]:
# Number of partitions backing the filtered CSV DataFrame.
csv_df_2.rdd.getNumPartitions()

In [None]:
# Read the ";"-separated output back in to verify the write.
# NOTE(review): this rebinds csv_df_2 to a DataFrame backed by the output
# directory, so re-running the write cell afterwards would read from and
# overwrite the same path. No schema is supplied here, so the column types
# are whatever the CSV reader defaults to.
csv_df_2 = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .csv("E:\\PySpark\\output\\csv")
)
csv_df_2.show()

### Text format

In [116]:
# Plain-text input used for the word-count example.
text_path = "E:\\PySpark\\data\\wordcount.txt"

In [127]:
# Read the text file through the generic format/load API; the printed schema
# further down shows a single string column named `value`.
text_df = spark.read.format("text").load(text_path)

In [128]:
# Show up to 50 rows without truncating long values.
text_df.show(50, truncate=False)

+----------------------------------------------+
|value                                         |
+----------------------------------------------+
|spark is a general purpose execution framework|
|spark can run on hadoop                       |
|scala is preferred language for spark         |
|spark also supports java and python           |
|spark is a general purpose execution framework|
|spark can run on hadoop                       |
|scala is preferred language for spark         |
|spark also supports java and python           |
|python spark scala java pyspark hadoop        |
|spark python spark rdd rdd rdd sql spark      |
|pythom machine learning spark sql rdd rdd     |
|spark is a general purpose execution framework|
|spark can run on hadoop                       |
|scala is preferred language for spark         |
|spark also supports java and python           |
|spark is a general purpose execution framework|
|spark can run on hadoop                       |
|scala is preferred 

In [129]:
# Number of rows in the text DataFrame.
text_df.count()

33

In [130]:
# Print the schema of the text DataFrame.
text_df.printSchema()

root
 |-- value: string (nullable = true)



In [139]:
# Word count: split each line on a single space, explode the resulting array
# into one row per word, then count the occurrences of every word.
wordcount_df = (
    text_df
    .select(explode(split(col("value"), " ")).alias("word"))
    .groupBy("word")
    .count()
)

wordcount_df.show(truncate=False)

+---------+-----+
|word     |count|
+---------+-----+
|spark    |39   |
|is       |12   |
|a        |6    |
|general  |6    |
|purpose  |6    |
|execution|6    |
|framework|6    |
|can      |6    |
|run      |6    |
|on       |6    |
|hadoop   |9    |
|scala    |9    |
|preferred|6    |
|language |6    |
|for      |6    |
|also     |6    |
|supports |6    |
|java     |9    |
|and      |6    |
|python   |12   |
+---------+-----+
only showing top 20 rows



In [136]:
# Print the schema of the word-count result.
# NOTE(review): the recorded output below was produced by an earlier run
# (execution count 136, before wordcount_df was last defined at 139) and shows
# a split(...) array column rather than the word/count columns; re-run this
# cell to refresh it.
wordcount_df.printSchema()

root
 |-- split(value,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = true)



In [142]:
# The text writer accepts a single string column, hence the select("word").
# "overwrite" is set explicitly because the default save mode raises when the
# output directory already exists, which would break a re-run of this cell.
wordcount_df.select("word").write.mode("overwrite").text("E:\\PySpark\\output\\text")

In [150]:
# Write "word,count" lines: the two columns are concatenated into the single
# string column that the text writer requires. This targets the same directory
# as the previous cell, so without "overwrite" it would always fail with a
# path-already-exists error; the earlier output is intentionally replaced.
(
    wordcount_df
    .withColumn("word", concat(col("word"), lit(","), col("count")))
    .select("word")
    .write
    .mode("overwrite")
    .text("E:\\PySpark\\output\\text")
)