#  Read CSV

In [101]:
from pyspark.sql import SparkSession

In [102]:
# Input CSV to convert; point this at a file that exists on your system.
mycsv = "./data/Data8317.csv"

# Column used to partition the Parquet output, and the value the demo filters on.
partField = "Age"
filterValue = "2"

# Temp-view names: one for the raw CSV data, one for the partitioned Parquet data.
tableName = "mytable"
partTableName = "partmytable"

# Both Parquet output paths are derived from the entity name.
entity = "myentity"
nonPartFile = f"non-part-{entity}.parquet"
partFile = f"part-{entity}.parquet"

In [103]:
# Get the active Spark session, or create one under the application name "SimpleApp".
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .getOrCreate()
)

In [104]:
# Shell check: list the input CSV to confirm it is present and see its size.
# NOTE(review): this absolute path is hardcoded separately from `mycsv`;
# presumably both point at the same file — confirm.
! ls -l /home/notebooks/data/Data8317.csv

-rwxr-xr-x 1 root root 857219761 Sep 30  2019 /home/notebooks/data/Data8317.csv


In [105]:
# Load the CSV: comma-separated, first row is the header, and Spark infers
# the column types from the data.
csvdataDF = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load(mycsv)
)

In [106]:
# Expose the CSV DataFrame to Spark SQL under the name held in `tableName`.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; re-running the cell simply
# replaces the existing view.
csvdataDF.createOrReplaceTempView(tableName)

In [107]:
# count() is a Spark action, so this line actually runs a job over the CSV.
csv_row_count = csvdataDF.count()
print("CSV data count ", csv_row_count)

CSV data count  34959672


In [108]:
# Peek at the first two rows to sanity-check the parsed columns.
csvdataDF.show(n=2)

+----+---+------+---+----+-----+
|Year|Age|Ethnic|Sex|Area|count|
+----+---+------+---+----+-----+
|2018|  0|     1|  1|  01|  807|
|2018|  0|     1|  1|  02| 5109|
+----+---+------+---+----+-----+
only showing top 2 rows



In [109]:
# Print the schema Spark inferred from the CSV.
# NOTE(review): `count` is inferred as string, presumably because the column
# holds non-numeric markers such as "..C" — confirm before aggregating on it.
csvdataDF.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Ethnic: integer (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- Area: string (nullable = true)
 |-- count: string (nullable = true)



In [110]:
# Select only the rows whose partition column equals the configured filter value,
# querying the temp view registered for the CSV data.
csv_filter_query = f"select * from {tableName} where {partField}={filterValue}"
filteredCSVDF = spark.sql(csv_filter_query)

In [111]:
# Row count of the filtered CSV data; compared later against the partitioned Parquet result.
filtered_csv_row_count = filteredCSVDF.count()
print("Filtered CSV data count ", filtered_csv_row_count)

Filtered CSV data count  708642


# Non-partitioned Parquet

## Fresh data

In [114]:
# Write the full dataset as plain (non-partitioned) Parquet, replacing any
# previous output at this path.
csvdataDF.write.parquet(nonPartFile, mode="overwrite")

In [115]:
# Shell check: list the files Spark wrote for the non-partitioned dataset.
# NOTE(review): hardcoded absolute path; presumably where `nonPartFile`
# resolves from the notebook's working directory — confirm.
! ls -l /home/notebooks/non-part-myentity.parquet

total 0
-rwxr-xr-x 1 root root       0 Jul 27 07:42 _SUCCESS
-rwxr-xr-x 1 root root 4294933 Jul 27 07:41 part-00000-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 2664292 Jul 27 07:41 part-00001-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 5798042 Jul 27 07:42 part-00002-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 4187727 Jul 27 07:42 part-00003-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 5381904 Jul 27 07:42 part-00004-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 4524794 Jul 27 07:42 part-00005-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet
-rwxr-xr-x 1 root root 1346239 Jul 27 07:42 part-00006-2197ae5b-d80d-4023-b977-4f4d1163ac09-c000.snappy.parquet


In [116]:
# Shell check: total apparent size of the non-partitioned Parquet directory,
# for comparison with the size of the source CSV.
# NOTE(review): hardcoded absolute path; presumably matches `nonPartFile` — confirm.
! du -h --apparent-size /home/notebooks/non-part-myentity.parquet

28M	/home/notebooks/non-part-myentity.parquet


In [117]:
# Read the non-partitioned Parquet dataset back into a DataFrame.
pqDF = spark.read.parquet(nonPartFile)

In [118]:
# Row count read back from Parquet; it should match the CSV row count.
nonpart_row_count = pqDF.count()
print("PQ Non-partitioned partitioned count ", nonpart_row_count)

PQ Non-partitioned partitioned count  34959672


In [119]:
pqDF.show(2)

+----+---+------+---+------+-----+
|Year|Age|Ethnic|Sex|  Area|count|
+----+---+------+---+------+-----+
|2018|110|    61|  9|120300|    0|
|2018|110|    69|  1|120300|  ..C|
+----+---+------+---+------+-----+
only showing top 2 rows



## Appending data

In [120]:
# Append the same rows to the existing dataset. Append mode adds new part
# files instead of replacing old ones, so every run of this cell adds
# another full copy of the data.
csvdataDF.write.parquet(nonPartFile, mode="append")

In [121]:
# Re-read the dataset after the append and count again to see the effect.
pqDF = spark.read.parquet(nonPartFile)
appended_row_count = pqDF.count()
print("PQ Non-partitioned partitioned count ", appended_row_count)

PQ Non-partitioned partitioned count  69919344


# Partitioned Parquet file

## Fresh data

Tip: to write gzip-compressed Parquet instead of the default snappy, chain `.option("compression", "gzip")` after `.mode(...)`.

In [122]:
# Write the dataset as Parquet partitioned by `partField`, replacing any
# previous output; each distinct value gets its own sub-directory.
csvdataDF.write.parquet(partFile, mode="overwrite", partitionBy=partField)

In [127]:
# Shell check: list the per-value partition directories of the partitioned dataset.
# NOTE(review): hardcoded absolute path; presumably where `partFile`
# resolves from the notebook's working directory — confirm.
! ls -l /home/notebooks/part-myentity.parquet

total 0
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=0'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=1'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=10'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=100'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=101'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=102'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=103'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=104'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=105'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=106'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=107'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=108'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=109'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=11'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=110'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=111'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=112'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=113'
drwxrwxrwx 1 root root 4096 Jul 27 07:46 'Age=114'
drwxrwxrw

In [129]:
# Read the partitioned Parquet dataset back into a DataFrame.
pqDF = spark.read.format("parquet").load(partFile)

In [130]:
print("PQ Non-partitioned partitioned count ", pqDF.count())

PQ Non-partitioned partitioned count  34959672


In [131]:
# Expose the partitioned Parquet DataFrame to Spark SQL under `partTableName`.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; re-running the cell simply
# replaces the existing view.
pqDF.createOrReplaceTempView(partTableName)

In [132]:
# Run the same filter as for the CSV data, this time against the view over
# the partitioned Parquet data.
part_filter_query = f"select * from {partTableName} where {partField}={filterValue}"
filteredPartDF = spark.sql(part_filter_query)

In [133]:
# Row count of the filtered partitioned data; it should equal the filtered CSV count.
filtered_part_row_count = filteredPartDF.count()
print("Filtered Partitioned data count ", filtered_part_row_count)

Filtered Partitioned data count  708642
