In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
# Obtain (or reuse) a SparkSession with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

## Hive

In [3]:
# List the tables visible to the session (database, table name, temporary flag).
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    pokes|      false|
+--------+---------+-----------+



In [7]:
# Take the first row of the SHOW CREATE TABLE result and print its DDL text.
print(
    spark.sql('show create table pokes')
    .first()
    .createtab_stmt
)

CREATE TABLE `pokes`(`foo` INT, `bar` STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1'
)
STORED AS
  INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
  'transient_lastDdlTime' = '1569241824'
)



In [11]:
# Show the full table metadata for `pokes` without truncating wide cells.
(
    spark.sql('describe formatted pokes')
    .show(truncate=False)
)

+----------------------------+----------------------------------------------------------+-------+
|col_name                    |data_type                                                 |comment|
+----------------------------+----------------------------------------------------------+-------+
|foo                         |int                                                       |null   |
|bar                         |string                                                    |null   |
|                            |                                                          |       |
|# Detailed Table Information|                                                          |       |
|Database                    |default                                                   |       |
|Table                       |pokes                                                     |       |
|Owner                       |root                                                      |       |
|Created Time       

In [18]:
%%time
# Count the rows of the Hive table `pokes`; the %%time magic above reports the cell's timing.
spark.table('pokes').count()

CPU times: user 814 µs, sys: 32 µs, total: 846 µs
Wall time: 150 ms


500

In [19]:
# Register a catalog table named 'badges' backed by the data at the given path.
# The call returns a DataFrame, whose repr (schema) is displayed below.
# NOTE(review): presumably this raises if 'badges' is already registered, so the
# cell may not survive a re-run against the same metastore — confirm.
spark.catalog.createTable('badges', path='/data/stackoverflow/Badges')

DataFrame[Id: int, UserId: int, Name: string, Date: timestamp, Class: int, TagBased: boolean]

In [20]:
# List the tables again; the output below now includes `badges` next to `pokes`.
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|   badges|      false|
| default|    pokes|      false|
+--------+---------+-----------+



In [23]:
# Persist a 10-row range DataFrame as the managed table 'numbers'.
# mode('overwrite') keeps the cell idempotent: with the default save mode a
# second run fails because the table already exists, which breaks
# "Restart Kernel -> Run All". Note that this replaces any existing
# 'numbers' table.
spark.range(10).write.mode('overwrite').saveAsTable('numbers')

## Various formats

In [26]:
# Print the raw contents of the plain-text sample file (one "name, age" pair per line).
! cat /usr/local/spark/examples/src/main/resources/people.txt

Michael, 29
Andy, 30
Justin, 19


In [25]:
# Read the text file line by line; each line lands in a single `value` column.
(
    spark.read
    .text('file:///usr/local/spark/examples/src/main/resources/people.txt')
    .show()
)

+-----------+
|      value|
+-----------+
|Michael, 29|
|   Andy, 30|
| Justin, 19|
+-----------+



In [27]:
# Print the raw CSV sample; note the ';' separator and the header row.
! cat /usr/local/spark/examples/src/main/resources/people.csv

name;age;job
Jorge;30;Developer
Bob;32;Developer


```
---
   |
    --- part00000.csv
    --- part00001.csv
```

In [31]:
# Parse the semicolon-separated CSV, using its first line as column names.
(
    spark.read
    .csv(
        'file:///usr/local/spark/examples/src/main/resources/people.csv',
        sep=';',
        header=True,
    )
    .show()
)

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [32]:
# Print the raw JSON-lines sample (one JSON object per line; `age` is missing in the first record).
! cat /usr/local/spark/examples/src/main/resources/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}


In [36]:
# Load the JSON-lines file; records lacking `age` show up as null in the output.
(
    spark.read
    .json('file:///usr/local/spark/examples/src/main/resources/people.json')
    .show()
)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [44]:
# Read a multi-line JSON document and drill into the nested `adres.miasto`
# field of its first row via attribute access.
(
    spark.read
    .json('file:///home/jovyan/work/sample_file.json', multiLine=True)
    .first()
    .adres
    .miasto
)

'Katowice'

In [45]:
# Print the schema inferred for the multi-line JSON document
# (nested struct and array columns are visible in the output).
(
    spark.read
    .json('file:///home/jovyan/work/sample_file.json', multiLine=True)
    .printSchema()
)

root
 |-- adres: struct (nullable = true)
 |    |-- miasto: string (nullable = true)
 |-- imię: string (nullable = true)
 |-- wiek_dzieci: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [46]:
# Select the nested field with dotted-path syntax instead of going through a Row.
(
    spark.read
    .json('file:///home/jovyan/work/sample_file.json', multiLine=True)
    .select('adres.miasto')
    .show()
)

+--------+
|  miasto|
+--------+
|Katowice|
+--------+



In [48]:
# Dump the Avro sample as-is: the embedded JSON schema is readable in the output,
# the records themselves are binary.
! cat /usr/local/spark/examples/src/main/resources/users.avro

Objavro.schema�{"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]}avro.codecnull n�~��B;{���/�л0Alyssa( Ben red n�~��B;{���/�л

In [50]:
# Load the Avro sample through the generic format/load API and display it.
(
    spark.read
    .format("avro")
    .load('file:///usr/local/spark/examples/src/main/resources/users.avro')
    .show()
)

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [51]:
# Dump the ORC sample as-is; the output is binary and only fragments are readable.
! cat /usr/local/spark/examples/src/main/resources/users.orc

ORC  
P A  

     "
AlyssaBenP <  !

	 L"
redredP  

   P +  

   (^P   AlyssaBen  Fc  �@  red  B �  F@  �e�  t
 #!h


    ` �  
M
P 
"
AlyssaBenP X
"
redredPX
P X
(^P XJ ���
w1 e("/namefavorite_colorf(numbers"�B
"0:P :"
AlyssaBenP X:"
redredPX:5XX:(^P X@�NH ���" (R0��ORC

In [53]:
# Load the ORC sample through the generic format/load API and display it.
(
    spark.read
    .format('orc')
    .load('file:///usr/local/spark/examples/src/main/resources/users.orc')
    .show()
)

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [54]:
# Dump the Parquet sample as-is; the output is binary, with the embedded Avro
# schema and writer version readable near the end.
! cat /usr/local/spark/examples/src/main/resources/users.parquet

0      red 88,
      @   	         \Hexample.avro.User % name%  %favorite_color%  5 favorite_numbers %array <&% nameDH&  &P5 favorite_color<@&P  &�% (favorite_numbersarray
ZZ&�  � avro.schema�{"type":"record","name":"User","namespace":"example.avro","fields":[{"name":"name","type":"string"},{"name":"favorite_color","type":["string","null"]},{"name":"favorite_numbers","type":{"type":"array","items":"int"}}]} parquet-mr version 1.4.3 �  PAR1

In [57]:
# Read the Parquet sample with the dedicated reader method and display it.
(
    spark.read
    .parquet('file:///usr/local/spark/examples/src/main/resources/users.parquet')
    .show()
)

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



## JDBC

In [58]:
# Load the MySQL table `db.apps_countries` over JDBC.
# Credentials are taken from the environment so that no secret is stored in the
# notebook: export MYSQL_PASSWORD (and optionally MYSQL_USER, default 'root')
# before starting the kernel. A missing MYSQL_PASSWORD raises KeyError here
# rather than failing later inside the JDBC driver.
# Despite its name, `reader` holds the loaded DataFrame: the chain ends in .load().
reader = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://some-mysql") \
    .option("dbtable", 'db.apps_countries') \
    .option("user", os.environ.get('MYSQL_USER', 'root')) \
    .option("password", os.environ['MYSQL_PASSWORD']) \
    .option('driver', 'com.mysql.cj.jdbc.Driver') \
    .load()

In [62]:
# Interactive IPython docstring lookup for `substring` — a development aid that
# produces no reproducible output and can be dropped from a finished notebook.
substring?

In [65]:
# Add a column holding the first character of each country name.
# `substring` positions are 1-based, so the first character is at position 1;
# the previous position 0 is outside the documented range and only happened to
# yield the same result.
reader.withColumn('first_letter', substring('country_name', 1, 1)).show()

+---+------------+-------------------+------------+
| id|country_code|       country_name|first_letter|
+---+------------+-------------------+------------+
|  1|          AF|        Afghanistan|           A|
|  2|          AL|            Albania|           A|
|  3|          DZ|            Algeria|           A|
|  4|          DS|     American Samoa|           A|
|  5|          AD|            Andorra|           A|
|  6|          AO|             Angola|           A|
|  7|          AI|           Anguilla|           A|
|  8|          AQ|         Antarctica|           A|
|  9|          AG|Antigua and Barbuda|           A|
| 10|          AR|          Argentina|           A|
| 11|          AM|            Armenia|           A|
| 12|          AW|              Aruba|           A|
| 13|          AU|          Australia|           A|
| 14|          AT|            Austria|           A|
| 15|          AZ|         Azerbaijan|           A|
| 16|          BS|            Bahamas|           B|
| 17|       

## Caching

In [70]:
# Request caching of the JDBC-backed DataFrame and rebind `reader` to the result.
reader = reader.cache()

In [84]:
# Display the first rows of the cached DataFrame (show() prints the top 20 by default, as the output notes).
reader.show()

+---+------------+-------------------+
| id|country_code|       country_name|
+---+------------+-------------------+
|  1|          AF|        Afghanistan|
|  2|          AL|            Albania|
|  3|          DZ|            Algeria|
|  4|          DS|     American Samoa|
|  5|          AD|            Andorra|
|  6|          AO|             Angola|
|  7|          AI|           Anguilla|
|  8|          AQ|         Antarctica|
|  9|          AG|Antigua and Barbuda|
| 10|          AR|          Argentina|
| 11|          AM|            Armenia|
| 12|          AW|              Aruba|
| 13|          AU|          Australia|
| 14|          AT|            Austria|
| 15|          AZ|         Azerbaijan|
| 16|          BS|            Bahamas|
| 17|          BH|            Bahrain|
| 18|          BD|         Bangladesh|
| 19|          BB|           Barbados|
| 20|          BY|            Belarus|
+---+------------+-------------------+
only showing top 20 rows



In [86]:
# Request caching of the whole `badges` table (all columns).
badges = spark.table('badges').cache()

In [87]:
# Project a single column from the DataFrame cached with all of its columns.
badges.select('Id').show()

+-----+
|   Id|
+-----+
|82946|
|82947|
|82949|
|82950|
|82951|
|82952|
|82953|
|82954|
|82955|
|82956|
|82957|
|82958|
|82959|
|82960|
|82961|
|82962|
|82963|
|82964|
|82965|
|82966|
+-----+
only showing top 20 rows



In [88]:
# Alternative to caching the full table: cache only the projected `Id` column.
badges2 = spark.table('badges').select('Id').cache()

In [89]:
# Display the column-only cached DataFrame; the rows match the projection shown earlier.
badges2.show()

+-----+
|   Id|
+-----+
|82946|
|82947|
|82949|
|82950|
|82951|
|82952|
|82953|
|82954|
|82955|
|82956|
|82957|
|82958|
|82959|
|82960|
|82961|
|82962|
|82963|
|82964|
|82965|
|82966|
+-----+
only showing top 20 rows



In [91]:
# Release the cache held for the full `badges` DataFrame; the call returns the
# DataFrame itself, whose repr is displayed below.
badges.unpersist()

DataFrame[Id: int, UserId: int, Name: string, Date: timestamp, Class: int, TagBased: boolean]

In [92]:
# Build a new DataFrame with the same table + projection as `badges2` and
# unpersist it, without going through the `badges2` variable.
# NOTE(review): presumably this releases the cache entry created for `badges2`,
# on the assumption that Spark matches cached data by query plan rather than
# by Python object — confirm in the Spark UI Storage tab.
spark.table('badges').select('Id').unpersist()

DataFrame[Id: int]

## Reading from ES

```
es_index = spark.read.format('......es.loader').load('index-2019.09.25')
es_index.where(col('col1') == 4) # ---> pushed down to Elasticsearch as es:9200/index-2019.09.25/_search?query=col1:4
```

https://github.com/elastic/elasticsearch-hadoop

## Challenges with CSVs

In [98]:
# Read the CSV with a header row and no schema inference; the schema displayed
# below types every column as string.
spark.read.csv('file:///home/jovyan/work/sample.csv', header=True)

DataFrame[imie: string, nazwisko: string, miasto: string, lubi_sparka: string, wiek: string]

In [105]:
# Same read with inferSchema=True; the schema displayed below still lists every
# column as string.
# NOTE(review): this output has two more columns than the previous cell's, so
# sample.csv was presumably edited between the two executions — confirm.
spark.read.csv('file:///home/jovyan/work/sample.csv', header=True, inferSchema=True) 

DataFrame[imie: string, nazwisko: string, miasto: string, lubi_sparka: string, wiek: string, data_urodzenia: string, ulubione_liczby: string]

## Reading own Avro file

Check whether an Avro file created in pure Python is readable by Spark.

In [106]:
# Load the hand-written Avro file; the displayed schema shows Spark could parse it.
# NOTE(review): `Class` is a string here but an int in the `badges` table
# schema shown earlier — confirm whether that difference is intended.
spark.read.format('avro').load('file:///home/jovyan/work/data.avro')

DataFrame[Id: int, UserId: int, Name: string, Date: timestamp, Class: string, TagBased: boolean]

In [107]:
# Show size and timestamp of the Avro file, resolved relative to the notebook's
# working directory (presumably /home/jovyan/work, matching the path read above).
!ls -la data.avro

-rw-r--r-- 1 jovyan users 650 Sep 25 09:12 data.avro
