In [1]:
import datalabframework as dlf

In [3]:
dlf.project.profile('test')
dlf.utils.pretty_print(dlf.params.metadata())

engines:
  spark:
    config:
      jobname: default
      master: spark://spark-master:7077
      packages:
      - com.oracle:ojdbc14:10.2.0.4.0
    context: spark
loggers:
  stream:
    enable: true
    severity: info
profile: test
providers:
  ingest:
    format: parquet
    hostname: hdfs-nn
    path: /data/ingest
    service: hdfs
    write:
      coalesce: 2
      options:
        mode: append
        partitionBy:
        - date
      repartition: 4
  source:
    database: MMSOFF
    hostname: 172.16.60.18
    password: qazwsxedcrfv
    port: 1521
    read:
      cache: true
      repartition: 4
    service: oracle
    username: sys
resources:
  .in:
    path: actor
    provider: source
  .out:
    path: actor
    provider: ingest
    write:
      option:
        mode: append
        partitionBy:
        - date
variables:
  a: 5
  b: hello



In [4]:
engine = dlf.engines.get('spark')
spark = engine.context()

PYSPARK_SUBMIT_ARGS:  --packages com.oracle:ojdbc14:10.2.0.4.0 pyspark-shell


Exception: Java gateway process exited before sending its port number

### MYSQL

In [4]:
# read by resource alias
df_src = engine.read('in')
df_src.show(5)

repartition  4
coalesce  None
cache True
jdbc:mysql://mysql:3306/sakila
+--------+----------+---------+-------------------+
|actor_id|first_name|last_name|        last_update|
+--------+----------+---------+-------------------+
|     105|    SIDNEY|    CROWE|2006-02-15 04:34:33|
|     172|   GROUCHO| WILLIAMS|2006-02-15 04:34:33|
|      74|     MILLA|   KEITEL|2006-02-15 04:34:33|
|      48|   FRANCES|DAY-LEWIS|2006-02-15 04:34:33|
|      65|    ANGELA|   HUDSON|2006-02-15 04:34:33|
+--------+----------+---------+-------------------+
only showing top 5 rows



In [5]:
df_src.rdd.getNumPartitions()

4

In [6]:
# read by resource path and provider
df_src = engine.read(path='actor', provider='source')
df_src.show(5)

repartition  4
coalesce  None
cache True
jdbc:mysql://mysql:3306/sakila
+--------+----------+---------+-------------------+
|actor_id|first_name|last_name|        last_update|
+--------+----------+---------+-------------------+
|     105|    SIDNEY|    CROWE|2006-02-15 04:34:33|
|     172|   GROUCHO| WILLIAMS|2006-02-15 04:34:33|
|      74|     MILLA|   KEITEL|2006-02-15 04:34:33|
|      48|   FRANCES|DAY-LEWIS|2006-02-15 04:34:33|
|      65|    ANGELA|   HUDSON|2006-02-15 04:34:33|
+--------+----------+---------+-------------------+
only showing top 5 rows



In [7]:
from pyspark.sql import functions as F
df = df_src.withColumn('date', F.to_date('last_update'))
df.show(5)

+--------+----------+---------+-------------------+----------+
|actor_id|first_name|last_name|        last_update|      date|
+--------+----------+---------+-------------------+----------+
|     105|    SIDNEY|    CROWE|2006-02-15 04:34:33|2006-02-15|
|     172|   GROUCHO| WILLIAMS|2006-02-15 04:34:33|2006-02-15|
|      74|     MILLA|   KEITEL|2006-02-15 04:34:33|2006-02-15|
|      48|   FRANCES|DAY-LEWIS|2006-02-15 04:34:33|2006-02-15|
|      65|    ANGELA|   HUDSON|2006-02-15 04:34:33|2006-02-15|
+--------+----------+---------+-------------------+----------+
only showing top 5 rows



### HDFS

In [8]:
# write by resource alias 
engine.write(df, 'out', mode='overwrite')

repartition  4
coalesce  2
cache False
hdfs://hdfs-nn:8020//data/ingest/actor


In [9]:
# write by resource path and provider alias
engine.write(df, path='actor', provider='ingest', partitionBy=['date'], mode='overwrite')

repartition  4
coalesce  2
cache False
hdfs://hdfs-nn:8020//data/ingest/actor


## Post write checks

In [10]:
# read back from hdfs in parquet format
df_trg = engine.read('out', 'ingest')

repartition  None
coalesce  None
cache False
hdfs://hdfs-nn:8020//data/ingest/actor


In [12]:
assert(df.count()==df_trg.count())
assert(df.subtract(df_trg).count()==0)