In [0]:
# Create a text widget with a default value, then read its value back
dbutils.widgets.text("myname", "Gopal")
name = dbutils.widgets.get("myname")
print("name", name)

# Drop the demo widget again
dbutils.widgets.remove("myname")

name Gopal


In [0]:
# Remove all widgets currently defined in this notebook
dbutils.widgets.removeAll()

In [0]:
# Convert CSV files into Parquet format
# Delta Lake: move data from Bronze to Silver
# Row-based CSV format to column-based Parquet

# Text widgets holding the source (bronze) and target (silver) locations
dbutils.widgets.text(
    "moviesPath",
    "abfss://bronze@gksdatalake.dfs.core.windows.net/movies/",
)
dbutils.widgets.text(
    "moviesTargetPath",
    "abfss://silver@gksdatalake.dfs.core.windows.net/movies/",
)

In [0]:
# Source location of the movies CSV files, taken from the moviesPath widget
MOVIES_PATH = dbutils.widgets.get("moviesPath")
print(MOVIES_PATH)

abfss://bronze@gksdatalake.dfs.core.windows.net/movies/


In [0]:
# `spark` is the SparkSession, the entry point for Spark SQL.
# Evaluating it as the cell's last expression displays the session object.
spark

<pyspark.sql.connect.session.SparkSession at 0x7f917c733290>

In [0]:
# Read the movies data from MOVIES_PATH, treating the first CSV line as the header.
# spark.read is BATCH PROCESSING.
# The read itself is lazily evaluated (a TRANSFORMATION).
moviesDf = (
    spark.read
    .option("header", True)
    .csv(MOVIES_PATH)
)
moviesDf.printSchema()  # used Job 1
moviesDf.show(5)  # ACTION, Job 2

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [0]:
# inferSchema is not recommended: it lets Spark build the schema by scanning the data.
# If you do use inferSchema, pair it with samplingRatio so that not all data is scanned:
#   samplingRatio = 0.01 scans 1% of the data, 0.1 scans 10% of the data.
# inferSchema with a full data scan can be very expensive (assume your input is 2 TB).
# The data read for inference won't be reused by a later action. EVERY ACTION IS INDEPENDENT.
# Tip: CTRL + / toggles comments.

movieDf = (
    spark.read
    .option("header", True)
    .option("inferSchema", "true")
    .option("samplingRatio", 0.01)
    .csv(MOVIES_PATH)
)

# Since no SCHEMA was given, printSchema executes an action to get column names and datatypes

movieDf.printSchema()

movieDf.show(5)  # ACTION function, creates a job

# Watch the job count: without header, with header, with inferSchema, without inferSchema.
# with header, without schema, with inferSchema = 3 jobs
#     job 1 - obtain the column names from the csv
#     job 2 - infer the schema, which scans the data to find the column datatypes
#     job 3 - for .show()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [0]:
# how to create schema programatically instead of using inferSchema
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

In [0]:
# Explicit schema with our own column names and datatypes.
# The third argument is the nullable flag: True = nullable, False = non-nullable.
movieSchema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

In [0]:
# Build movieDf with the schema we already have, to avoid the additional job and data scan.
# Since the schema is provided, header = true only skips the first line of the csv.
#
# Commented-out example, kept for reference:
# spark.conf.set("fs.azure.account.key.ugdatalake4.dfs.core.windows.net", "<your-storage-account-key>")
# spark\
#     .read\
#     .option("header", True)\
#     .schema(movieSchema)\
#     .csv(MOVIES_PATH)\
#     .show(5)

movieDf = (
    spark.read
    .schema(movieSchema)  # explicit schema, so no job is run to derive it
    .option("header", True)
    .csv(MOVIES_PATH)
)

movieDf.printSchema()
movieDf.show(5)
# make a note of number of jobs its create, compare with previous shell

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [0]:
# Convert the movie data to parquet as-is and move it to the
# movies directory of the silver container.
# The target location is supplied through the moviesTargetPath widget.
MOVIE_TARGET_PATH = dbutils.widgets.get("moviesTargetPath")

# mode "overwrite" deletes old data; write is an ACTION and triggers a job
movieDf.write.parquet(MOVIE_TARGET_PATH, mode="overwrite")

In [0]:
import json

# Summary of this run: where the CSV was read from, where the parquet was written, and the status
manifest = dict(
    inputCsvPath=MOVIES_PATH,
    resultParquetPath=MOVIE_TARGET_PATH,
    result="SUCCESS",
)

# Serialise once and reuse the same string for the cell output and the notebook exit value
manifestJson = json.dumps(manifest)

print(manifestJson)

dbutils.notebook.exit(manifestJson)