# Basic Spark ETL program (load, fix data types, remove duplicates, write and read)

# To run all steps, use the menu Cell > Run All

# *

# Starting

# *





## Import libraries and initialize Spark

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import functions as F
from pyspark.sql import Window
import uuid
from datetime import datetime
# Spark configuration: application name "ETL", running on a single local master.
conf = SparkConf().setAppName("ETL").setMaster("local")
# getOrCreate() reuses an already-active context, so re-running this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# Entry point used by the rest of the notebook (spark.read / spark.sql).
spark = SQLContext(sc)
# Handle to the JVM-side log4j package; not used by the other visible cells.
logger = spark._jvm.org.apache.log4j
print(sc)
print(spark)

<SparkContext master=local appName=ETL>
<pyspark.sql.context.SQLContext object at 0x7efca4217c90>


## Method to convert data types according to the mapping

In [2]:
def fix_dtypes(df, list_col_dtypes):
    """Trim each column of ``df`` and cast it to the type named in the mapping.

    Parameters
    ----------
    df : DataFrame
        Spark DataFrame whose columns are converted one by one.
    list_col_dtypes : dict
        Maps column name -> target Spark type name (e.g. ``"integer"``).

    Returns
    -------
    DataFrame
        A DataFrame with the same columns, trimmed and cast. Columns whose
        target type is neither numeric (``double``/``integer``/``float``/
        ``decimal...``) nor ``string``/``timestamp``/``boolean`` are returned
        unchanged.

    Raises
    ------
    ValueError
        If a column of ``df`` has no entry in ``list_col_dtypes``.
    """
    for column, current_dtype in df.dtypes:
        target_dtype = list_col_dtypes.get(column)
        if target_dtype is None:
            # Report the column's actual dtype; the looked-up target is always
            # None here and would tell the reader nothing.
            raise ValueError(
                "Column {} type {} is not in the mapping, please verify.".format(column, current_dtype)
            )

        trimmed = F.trim(F.col(column))
        if target_dtype in ("double", "integer", "float") or "decimal" in target_dtype:
            # Null numeric values become "0" before the cast so they end up as 0
            # rather than null.
            df = df.withColumn(column, F.when(trimmed.isNull(), "0").otherwise(trimmed))
            df = df.withColumn(column, F.col(column).cast(target_dtype))
        elif target_dtype in ("string", "timestamp", "boolean"):
            df = df.withColumn(column, trimmed.cast(target_dtype))

    return df

## Load the CSV file that was written by another program (write_csv.ipynb)

In [3]:
def load_csv_spark(path):
    """Read the CSV at ``path`` with the notebook's ``spark`` context.

    The first line of the file is used as the header; the resulting
    DataFrame is returned.
    """
    return spark.read.csv(path, header=True)

## Load the mapping with the correct data types, the primary key of the DataFrame, and the column that guarantees the uniqueness of each row.

In [4]:
def load_mapping():
    """Return the static mapping used by the pipeline.

    Returns
    -------
    list
        ``[primary_key, col_name_last_update, list_col_dtypes]`` where
        ``primary_key`` is a list of column names, ``col_name_last_update`` is
        the name of the timestamp column, and ``list_col_dtypes`` maps each
        column name to its target type name.
    """
    column_types = dict(
        id="integer",
        randow="string",
        update_timestamp="timestamp",
        boolean="boolean",
    )
    return [["id"], "update_timestamp", column_types]

## Method to remove duplicates from the DataFrame

In [5]:
def fix_duplicity(df, list_col_dtypes, primary_key, col_name_last_update):
    """Cast ``df`` via ``fix_dtypes`` and keep one row per primary key.

    Parameters
    ----------
    df : DataFrame
        Input Spark DataFrame, possibly containing several rows per key.
    list_col_dtypes : dict
        Column name -> target type name, forwarded to ``fix_dtypes``.
    primary_key : list
        Column names that identify a record (used as the window partition).
    col_name_last_update : str
        Column used to order the rows that share the same key.

    Returns
    -------
    DataFrame
        The typed DataFrame reduced to the most recently updated row of each
        primary key.
    """
    df = fix_dtypes(df, list_col_dtypes)
    # Newest row first inside each key group, so row_number() == 1 is the latest
    # update. Ascending order would keep the oldest row, and a null timestamp
    # would sort first and win.
    window = Window. \
        partitionBy(primary_key). \
        orderBy(df[col_name_last_update].desc())
    ranked = df.withColumn("rank_tmp", F.row_number().over(window))
    # Keep only the top-ranked row and drop the helper column again.
    return ranked.filter("rank_tmp=1").drop("rank_tmp")

## Write a Parquet file to the path

In [6]:
def write_parquet(df, path):
    """Write ``df`` as Parquet to ``path`` using the ``overwrite`` save mode."""
    writer = df.write.mode('overwrite')
    writer.parquet(path)

## Read a Parquet file from the path

In [7]:
def read_parquet(path):
    """Load the Parquet data at ``path`` with the notebook's ``spark`` context
    and return it as a DataFrame."""
    return spark.read.parquet(path)

## Process method that runs the whole pipeline; its execution time is measured in the next cell

In [8]:
def process(number_rows, path_csv=None,
            path_parquet="/home/jovyan/work/artifacts/data/output/spark/"):
    """Run the full pipeline: load CSV, fix types, deduplicate, write and re-read Parquet.

    Parameters
    ----------
    number_rows : int
        Row count that is part of the default input file name
        (``generated_<number_rows>_rows.csv``).
    path_csv : str, optional
        Input CSV path. When omitted, the default location under
        ``/home/jovyan/work/artifacts/data/input/users/`` is built from
        ``number_rows``.
    path_parquet : str, optional
        Directory the Parquet output is written to and read back from.

    Returns
    -------
    DataFrame
        The DataFrame read back from ``path_parquet``.
    """
    if path_csv is None:
        path_csv = "/home/jovyan/work/artifacts/data/input/users/generated_{}_rows.csv".format(number_rows)

    # Read file in storage
    df = load_csv_spark(path_csv)

    # Load mapping
    primary_key, col_name_last_update, list_col_dtypes = load_mapping()

    # Fix data types and remove duplicated rows
    df = fix_duplicity(df, list_col_dtypes, primary_key, col_name_last_update)

    # Write parquet, then read it back so the returned frame is backed by the output files
    write_parquet(df, path_parquet)
    return read_parquet(path_parquet)

## Call process, passing the number of rows that you wrote to the CSV file earlier.

In [9]:
# Run the whole pipeline for the 100-row input file and print the elapsed time.
start = datetime.now()
df = process(number_rows=100)
end = datetime.now()
print(end - start)

0:00:12.231312


# Now we will use the output DataFrame to explore some commands

## This shows the data ordered by a column

In [10]:
# Time an ordered display: alias the frame as "a", sort by a.id and show up to
# 100 rows without truncating the column values.
start = datetime.now()
df.alias("a").orderBy(F.col("a.id")).show(100, False)
end = datetime.now()
print(end - start)

+---+------------------------------------+--------------------------+-------+
|id |randow                              |update_timestamp          |boolean|
+---+------------------------------------+--------------------------+-------+
|0  |43d8cb63-1517-4a88-be00-64765230cdda|2020-06-10 09:32:30.348379|false  |
|1  |2a907d55-6d08-456d-ae29-365f5f487334|2020-06-10 09:32:30.348444|true   |
|2  |7386f1cd-f447-4d2f-a21b-08dd9551322b|2020-06-10 09:32:30.348609|false  |
|3  |5ea8c896-4e26-4f74-a452-536acbffa2a7|2020-06-10 09:32:30.348628|true   |
|4  |7e7c8723-7302-489d-85cf-1814375d4a17|2020-06-10 09:32:30.34865 |false  |
|5  |d724459d-d4ac-4301-b201-a511c62327bd|2020-06-10 09:32:30.348672|true   |
|6  |77be4393-af62-4c7c-8f9d-d1961c5930e5|2020-06-10 09:32:30.348682|false  |
|7  |8471d318-1db3-48ad-b848-f1d47be325ae|2020-06-10 09:32:30.348691|true   |
|8  |c33c5742-1ea3-43e6-9b5f-9b933e0d1d05|2020-06-10 09:32:30.3487  |false  |
|9  |dade68f6-cbb5-4e1f-8299-3ef2e3602a6c|2020-06-10 09:32:30.34

## This shows how to turn the DataFrame df into an in-memory table (temporary view)

In [11]:
# Register the DataFrame as the temporary view "df" so it can be queried with
# SQL, and print how long the registration took.
start = datetime.now()
df.createOrReplaceTempView("df")
end = datetime.now()
print(end - start)

0:00:00.037403


## This script uses the in-memory table created above. You can use standard SQL

In [12]:
# Query the temporary view "df" with plain SQL, show up to 100 rows without
# truncation, and print the elapsed time.
start = datetime.now()
spark.sql("select * from df order by id").show(100,False)
end = datetime.now()
print(end - start)

+---+------------------------------------+--------------------------+-------+
|id |randow                              |update_timestamp          |boolean|
+---+------------------------------------+--------------------------+-------+
|0  |43d8cb63-1517-4a88-be00-64765230cdda|2020-06-10 09:32:30.348379|false  |
|1  |2a907d55-6d08-456d-ae29-365f5f487334|2020-06-10 09:32:30.348444|true   |
|2  |7386f1cd-f447-4d2f-a21b-08dd9551322b|2020-06-10 09:32:30.348609|false  |
|3  |5ea8c896-4e26-4f74-a452-536acbffa2a7|2020-06-10 09:32:30.348628|true   |
|4  |7e7c8723-7302-489d-85cf-1814375d4a17|2020-06-10 09:32:30.34865 |false  |
|5  |d724459d-d4ac-4301-b201-a511c62327bd|2020-06-10 09:32:30.348672|true   |
|6  |77be4393-af62-4c7c-8f9d-d1961c5930e5|2020-06-10 09:32:30.348682|false  |
|7  |8471d318-1db3-48ad-b848-f1d47be325ae|2020-06-10 09:32:30.348691|true   |
|8  |c33c5742-1ea3-43e6-9b5f-9b933e0d1d05|2020-06-10 09:32:30.3487  |false  |
|9  |dade68f6-cbb5-4e1f-8299-3ef2e3602a6c|2020-06-10 09:32:30.34

## This script counts the rows of the DataFrame

In [13]:
# Count the rows of the DataFrame, print the total, then print the elapsed time.
start = datetime.now()
print("number of line on the DF {}".format(df.count()))
end = datetime.now()
print(end - start)

number of line on the DF 100
0:00:00.275248
