### Libraries and session

In [24]:
import pyspark
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DateType, DecimalType
from pyspark.sql.functions import year, to_date, month, dayofmonth,  from_unixtime, unix_timestamp

In [23]:
# ip and environments
environment = 'prd'

# Source
system_source = "bscs"
system_table ="bscs_mpusntab"

# Set the bucket and folder paths
source_bucket = 'landing-zone'
source_folder = f'database/{system_source}/{system_table}'

lakehouse_bucket = 'lakehouse'
lakehouse_folder = 'iceberg'

# table destination settings
dest_db_catalog = 'iceberg'
dest_db_schema = 'bronze'
dest_db_table = system_table
dest_final_db = f'{dest_db_catalog}.{dest_db_schema}'
dest_final_table = f'{dest_final_db}.{dest_db_table}'

# Spark identification and settings
appname = f'BRONZE_{dest_final_db}.{dest_final_table}'
log_level = 'WARN' # Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN

# Set your MinIO credentials
s3_endpoint = 'http://minio:9000'
s3_access_key = 'minio'
s3_secret_key = 'minio123'

In [27]:
spark = SparkSession.builder\
    .appName(appname)\
    .getOrCreate()

24/03/18 10:56:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [25]:
print("=================================================")
spark.sparkContext.setLogLevel(log_level)
print(pyspark.SparkConf().getAll())

[('spark.hadoop.hive.cli.print.header', 'true'), ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'), ('spark.hadoop.fs.s3a.path.style.access', 'true'), ('spark.app.submitTime', '1710759026296'), ('spark.sql.catalog.spark_catalog', 'org.apache.iceberg.spark.SparkSessionCatalog'), ('spark.hadoop.fs.s3.endpoint', 'http://minio:9000'), ('spark.hadoop.fs.s3.access.key', 'minio'), ('spark.hadoop.fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'), ('spark.sql.catalog.iceberg.uri', 'thrift://hive-metastore:9083'), ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'), ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'), ('spark.hadoop.fs.s3.path.style.access', 'true'), ('spark.hive.metastore.uris', 'thrift://hive-metastore:9083'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.hadoop.fs.s3a.access.key', 'minio'), ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'), ('spark.app.name', 'BRONZE_iceberg.bronze.iceberg.b

### Read from the source

In [26]:
s3 = boto3.client('s3', endpoint_url=s3_endpoint, aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key)

In [28]:
# List all files in the source directory
file_list = []
paginator = s3.get_paginator('list_objects_v2')

for result in paginator.paginate(Bucket=f"{environment}-{source_bucket}", Prefix=source_folder):
    
    if 'Contents' in result:
        for item in result['Contents']:
            file_list.append(item['Key'])


#### Data Contract

In [29]:
df_source_schema = StructType([
                        StructField("sncode", StringType()),
                        StructField("des", StringType()),
                        StructField("shdes", StringType()),
                        StructField("snind", StringType()),
                        StructField("rec_version", StringType()),
                        StructField("dwh_etl_history_fk", StringType()),
                        StructField("flg_processed", StringType()),
                        StructField("flg_error", StringType()),
                        StructField("error_desc", StringType()),
                        StructField("stg_record_load_date", StringType())
                ])

In [30]:
num_columns_contract = len(df_source_schema.fields)
print("Number of columns of contract:", num_columns_contract)

Number of columns of contract: 10


In [31]:
df_source_data = spark.createDataFrame([], schema=df_source_schema)

In [32]:
# reading files in the source
for file_name in file_list:

    print(f'File in processing: {file_name}')
    
    df = spark.read.format("csv") \
                    .option("header", "true") \
                    .option("delimiter", ",") \
                    .schema(df_source_schema) \
                    .load(f"s3a://{environment}-{source_bucket}/{file_name}")
    
    # df.show(5)
    
    if len(df.columns) == num_columns_contract:
        print('No of columns matched')
        df_source_data = df_source_data.union(df)

File in processing: database/bscs/bscs_mpusntab/20240315/00615c07-c7f0-45d1-b740-352e7bd6f6cd.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/02b7687a-f53f-4636-bee6-d592ce950bde.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/02f8c896-27b4-44ee-ae07-09f087b61a2b.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/0334be5a-98f6-4596-a518-773d644cbc9a.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/03367321-ebde-4de3-8d4e-c208187e40e8.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/03c6eafc-27b9-494a-8993-aacc95c6c415.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/03ca5dc9-5373-492e-a332-b28958a71fc4.csv
No of columns matched
File in processing: database/bscs/bscs_mpusntab/20240315/069cd2fe-8bac-4750-83d4-5d68f50b9b84.csv
No of columns matched
File in processing: database/bscs/bscs_m

In [33]:
# print("No of lines to load: ", len(df_source_data))
df_source_data.show(10)

24/03/18 10:56:39 WARN DAGScheduler: Broadcasting large task binary with size 1104.1 KiB
24/03/18 10:56:40 WARN DAGScheduler: Broadcasting large task binary with size 1104.1 KiB
                                                                                

+------+--------------------+-----+-----+-----------+------------------+-------------+---------+----------+--------------------+
|sncode|                 des|shdes|snind|rec_version|dwh_etl_history_fk|flg_processed|flg_error|error_desc|stg_record_load_date|
+------+--------------------+-----+-----+-----------+------------------+-------------+---------+----------+--------------------+
|   814|V5P50-240 any+fre...|V5P52|    Y|          2|              null|         null|     null|      null|                null|
|   815|V5P50-240 any+fre...|V5P53|    Y|          2|              null|         null|     null|      null|                null|
|   816|V5P50-800 V&F+fre...|V5P58|    Y|          2|              null|         null|     null|      null|                null|
|   817|V5P50-800 V&F+fre...|V5P59|    Y|          2|              null|         null|     null|      null|                null|
|   818|V5100any-P20-200V...|V5122|    Y|          2|              null|         null|     null| 

In [34]:
df_source_data.describe()

DataFrame[summary: string, sncode: string, des: string, shdes: string, snind: string, rec_version: string, dwh_etl_history_fk: string, flg_processed: string, flg_error: string, error_desc: string, stg_record_load_date: string]

### DDL on lakehouse

#### Data base

In [35]:
##creating db
sql_db_create = f"""
CREATE DATABASE IF NOT EXISTS {dest_final_db} COMMENT '' LOCATION 's3a://{environment}-{lakehouse_bucket}/{dest_db_catalog}/{dest_db_schema}/'
"""
print(sql_db_create)
spark.sql(sql_db_create)


CREATE DATABASE IF NOT EXISTS iceberg.bronze COMMENT '' LOCATION 's3a://prd-lakehouse/iceberg/bronze/'



DataFrame[]

#### Dest table

In [36]:
sql_ddl_drop_table = f"""
    DROP TABLE IF EXISTS {dest_final_table}
"""

In [37]:
sql_ddl_create_table = f"""
        create table if not exists {dest_final_table}
        (
			sncode string,
			des string,
			shdes string,
			snind string,
			rec_version string,
			dwh_etl_history_fk string,
			flg_processed string,
			flg_error string,
			error_desc string,
			stg_record_load_date string
        ) 
        using iceberg
        """        

#### SQL DDL Execution

In [38]:
## drop table
spark.sql(sql_ddl_drop_table)

## create table
spark.sql(sql_ddl_create_table)

DataFrame[]

### Small transformation

In [39]:
# # some transformations
#     df = df.withColumn("duration", df["duration"].cast("double"))
#     # df = df.withColumn("event_date", to_date(df["record_opening_time"], "yyyyMMddHHmmss"))
#     # to_date(df["record_opening_time"], "yyyyMMddHHmmss")

#     df.withColumn("event_date", from_unixtime(unix_timestamp("record_opening_time", "yyyyMMddHHmmss")))

#     df.select('event_date').show()

### Write table

In [40]:
# wrintint the data on lakehouse
df_source_data.writeTo(f'{dest_final_table}').append()

                                                                                

In [41]:
table = spark.table(f'{dest_final_table}')
print(table.printSchema())
print(f"No of Records: {table.count()}")

root
 |-- sncode: string (nullable = true)
 |-- des: string (nullable = true)
 |-- shdes: string (nullable = true)
 |-- snind: string (nullable = true)
 |-- rec_version: string (nullable = true)
 |-- dwh_etl_history_fk: string (nullable = true)
 |-- flg_processed: string (nullable = true)
 |-- flg_error: string (nullable = true)
 |-- error_desc: string (nullable = true)
 |-- stg_record_load_date: string (nullable = true)

None
No of Records: 245705
