### Libraries and session

In [4]:
import pyspark
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DateType, DecimalType
from pyspark.sql.functions import year, to_date, month, dayofmonth,  from_unixtime, unix_timestamp

In [5]:
# ip and environments
environment = 'prd'

# Source
system_source = "bscs"
system_table ="bscs_mpusptab"

# Set the bucket and folder paths
source_bucket = 'landing-zone'
source_folder = f'database/{system_source}/{system_table}'

lakehouse_bucket = 'lakehouse'
lakehouse_folder = 'iceberg'

# table destination settings
dest_db_catalog = 'iceberg'
dest_db_schema = 'bronze'
dest_db_table = system_table
dest_final_db = f'{dest_db_catalog}.{dest_db_schema}'
dest_final_table = f'{dest_final_db}.{dest_db_table}'

# Spark identification and settings
appname = f'BRONZE_{dest_final_db}.{dest_final_table}'
log_level = 'WARN' # Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN

# Set your MinIO credentials
s3_endpoint = 'http://minio:9000'
s3_access_key = 'minio'
s3_secret_key = 'minio123'

In [6]:
spark = SparkSession.builder\
    .appName(appname)\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/18 10:50:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
print("=================================================")
spark.sparkContext.setLogLevel(log_level)
print(pyspark.SparkConf().getAll())

[('spark.hadoop.hive.cli.print.header', 'true'), ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'), ('spark.hadoop.fs.s3a.path.style.access', 'true'), ('spark.app.submitTime', '1710759026296'), ('spark.sql.catalog.spark_catalog', 'org.apache.iceberg.spark.SparkSessionCatalog'), ('spark.hadoop.fs.s3.endpoint', 'http://minio:9000'), ('spark.hadoop.fs.s3.access.key', 'minio'), ('spark.hadoop.fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'), ('spark.sql.catalog.iceberg.uri', 'thrift://hive-metastore:9083'), ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'), ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'), ('spark.hadoop.fs.s3.path.style.access', 'true'), ('spark.hive.metastore.uris', 'thrift://hive-metastore:9083'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.hadoop.fs.s3a.access.key', 'minio'), ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'), ('spark.app.name', 'BRONZE_iceberg.bronze.iceberg.b

### Read from the source

In [8]:
s3 = boto3.client('s3', endpoint_url=s3_endpoint, aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key)

In [9]:
# List all files in the source directory
file_list = []
paginator = s3.get_paginator('list_objects_v2')

for result in paginator.paginate(Bucket=f"{environment}-{source_bucket}", Prefix=source_folder):
    
    if 'Contents' in result:
        for item in result['Contents']:
            file_list.append(item['Key'])


#### Data Contract

In [10]:
df_source_schema = StructType([
                    StructField("spcode", StringType()),
                    StructField("des", StringType()),
                    StructField("shdes", StringType()),
                    StructField("sptype", StringType()),
                    StructField("rec_version", StringType()),
                    StructField("dwh_etl_history_fk", StringType()),
                    StructField("flg_processed", StringType()),
                    StructField("flg_error", StringType()),
                    StructField("error_desc", StringType()),
                    StructField("stg_record_load_date", StringType())
                ])

In [11]:
num_columns_contract = len(df_source_schema.fields)
print("Number of columns of contract:", num_columns_contract)

Number of columns of contract: 10


In [12]:
df_source_data = spark.createDataFrame([], schema=df_source_schema)

In [13]:
# reading files in the source
for file_name in file_list:

    print(f'File in processing: {file_name}')
    
    df = spark.read.format("csv") \
                    .option("header", "true") \
                    .option("delimiter", ",") \
                    .schema(df_source_schema) \
                    .load(f"s3a://{environment}-{source_bucket}/{file_name}")
    
    # df.show(5)
    
    if len(df.columns) == num_columns_contract:
        print('No of columns matched')
        df_source_data = df_source_data.union(df)

File in processing: database/bscs/bscs_mpusptab/20240315/0004f819-7ecd-4779-95a5-fb5de52a4363.csv


24/03/18 10:50:41 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/000c3f64-8096-4eb4-901e-e9f86a60be80.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/00174dbf-4ff2-4d4f-82b5-8c871b223b3c.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/003d43d8-ab2f-4516-b83f-be0364ed8c9d.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/006a7604-1b80-4cc4-8ca4-29bba5815c21.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/00b61383-5a18-4332-98da-466301fbce90.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/00f816dc-6c15-44a1-a32a-b288dd326437.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/0165555e-dc55-4f17-88a2-1cc1b08e1f07.csv
No of columns matched
File in processing: database/bscs/bscs_mpusptab/20240315/01a67c03-5ce3-4d95-bb5b-a1deaee38ef9.csv
No of columns matched
File in processing

In [14]:
# print("No of lines to load: ", len(df_source_data))
# df_source_data.show(10)

In [15]:
# df_source_data.describe()

### DDL on lakehouse

#### Data base

In [16]:
##creating db
sql_db_create = f"""
CREATE DATABASE IF NOT EXISTS {dest_final_db} COMMENT '' LOCATION 's3a://{environment}-{lakehouse_bucket}/{dest_db_catalog}/{dest_db_schema}/'
"""
print(sql_db_create)
spark.sql(sql_db_create)


CREATE DATABASE IF NOT EXISTS iceberg.bronze COMMENT '' LOCATION 's3a://prd-lakehouse/iceberg/bronze/'



DataFrame[]

#### Dest table

In [17]:
sql_ddl_drop_table = f"""
    DROP TABLE IF EXISTS {dest_final_table}
"""

In [18]:
sql_ddl_create_table = f"""
        create table if not exists {dest_final_table}
        (
			spcode string,
			des string,
			shdes string,
			sptype string,
			rec_version string,
			dwh_etl_history_fk string,
			flg_processed string,
			flg_error string,
			error_desc string,
			stg_record_load_date string
        ) 
        using iceberg
        """        

#### SQL DDL Execution

In [19]:
## drop table
spark.sql(sql_ddl_drop_table)

## create table
spark.sql(sql_ddl_create_table)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


DataFrame[]

### Small transformation

In [20]:
# # some transformations
#     df = df.withColumn("duration", df["duration"].cast("double"))
#     # df = df.withColumn("event_date", to_date(df["record_opening_time"], "yyyyMMddHHmmss"))
#     # to_date(df["record_opening_time"], "yyyyMMddHHmmss")

#     df.withColumn("event_date", from_unixtime(unix_timestamp("record_opening_time", "yyyyMMddHHmmss")))

#     df.select('event_date').show()

### Write table

In [21]:
# wrintint the data on lakehouse
df_source_data.writeTo(f'{dest_final_table}').append()

24/03/18 10:52:53 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
                                                                                

In [22]:
table = spark.table(f'{dest_final_table}')
print(table.printSchema())
print(f"No of Records: {table.count()}")

root
 |-- spcode: string (nullable = true)
 |-- des: string (nullable = true)
 |-- shdes: string (nullable = true)
 |-- sptype: string (nullable = true)
 |-- rec_version: string (nullable = true)
 |-- dwh_etl_history_fk: string (nullable = true)
 |-- flg_processed: string (nullable = true)
 |-- flg_error: string (nullable = true)
 |-- error_desc: string (nullable = true)
 |-- stg_record_load_date: string (nullable = true)

None
No of Records: 244426
