In [1]:
#partition by datetime

In [2]:
import boto3

In [3]:
# ip and environments
environment = 'prd'

# Set the bucket and folder paths
source_bucket = 'landing-zone'
source_folder = 'files/pscore/ascll'

lakehouse_bucket = 'lakehouse' 
lakehouse_folder = 'bronze'

# table destination settings
dest_db_catalog = 'iceberg'
dest_db_schema = 'pscore'
dest_db_table = 'sgw'
dest_final_db = f'{dest_db_catalog}.{dest_db_schema}'
dest_final_table = f'{dest_final_db}.{dest_db_table}'

# Spark identification and settings
appname = 'SGW_from_landing_to_bronze'
log_level = 'WARN' # Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN

# Set your MinIO credentials
s3_endpoint = 'http://minio:9000'
s3_access_key = 'minio'
s3_secret_key = 'minio123'

In [4]:
import pyspark
# import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql.functions import year, to_date, month, dayofmonth,  from_unixtime, unix_timestamp


In [5]:
spark = SparkSession.builder\
    .appName(appname)\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/21 07:54:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
spark.sparkContext.setLogLevel(log_level)
print(pyspark.SparkConf().getAll())

[('spark.hadoop.hive.cli.print.header', 'true'), ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'), ('spark.hadoop.fs.s3a.path.style.access', 'true'), ('spark.app.submitTime', '1711007643600'), ('spark.sql.catalog.spark_catalog', 'org.apache.iceberg.spark.SparkSessionCatalog'), ('spark.hadoop.fs.s3.endpoint', 'http://minio:9000'), ('spark.app.name', 'SGW_from_landing_to_bronze'), ('spark.hadoop.fs.s3.access.key', 'minio'), ('spark.hadoop.fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'), ('spark.sql.catalog.iceberg.uri', 'thrift://hive-metastore:9083'), ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'), ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'), ('spark.hadoop.fs.s3.path.style.access', 'true'), ('spark.hive.metastore.uris', 'thrift://hive-metastore:9083'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.hadoop.fs.s3a.access.key', 'minio'), ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'), (

In [7]:
schema = StructType([
                    StructField("record_type", StringType()),
                    StructField("network_initiated_pdp_context", StringType()),
                    StructField("imsi", StringType()),
                    StructField("msisdn", StringType()),
                    StructField("imei", StringType()),
                    StructField("charging_id", StringType()),
                    StructField("ggsn_pgw_address", StringType()),
                    StructField("sgsn_sgw_address", StringType()),
                    StructField("ms_nw_capability", StringType()),
                    StructField("pdp_pdn_type", StringType()),
                    StructField("served_pdp_address", StringType()),
                    StructField("dynamic_address_flag", StringType()),
                    StructField("access_point_name_ni", StringType()),
                    StructField("record_sequence_number", StringType()),
                    StructField("record_sequence_number_meg", StringType()),
                    StructField("node_id", StringType()),
                    StructField("local_sequence_number", StringType()),
                    StructField("charging_characteristics", StringType()),
                    StructField("record_opening_time", StringType()),
                    StructField("duration", StringType()),
                    StructField("rat_type", StringType()),
                    StructField("cause_for_record_closing", StringType()),
                    StructField("diagnostic", StringType()),
                    StructField("volume_uplink", StringType()),
                    StructField("volume_downlink", StringType()),
                    StructField("total_volume", StringType()),
                    StructField("lac_or_tac", StringType()),
                    StructField("ci_or_eci", StringType()),
                    StructField("rac", StringType()),
                    StructField("rnc_unsent_data_volume", StringType()),
                    StructField("req_alloc_ret_priority", StringType()),
                    StructField("neg_alloc_ret_priority", StringType()),
                    StructField("req_traffic_class", StringType()),
                    StructField("neg_traffic_class", StringType()),
                    StructField("qci", StringType()),
                    StructField("req_max_bitrate_uplink", StringType()),
                    StructField("req_max_bitrate_downlink", StringType()),
                    StructField("req_guar_bitrate_uplink", StringType()),
                    StructField("req_guar_bitrate_downlink", StringType()),
                    StructField("neg_max_bitrate_uplink", StringType()),
                    StructField("neg_max_bitrate_downlink", StringType()),
                    StructField("neg_guar_bitrate_uplink", StringType()),
                    StructField("neg_guar_bitrate_downlink", StringType()),
                    StructField("mccmnc", StringType()),
                    StructField("country_name", StringType()),
                    StructField("input_filename", StringType()),
                    StructField("output_filename", StringType()),
                    StructField("event_date", TimestampType())
                ])

In [8]:
spark.sql(f"""
CREATE DATABASE IF NOT EXISTS iceberg.raw COMMENT '' LOCATION 's3a://prd-lakehouse/iceberg/raw/'
""")

DataFrame[]

In [9]:
show_databases_df = spark.sql("SHOW CATALOGS")
show_databases_df.show()

+-------------+
|      catalog|
+-------------+
|      iceberg|
|spark_catalog|
+-------------+



In [10]:
show_databases_df = spark.sql("SHOW DATABASES")
show_databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|      raw|
+---------+



In [11]:
show_databases_df = spark.sql("SHOW TABLES")
show_databases_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [16]:
# Execute SHOW DATABASES query for Iceberg catalog
show_iceberg_databases_df = spark.sql("SHOW TABLES IN raw")

# Show the result
show_iceberg_databases_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|      raw|      sgw|      false|
+---------+---------+-----------+



In [13]:
spark.sql(f"""
CREATE TABLE IF NOT EXISTS iceberg.raw.sgw 
""")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


Py4JJavaError: An error occurred while calling o49.sql.
: org.apache.iceberg.exceptions.NotFoundException: Location does not exist: s3a://datalake/iceberg/raw/sgw/metadata/00005-bbe479c2-2016-4899-a8ad-26b426588061.metadata.json
	at org.apache.iceberg.aws.s3.S3InputStream.openStream(S3InputStream.java:194)
	at org.apache.iceberg.aws.s3.S3InputStream.positionStream(S3InputStream.java:177)
	at org.apache.iceberg.aws.s3.S3InputStream.read(S3InputStream.java:107)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper.ensureLoaded(ByteSourceJsonBootstrapper.java:539)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper.detectEncoding(ByteSourceJsonBootstrapper.java:133)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper.constructParser(ByteSourceJsonBootstrapper.java:256)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.core.JsonFactory._createParser(JsonFactory.java:1655)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.core.JsonFactory.createParser(JsonFactory.java:1083)
	at org.apache.iceberg.shaded.com.fasterxml.jackson.databind.ObjectMapper.readValue(ObjectMapper.java:3666)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:273)
	at org.apache.iceberg.TableMetadataParser.read(TableMetadataParser.java:266)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$0(BaseMetastoreTableOperations.java:189)
	at org.apache.iceberg.BaseMetastoreTableOperations.lambda$refreshFromMetadataLocation$1(BaseMetastoreTableOperations.java:208)
	at org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)
	at org.apache.iceberg.util.Tasks$Builder.runSingleThreaded(Tasks.java:219)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:203)
	at org.apache.iceberg.util.Tasks$Builder.run(Tasks.java:196)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:208)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:185)
	at org.apache.iceberg.BaseMetastoreTableOperations.refreshFromMetadataLocation(BaseMetastoreTableOperations.java:180)
	at org.apache.iceberg.hive.HiveTableOperations.doRefresh(HiveTableOperations.java:178)
	at org.apache.iceberg.BaseMetastoreTableOperations.refresh(BaseMetastoreTableOperations.java:97)
	at org.apache.iceberg.BaseMetastoreTableOperations.current(BaseMetastoreTableOperations.java:80)
	at org.apache.iceberg.BaseMetastoreCatalog.loadTable(BaseMetastoreCatalog.java:47)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
	at java.base/java.util.concurrent.ConcurrentHashMap.compute(Unknown Source)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
	at org.apache.iceberg.CachingCatalog.loadTable(CachingCatalog.java:166)
	at org.apache.iceberg.spark.SparkCatalog.load(SparkCatalog.java:641)
	at org.apache.iceberg.spark.SparkCatalog.loadTable(SparkCatalog.java:159)
	at org.apache.spark.sql.connector.catalog.TableCatalog.tableExists(TableCatalog.java:163)
	at org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:42)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:640)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:630)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:662)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: software.amazon.awssdk.services.s3.model.NoSuchKeyException: The specified key does not exist. (Service: S3, Status Code: 404, Request ID: 17BEB8659A45882F, Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8)
	at software.amazon.awssdk.core.internal.http.CombinedResponseHandler.handleErrorResponse(CombinedResponseHandler.java:125)
	at software.amazon.awssdk.core.internal.http.CombinedResponseHandler.handleResponse(CombinedResponseHandler.java:82)
	at software.amazon.awssdk.core.internal.http.CombinedResponseHandler.handle(CombinedResponseHandler.java:60)
	at software.amazon.awssdk.core.internal.http.CombinedResponseHandler.handle(CombinedResponseHandler.java:41)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:40)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:30)
	at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:73)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:42)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:78)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:40)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:50)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:36)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:81)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:36)
	at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
	at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:56)
	at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:36)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:80)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:60)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:42)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:48)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:31)
	at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
	at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37)
	at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26)
	at software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:193)
	at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103)
	at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:171)
	at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$0(BaseSyncClientHandler.java:68)
	at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:179)
	at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:62)
	at software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:52)
	at software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:63)
	at software.amazon.awssdk.services.s3.DefaultS3Client.getObject(DefaultS3Client.java:4478)
	at org.apache.iceberg.aws.s3.S3InputStream.openStream(S3InputStream.java:192)
	... 77 more


In [17]:
show_databases_df = spark.sql("SHOW TABLES")
show_databases_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [None]:
# Execute SHOW DATABASES query for Iceberg catalog
show_iceberg_databases_df = spark.sql("SHOW TABLES IN raw")

# Show the result
show_iceberg_databases_df.show()

In [None]:
table = spark.sql(f"""
        CREATE TABLE IF NOT EXISTS iceberg.raw.{dest_db_table}
        (
            record_type STRING,
            network_initiated_pdp_context STRING,
            imsi STRING,
            msisdn STRING,
            imei STRING,
            charging_id STRING,
            ggsn_pgw_address STRING,
            sgsn_sgw_address STRING,
            ms_nw_capability STRING,
            pdp_pdn_type STRING,
            served_pdp_address STRING,
            dynamic_address_flag STRING,
            access_point_name_ni STRING,
            record_sequence_number STRING,
            record_sequence_number_meg STRING,
            node_id STRING,
            local_sequence_number STRING,
            charging_characteristics STRING,
            record_opening_time STRING,
            duration STRING,
            rat_type STRING,
            cause_for_record_closing STRING,
            diagnostic STRING,
            volume_uplink STRING,
            volume_downlink STRING,
            total_volume STRING,
            lac_or_tac STRING,
            ci_or_eci STRING,
            rac STRING,
            rnc_unsent_data_volume STRING,
            req_alloc_ret_priority STRING,
            neg_alloc_ret_priority STRING,
            req_traffic_class STRING,
            neg_traffic_class STRING,
            qci STRING,
            req_max_bitrate_uplink STRING,
            req_max_bitrate_downlink STRING,
            req_guar_bitrate_uplink STRING,
            req_guar_bitrate_downlink STRING,
            neg_max_bitrate_uplink STRING,
            neg_max_bitrate_downlink STRING,
            neg_guar_bitrate_uplink STRING,
            neg_guar_bitrate_downlink STRING,
            mccmnc STRING,
            country_name STRING,
            input_filename STRING,
            output_filename STRING,
            event_date Timestamp
        ) 
        USING iceberg
        PARTITIONED BY (event_date)
        """)
        

In [None]:
s3 = boto3.client('s3', endpoint_url=s3_endpoint, aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key)

In [None]:
# List all files in the source directory
file_list = []
paginator = s3.get_paginator('list_objects_v2')

for result in paginator.paginate(Bucket=f"{environment}-{source_bucket}", Prefix=source_folder):
    
    if 'Contents' in result:
        for item in result['Contents']:
            file_list.append(item['Key'])


In [None]:
# reading files in the source
for file_name in file_list:

    print(f'File in processing: {file_name}')
    
    df = spark.read.format("csv") \
                    .option("header", "false") \
                    .option("delimiter", ";") \
                    .schema(schema) \
                    .load(f"s3a://{environment}-{source_bucket}/{file_name}")
    # some transformations
    df = df.withColumn("duration", df["duration"].cast("double"))
    # df = df.withColumn("event_date", to_date(df["record_opening_time"], "yyyyMMddHHmmss"))
    # to_date(df["record_opening_time"], "yyyyMMddHHmmss")

    # df.withColumn("event_date", from_unixtime(unix_timestamp("record_opening_time", "yyyyMMddHHmmss")))

    df.select('imsi').show()
    
    # wrinte the data on lakehouse
    df.writeTo(f'iceberg.raw.{dest_db_table}').append()


In [None]:
tb = spark.table(f'iceberg.raw.{dest_db_table}')
print(tb.printSchema())

In [None]:
print(f"No of Records: {tb.count()}")