# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Run this cell before the session starts to configure it for Apache Iceberg. (Optional: type `%help` in a new cell to see the available notebook commands, or "magics".)


In [2]:
%%configure
{
   "--datalake-formats": "iceberg",
    "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_catalog"
}  

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
The following configurations have been updated: {'--datalake-formats': 'iceberg', '--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_catalog'}


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp

# Obtain the SparkContext for this session (getOrCreate reuses an existing one).
sc = SparkContext.getOrCreate()
# GlueContext built on that SparkContext; later cells use it to read the catalog
# and to build DynamicFrames.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext; later cells call spark.sql and spark.catalog on it.
spark = glueContext.spark_session
# Job handle bound to the GlueContext.
# NOTE(review): no visible cell calls job.init() or job.commit() — confirm whether it is needed.
job = Job(glueContext)

Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: b27f5a4d-6c7d-4afa-9343-6c635dbb4cfd
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--datalake-formats iceberg
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_c

In [16]:
from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp
from awsglue import DynamicFrame




In [17]:
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    """Run a Spark SQL query over DynamicFrames and return the result as a DynamicFrame.

    Args:
        glueContext: GlueContext whose Spark session executes the query and
            which is attached to the returned DynamicFrame.
        query: Spark SQL text; it may reference any alias present in ``mapping``.
        mapping: dict of ``alias -> DynamicFrame``. Each frame is converted to a
            Spark DataFrame and registered as a temp view named ``alias``
            (replacing any existing temp view with that name).
        transformation_ctx: Transformation context name passed to
            ``DynamicFrame.fromDF``.

    Returns:
        DynamicFrame wrapping the query result.
    """
    # Use the session that belongs to the supplied GlueContext instead of the
    # notebook-global `spark`, so the helper does not depend on hidden kernel state.
    session = glueContext.spark_session
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    return DynamicFrame.fromDF(session.sql(query), glueContext, transformation_ctx)




In [18]:
# Read the bronze Kobo table from the Glue Data Catalog as a DynamicFrame.
dyf_kobo_bronze = glueContext.create_dynamic_frame.from_catalog(
    database="tgsn_bronze",
    table_name="kobo_moth",
    transformation_ctx="dyf_kobo_bronze",
)




In [19]:
# Spark SQL that projects the bronze Kobo columns into the silver layout.
# - Reads from a view named kobo_moth_bronze, which must be registered before
#   the query is executed.
# - Renames the underscore-prefixed columns (_id, _submission_time,
#   _submitted_by, _attachments).
# - Converts start/end with to_timestamp, consent_date/respondent_birth_date/
#   _submission_time with to_date, and departure_year/camp_arrival_year to int.
# - Keeps only rows whose submission date is today or later
#   (to_date(_submission_time) >= current_date()), ordered by _id.
query_to_silver = '''
SELECT 
_id as ID,
to_timestamp(start) as start_date,
to_timestamp(end) as end_date,
deviceid,
interviewer_username,
pre_interview_instructions,
camp_name,
gps_coordinates,
respondent_gender,
interview_introduction,
consent_understood,
end_interview_1,
consent_request,
accuracy_request,
consent_signature,
to_date(consent_date) as consent_date,
consent_rejection_reason,
end_interview_2,
end_interview_3,
respondent_name_first,
respondent_name_last,
respondent_name_other,
campidcard_yes_no,
campidcard_cardnumber,
campidcard_caseid,
to_date(respondent_birth_date) as respondent_birth_date,
respondent_birth_country,
respondent_birth_place,
respondent_nationality_first,
respondent_nationality_second,
respondent_nationality_differentatbirth_yes_no,
respondent_nationality_atbirth,
education,
prewar_residence_country,
prewar_residence_province,
prewar_residence_place,
prewar_occupation_yes_no,
prewar_occupation,
cast(departure_year as int) as departure_year,
departure_month,
cast(camp_arrival_year as int) as camp_arrival_year,
camp_arrival_month,
camp_occupation_yes_no,
camp_occupation,
camp_occupation_other,
health_physical_treated,
health_physical_untreated,
health_physical_symptoms,
health_psychological_treated,
health_psychological_untreated,
health_psychological_symptoms,
pss_interest,
family_introduction,
marital_status,
wives_number,
children_yes_no,
children_number,
other_relatives_yes_no,
relatives_number,
reintegration_preference,
reintegration_preference_reason,
photo_question,
photo_respondent,
photo_children,
interview_conclusion,
post_interview_instructions,
respondent_comfort,
respondent_comprehension,
interviewer_feedback_notes,
final_instructions,
to_date(_submission_time) as submission_time,
_submitted_by as submitted_by,
_attachments as attachments
FROM kobo_moth_bronze
WHERE to_date(_submission_time) >= current_date()
order by _id
'''




In [24]:
# Run the silver query with the bronze frame exposed as the view "kobo_moth_bronze".
dyf_kobo_to_silver = sparkSqlQuery(
    glueContext,
    query=query_to_silver,
    mapping={"kobo_moth_bronze": dyf_kobo_bronze},
    transformation_ctx="df_kobo_to_silver",
)




In [25]:
# Convert the silver DynamicFrame to a Spark DataFrame; the cells below call
# DataFrame methods (select/show, writeTo) on it.
df_kobo_to_silver = dyf_kobo_to_silver.toDF()




In [26]:
# Quick look at a few key columns of the silver rows.
# NOTE(review): this preview prints respondent_name_first; clear the cell output
# before sharing or committing the notebook.
preview_columns = ['id', 'start_date', 'respondent_name_first', 'submission_time']
df_kobo_to_silver.select(*preview_columns).show()

+---+--------------------+---------------------+---------------+
| id|          start_date|respondent_name_first|submission_time|
+---+--------------------+---------------------+---------------+
| 83|2025-01-21 14:21:...|                 Amir|     2025-01-21|
+---+--------------------+---------------------+---------------+


In [28]:
# Script generated for node Amazon S3
# Persist the silver rows to the Iceberg table glue_catalog.tgsn_silver.kobo_moth:
# append when the table already exists, otherwise create it partitioned by
# submission_time.
# NOTE(review): the append path adds rows unconditionally, so re-running this
# cell writes the same rows again — confirm whether a MERGE/dedup step is needed.
SILVER_DATABASE = "tgsn_silver"
SILVER_TABLE_NAME = "kobo_moth"
SILVER_TABLE_IDENTIFIER = f"glue_catalog.{SILVER_DATABASE}.{SILVER_TABLE_NAME}"
# Properties applied when the table is first created.
SILVER_TABLE_PROPERTIES = {
    "format-version": "2",
    "location": "s3://tgsn-silver-bucket/kobo/Moth/meta/tgsn_silver/kobo_moth",
    "write.parquet.compression-codec": "gzip",
}

additional_options = {}
tables_collection = spark.catalog.listTables(SILVER_DATABASE)
table_names_in_db = [table.name for table in tables_collection]
table_exists = SILVER_TABLE_NAME in table_names_in_db

if table_exists:
    # DataFrameWriterV2 only applies table properties on create/replace, so the
    # append path sets none; rows are sorted by the partition column first.
    (
        df_kobo_to_silver.sortWithinPartitions("submission_time")
        .writeTo(SILVER_TABLE_IDENTIFIER)
        .options(**additional_options)
        .append()
    )
else:
    silver_writer = df_kobo_to_silver.writeTo(SILVER_TABLE_IDENTIFIER)
    for property_name, property_value in SILVER_TABLE_PROPERTIES.items():
        silver_writer = silver_writer.tableProperty(property_name, property_value)
    (
        silver_writer.options(**additional_options)
        .partitionedBy("submission_time")
        .create()
    )


