# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [1]:
# Interactive-session configuration magics; they are applied when the session is created.
# idle_timeout is expressed in minutes: 2880 minutes = 48 hours.
# NOTE(review): a 48-hour idle timeout keeps 5 G.1X workers allocated long after the
# last cell runs - confirm this is intended, or lower it to limit cost.
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp, to_date

# Bootstrap the handles used by every later cell.
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext and exposes the Glue readers and sinks.
glueContext = GlueContext(sc)
# Plain SparkSession, for DataFrame-level work.
spark = glueContext.spark_session
# Job handle; no job.init()/job.commit() call is visible in this section.
# NOTE(review): confirm whether job bookmarks are expected to be active here.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 5d508551-3801-411c-a3ed-8a04a9e93ad6
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 5d508551-3801-411c-a3ed-8a04a9e93ad6 to get into ready status...
Session 5d508551-3801-411c-a3ed-8a04a9e93ad6 ha

In [2]:
# Load the KoBo exports from the landing bucket into a Glue DynamicFrame.
# The files are read as CSV with a header row, "|" as the field separator
# and '"' as the quote character.
kobo_csv_format = {"quoteChar": "\"", "withHeader": True, "separator": "|"}

# "recurse": True makes the reader descend into sub-folders of the landing prefix.
kobo_landing_source = {"paths": ["s3://tgsn-landing/kobo_data_landing/"], "recurse": True}

dyf_kobo_landing = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=kobo_landing_source,
    format="csv",
    format_options=kobo_csv_format,
    transformation_ctx="dyf_kobo_landing",
)





In [3]:
# Print the DynamicFrame's schema to check which columns and types were read.
dyf_kobo_landing.printSchema()

root
|-- start: string
|-- end: string
|-- deviceid: string
|-- interviewer_username: string
|-- pre_interview_instructions: string
|-- camp_name: string
|-- gps_coordinates: string
|-- respondent_gender: string
|-- interview_introduction: string
|-- consent_understood: string
|-- end_interview_1: string
|-- consent_request: string
|-- accuracy_request: string
|-- consent_signature: string
|-- consent_date: string
|-- consent_rejection_reason: string
|-- end_interview_2: string
|-- end_interview_3: string
|-- respondent_name_first: string
|-- respondent_name_last: string
|-- respondent_name_other: string
|-- campidcard_yes_no: string
|-- campidcard_cardnumber: string
|-- campidcard_caseid: string
|-- respondent_birth_date: string
|-- respondent_birth_country: string
|-- respondent_birth_place: string
|-- respondent_nationality_first: string
|-- respondent_nationality_second: string
|-- respondent_nationality_differentatbirth_yes_no: string
|-- respondent_nationality_atbirth: string
|--

#### Convert the DynamicFrame to a Spark DataFrame


In [4]:
# Convert the Glue DynamicFrame into a Spark DataFrame so the standard
# DataFrame API (withColumn / where / select) can be used in the next cells.
spark_df_kobo = dyf_kobo_landing.toDF()



In [5]:
# Derive a date-only column from "_submission_time"; it is used further down
# both as the filter column and as the partition key of the output.
# No explicit format is passed, so to_date relies on Spark's default date parsing.
submission_date = to_date(spark_df_kobo["_submission_time"])
spark_df_kobo = spark_df_kobo.withColumn("_submission_time_date", submission_date)




In [6]:
# Keep only the submissions whose date is today or later.
# "Today" is the driver's local date, formatted as YYYY-MM-DD; the comparison is
# between the date column and that string.
today_iso = datetime.today().strftime('%Y-%m-%d')
is_from_today = spark_df_kobo['_submission_time_date'] >= today_iso
spark_df_kobo_today = spark_df_kobo.filter(is_from_today)

# To process every submission regardless of its date, bypass the filter instead:
#spark_df_kobo_today = spark_df_kobo




In [7]:
# Quick visual check of the filtered rows (show() prints at most 20 rows by default).
# NOTE(review): this preview prints respondent first names - clear the cell output
# before sharing or committing the notebook.
preview_columns = ['respondent_name_first', '_submission_time', '_submission_time_date']
spark_df_kobo_today.select(*preview_columns).show()

+---------------------+-------------------+---------------------+
|respondent_name_first|   _submission_time|_submission_time_date|
+---------------------+-------------------+---------------------+
|                 John|2024-11-21T14:58:01|           2024-11-21|
|              Manuela|2024-11-21T15:16:23|           2024-11-21|
|                  Ann|2024-11-22T15:06:18|           2024-11-22|
|                 Ezra|2024-11-25T14:25:44|           2024-11-25|
|                  Rex|2024-11-26T14:20:55|           2024-11-26|
|                  Rob|2024-11-28T15:27:04|           2024-11-28|
|                     |2024-11-29T10:35:00|           2024-11-29|
|                     |2024-12-02T09:50:24|           2024-12-02|
|                 Tess|2024-12-03T09:18:28|           2024-12-03|
|                 Lulu|2024-12-05T09:13:13|           2024-12-05|
|              Maymuna|2024-12-05T09:15:26|           2024-12-05|
+---------------------+-------------------+---------------------+


#### Write the data in the DynamicFrame to a location in Amazon S3 and create or update a table for it in the AWS Glue Data Catalog

In [8]:
from awsglue.dynamicframe import DynamicFrame

# Wrap the filtered Spark DataFrame back into a Glue DynamicFrame, which is the
# type the Glue sink writes.
dyf_kobo = DynamicFrame.fromDF(spark_df_kobo_today, glueContext)

# Configure the S3 sink for the bronze layer.
s3output = glueContext.getSink(
    # Where the data lands and how it is laid out.
    connection_type="s3",
    path="s3://tgsn-bronze/kobo/Moth/",
    partitionKeys=['_submission_time_date'],
    compression="snappy",
    # Keep the Data Catalog table in sync with what is written.
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3output",
)

# Catalog table that describes the written data.
s3output.setCatalogInfo(catalogDatabase="tgsn_bronze", catalogTableName="kobo_moth")

# Write as Parquet through Glue's own writer.
s3output.setFormat("glueparquet")

# NOTE(review): re-running this cell writes the same rows again - confirm whether
# duplicates in the bronze table are acceptable.
s3output.writeFrame(dyf_kobo)

<awsglue.dynamicframe.DynamicFrame object at 0x7f68aa2520b0>
