# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp, to_date, col, when, lit
from awsglue.dynamicframe import DynamicFrame

# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; later cells use it to read and write DynamicFrames.
glueContext = GlueContext(sc)
# SparkSession handle exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job object bound to this context.
# NOTE(review): job.init()/job.commit() are not called in the visible cells - confirm whether that is intended.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: fb53a4a0-6e02-45ab-a15a-231b33baa575
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session fb53a4a0-6e02-45ab-a15a-231b33baa575 to get into ready status...
Session fb53a4a0-6e02-45ab-a15a-231b33baa575 ha

In [2]:
# Load the Sunbird X (Twitter) tweet CSV files from the landing bucket into a
# DynamicFrame. The files are read as comma-separated, double-quoted CSV with a
# header row, and "recurse" also picks up files under nested prefixes.
# NOTE(review): transformation_ctx still reads "dyf_kobo_landing" (it looks
# copied from a KoBo notebook); kept unchanged here - confirm before renaming.
csv_format_options = {"quoteChar": "\"", "withHeader": True, "separator": ","}
landing_connection_options = {
    "paths": ["s3://tgsn-landing/sunbird/x/tweets/"],
    "recurse": True,
}
dyf_sunbird_tweets_landing = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=landing_connection_options,
    format="csv",
    format_options=csv_format_options,
    transformation_ctx="dyf_kobo_landing",
)





In [3]:
# Sanity check: number of records read from the landing prefix.
dyf_sunbird_tweets_landing.count()

13041


In [4]:
# Print the schema of the landing DynamicFrame to inspect column names and types.
dyf_sunbird_tweets_landing.printSchema()

root
|-- tweet_id: string
|-- target: string
|-- source: string
|-- author_id: string
|-- tag: string
|-- keyword: string
|-- created_at: string
|-- location: string
|-- tweet_content: string
|-- edit_history_tweet_ids: string
|-- edit_controls_is_edit_eligible: string
|-- edit_controls_editable_until: string
|-- edit_controls_edits_remaining: string
|-- conversation_id: string
|-- note_tweet_text: string
|-- in_reply_to_user_id: string
|-- referenced_tweet_ids: string
|-- referenced_tweet_types: string
|-- media_keys: string
|-- poll_duration_minutes: string
|-- poll_end: string
|-- poll_ids: string
|-- poll_options: string
|-- poll_voting_status: string
|-- geo_place_id: string
|-- geo_coordinates: string
|-- entities_hashtags: string
|-- entities_urls: string
|-- entities_mentions: string
|-- withheld_copyright: string
|-- withheld_country_codes: string
|-- public_metrics_retweet_count: string
|-- public_metrics_reply_count: string
|-- public_metrics_like_count: string
|-- public_me

#### Convert the DynamicFrame to a Spark DataFrame


In [5]:
# Convert to a Spark DataFrame; the following cells use DataFrame operations (select, withColumn, where).
df_sunbird_tweets = dyf_sunbird_tweets_landing.toDF()



In [6]:
# Preview the first seven columns only, so the tabular output stays readable.
preview_columns = df_sunbird_tweets.columns[:7]
df_sunbird_tweets.select(preview_columns).show()

+-------------------+---------------+---------------+-------------------+-------+--------------------+--------------------+
|           tweet_id|         target|         source|          author_id|    tag|             keyword|          created_at|
+-------------------+---------------+---------------+-------------------+-------+--------------------+--------------------+
|1856445136462377117|      lidialake|               |          541661569|       | "Qatar" (CherylW...|2024-11-12T21:13:...|
|1856340465953517886|  BScharen95153|  CherylWroteIt|1806169910323650560|mention| "Qatar" (CherylW...|2024-11-12T14:17:...|
|1856230702930391064|   berman_allen|  CherylWroteIt|         4229027320|mention| "Qatar" (CherylW...|2024-11-12T07:01:...|
|1856166551818322170| ken_twindragon|   EarlChristy1|          415125689|retweet| "Qatar" (CherylW...|2024-11-12T02:46:...|
|1856145340589846968|   isadoramay12|   MichealYerty|         2680119463|retweet| "Qatar" (CherylW...|2024-11-12T01:21:...|
|1856145

In [14]:
# Stamp every row with today's date (YYYY-MM-DD) as a literal string column.
# The date is computed once in Python, not per row; the write cell partitions
# its output on "collection_date".
collection_date_str = datetime.today().strftime('%Y-%m-%d')
df_sunbird_tweets = df_sunbird_tweets.withColumn("collection_date", lit(collection_date_str))




In [6]:
#spark_df_kobo_today = spark_df_kobo.where(spark_df_kobo['_submission_time_date'] >= datetime.today().strftime('%Y-%m-%d'))
#spark_df_kobo_today = spark_df_kobo

In [25]:

# Replace empty strings with NULL in every column, keeping column names and order.
# This is done in a single select(): calling withColumn() once per column in a
# loop adds one projection per call and grows the query plan, which the PySpark
# documentation explicitly warns against.
df_sunbird_tweets = df_sunbird_tweets.select([
    when(df_sunbird_tweets[column] == '', None)
    .otherwise(df_sunbird_tweets[column])
    .alias(column)
    for column in df_sunbird_tweets.columns
])




In [26]:
# Report how many rows have a non-NULL value in each column.
# SQL count(<column>) ignores NULLs, so one aggregation over all columns yields
# every figure in a single pass instead of one full Spark job per column.
# Column names are back-quoted so they are treated as identifiers.
non_null_counts = df_sunbird_tweets.selectExpr(
    *[f"count(`{column}`) AS `{column}`" for column in df_sunbird_tweets.columns]
).first()

for column in df_sunbird_tweets.columns:
    print(f'Column: {column} - {non_null_counts[column]}')

Column: tweet_id - 13041
Column: target - 13041
Column: source - 12875
Column: author_id - 13041
Column: tag - 12875
Column: keyword - 13041
Column: created_at - 13041
Column: location - 13041
Column: tweet_content - 13041
Column: edit_history_tweet_ids - 13041
Column: edit_controls_is_edit_eligible - 13041
Column: edit_controls_editable_until - 13041
Column: edit_controls_edits_remaining - 13041
Column: conversation_id - 13041
Column: note_tweet_text - 115
Column: in_reply_to_user_id - 1510
Column: referenced_tweet_ids - 13035
Column: referenced_tweet_types - 13035
Column: media_keys - 105
Column: poll_duration_minutes - 0
Column: poll_end - 0
Column: poll_ids - 0
Column: poll_options - 0
Column: poll_voting_status - 0
Column: geo_place_id - 10
Column: geo_coordinates - 0
Column: entities_hashtags - 69
Column: entities_urls - 530
Column: entities_mentions - 12874
Column: withheld_copyright - 0
Column: withheld_country_codes - 0
Column: public_metrics_retweet_count - 13041
Column: publ

#### Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog

In [15]:
# Convert the cleaned Spark DataFrame back to a Glue DynamicFrame, which is what the Glue sink accepts.
dyf_sunbird_tweets = DynamicFrame.fromDF(df_sunbird_tweets, glueContext)

# Configure an S3 sink under the bronze bucket, partitioned by collection_date.
# enableUpdateCatalog / updateBehavior ask Glue to maintain the catalog table as part of the write.
s3output = glueContext.getSink(
  path="s3://tgsn-bronze/sunbird/x/tweets/",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=['collection_date'],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output"
)
# Target table in the Glue Data Catalog for this dataset.
s3output.setCatalogInfo(
  catalogDatabase="tgsn_bronze", catalogTableName="sunbird_x_tweets"
)
# Output format must be set before writing; "glueparquet" is Glue's Parquet writer.
s3output.setFormat("glueparquet")
# Perform the write (side effect: files under the S3 path above, plus the catalog update).
s3output.writeFrame(dyf_sunbird_tweets)

<awsglue.dynamicframe.DynamicFrame object at 0x7fa644ba2e60>
