# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to configure the session for Apache Iceberg. To see available notebook commands ("magics"), type `%help` in a new cell.


In [2]:
%%configure
{
   "--datalake-formats": "iceberg",
    "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_catalog"
}  

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
The following configurations have been updated: {'--datalake-formats': 'iceberg', '--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_catalog'}


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp

# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; later cells use it to read from the catalog.
glueContext = GlueContext(sc)
# SparkSession handle exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job object bound to this context.
# NOTE(review): job.init()/job.commit() are not called in the visible cells — confirm whether they are needed.
job = Job(glueContext)

Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 961b49ad-e285-4887-8d50-aaeecb7c5b12
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--datalake-formats iceberg
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.warehouse=file:///tmp/spark-warehouse --conf spark.sql.defaultCatalog=glue_c

In [2]:
from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp, to_date, col, when, lit, count




In [3]:
# Read the bronze tweets table through the catalog into a Glue DynamicFrame.
dyf_sunbird_tweets_bronze = glueContext.create_dynamic_frame.from_catalog(
    database="tgsn_bronze",
    table_name="sunbird_x_tweets",
    transformation_ctx="dyf_sunbird_bronze",
)




In [17]:
# Print the schema of the bronze DynamicFrame to inspect column names and types.
dyf_sunbird_tweets_bronze.printSchema()

root
|-- tweet_id: string
|-- target: string
|-- source: string
|-- author_id: string
|-- tag: string
|-- keyword: string
|-- created_at: string
|-- location: string
|-- tweet_content: string
|-- edit_history_tweet_ids: string
|-- edit_controls_is_edit_eligible: string
|-- edit_controls_editable_until: string
|-- edit_controls_edits_remaining: string
|-- conversation_id: string
|-- note_tweet_text: string
|-- in_reply_to_user_id: string
|-- referenced_tweet_ids: string
|-- referenced_tweet_types: string
|-- media_keys: string
|-- poll_duration_minutes: string
|-- poll_end: string
|-- poll_ids: string
|-- poll_options: string
|-- poll_voting_status: string
|-- geo_place_id: string
|-- geo_coordinates: string
|-- entities_hashtags: string
|-- entities_urls: string
|-- entities_mentions: string
|-- withheld_copyright: string
|-- withheld_country_codes: string
|-- public_metrics_retweet_count: string
|-- public_metrics_reply_count: string
|-- public_metrics_like_count: string
|-- public_me

In [4]:
# Convert the Glue DynamicFrame to a DataFrame for the column-level operations in later cells.
df_sunbird_tweets_bronze = dyf_sunbird_tweets_bronze.toDF()



In [5]:
# Replace empty strings with NULL in every column, so that null-based checks
# treat '' as a missing value.
#
# All columns are rewritten in ONE select() projection. Calling withColumn()
# once per column in a loop adds a projection to the query plan on every
# iteration; the PySpark documentation warns that this can cause performance
# problems and even a StackOverflowException on wide tables.
# Column order and column names are preserved, and re-running the cell is a no-op.
df_sunbird_tweets_bronze = df_sunbird_tweets_bronze.select(
    [
        when(df_sunbird_tweets_bronze[column] == '', None)
        .otherwise(df_sunbird_tweets_bronze[column])
        .alias(column)
        for column in df_sunbird_tweets_bronze.columns
    ]
)




In [15]:
# Classify columns by how sparsely they are populated:
#   empty_col        - columns with no non-null value at all
#   almost_empty_col - columns whose non-null share is below `threshold`
empty_col = []
almost_empty_col = []
threshold = 0.1  # minimum fraction of non-null rows for a column to count as populated

columns = df_sunbird_tweets_bronze.columns

# count(<column>) ignores NULLs and count(lit(1)) counts every row, so a single
# aggregation yields the total row count plus every per-column non-null count
# in one Spark job, instead of one full scan per column.
counts = df_sunbird_tweets_bronze.agg(
    count(lit(1)),
    *[count(df_sunbird_tweets_bronze[column]) for column in columns]
).first()

# Values are read by position: index 0 is the total, then one entry per column.
total_count = counts[0]
for column, cnt in zip(columns, counts[1:]):
    if cnt == 0:
        # Also covers an empty frame (total_count == 0), so the division below is safe.
        empty_col.append(column)
    elif cnt/total_count < threshold:
        almost_empty_col.append(column)

print(f'Empty columns: {empty_col}')
print('-------------')
print(f'Almost empty columns: {almost_empty_col}')

Empty columns: ['poll_duration_minutes', 'poll_end', 'poll_ids', 'poll_options', 'poll_voting_status', 'geo_coordinates', 'withheld_copyright', 'withheld_country_codes', 'source_app', 'place_contained_within', 'place_country', 'place_country_code', 'place_full_name', 'place_name', 'place_type']
-------------
Almost empty columns: ['note_tweet_text', 'media_keys', 'geo_place_id', 'entities_hashtags', 'entities_urls', 'place_id']


In [9]:
# Create the silver-layer table, or replace it if it already exists.
# NOTE(review): `additional_options` is not defined in any cell visible in this
# notebook — confirm where it comes from; on a fresh kernel this cell would
# raise NameError without it.
# NOTE(review): presumably `collection_date` is a column of the frame; the printed
# schema is truncated, so verify it exists before relying on the partitioning.
(
    df_sunbird_tweets_bronze
    .writeTo("glue_catalog.tgsn_silver.sunbird_x_tweets")
    .tableProperty("format-version", "2")
    .tableProperty("location", "s3://tgsn-silver-bucket/sunbird/x/tweets/tgsn_silver/sunbird_x_tweets")
    .tableProperty("write.parquet.compression-codec", "gzip")
    .options(**additional_options)
    .partitionedBy("collection_date")
    .createOrReplace()
)


