# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from datetime import datetime
import pandas as pd
from pyspark.sql.functions import to_timestamp, to_date, col, when, lit
from awsglue.dynamicframe import DynamicFrame

# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

# Wrap the SparkContext in a GlueContext; the Job handle and the Spark session
# used by the rest of the notebook are both derived from it.
glueContext = GlueContext(sc)
job = Job(glueContext)
spark = glueContext.spark_session

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 946a69eb-bed3-4599-837c-e58c94be1a77
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 946a69eb-bed3-4599-837c-e58c94be1a77 to get into ready status...
Session 946a69eb-bed3-4599-837c-e58c94be1a77 ha

In [33]:
def _read_landing_csv(s3_path, transformation_ctx):
    """Read every CSV file under ``s3_path`` into a Glue DynamicFrame.

    The files are parsed as comma-separated, double-quote-quoted CSV with a
    header row, and the S3 prefix is listed recursively.

    Args:
        s3_path: S3 URI of the landing prefix to read.
        transformation_ctx: Identifier Glue uses to key this read's state.
            It must be unique per read operation within the job.

    Returns:
        The DynamicFrame produced by ``create_dynamic_frame.from_options``.
    """
    return glueContext.create_dynamic_frame.from_options(
        format_options={"quoteChar": "\"", "withHeader": True, "separator": ","},
        connection_type="s3",
        format="csv",
        connection_options={"paths": [s3_path], "recurse": True},
        transformation_ctx=transformation_ctx,
    )


# Each read gets its own transformation_ctx. Previously both reads shared the
# copy-pasted "dyf_kobo_landing" context, so Glue could not tell them apart.
# NOTE(review): if job bookmarks were ever enabled under the old context name,
# renaming it starts bookmark tracking afresh for these sources - confirm.

# Read CSV files from Sunbird X Tweets
dyf_sunbird_tweets_landing = _read_landing_csv(
    "s3://tgsn-landing/sunbird/x/tweets/",
    "dyf_sunbird_tweets_landing",
)

# Read CSV files from Sunbird X Users
dyf_sunbird_users_landing = _read_landing_csv(
    "s3://tgsn-landing/sunbird/x/users/",
    "dyf_sunbird_users_landing",
)




In [6]:
# Row-count check on the tweets landing frame, left disabled; uncomment to re-run.
#dyf_sunbird_tweets_landing.count()

# Presumably the count returned by an earlier run: 472215

In [7]:
# Row-count check on the users landing frame, left disabled; uncomment to re-run.
#dyf_sunbird_users_landing.count()

# Presumably the count returned by an earlier run: 413269

In [50]:
# Print the column names and types of the tweets DataFrame.
# NOTE(review): df_sunbird_tweets is first assigned in a later cell (the
# toDF() conversion), so on a fresh top-to-bottom run this cell raises
# NameError. Move it below that cell.
df_sunbird_tweets.printSchema()

root
 |-- tweet_id: string (nullable = true)
 |-- target: string (nullable = true)
 |-- source: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- location: string (nullable = true)
 |-- tweet_content: string (nullable = true)
 |-- edit_history_tweet_ids: string (nullable = true)
 |-- edit_controls_is_edit_eligible: string (nullable = true)
 |-- edit_controls_editable_until: string (nullable = true)
 |-- edit_controls_edits_remaining: string (nullable = true)
 |-- conversation_id: string (nullable = true)
 |-- note_tweet_text: string (nullable = true)
 |-- in_reply_to_user_id: string (nullable = true)
 |-- referenced_tweet_ids: string (nullable = true)
 |-- referenced_tweet_types: string (nullable = true)
 |-- media_keys: string (nullable = true)
 |-- poll_duration_minutes: string (nullable = true)
 |-- poll_end: string (nullable = true)
 |-- poll_ids:

#### Convert the DynamicFrame to a Spark DataFrame


In [35]:
# Turn both landing DynamicFrames into Spark DataFrames (tweets first, then
# users) so the regular DataFrame API can be used on them below.
df_sunbird_tweets, df_sunbird_users = (
    dyf_sunbird_tweets_landing.toDF(),
    dyf_sunbird_users_landing.toDF(),
)




In [42]:
# Drop the column at position 49 (the 50th), leaving the 49 columns that the
# following cell confirms.
# The length guard keeps this cell re-runnable: once the extra column is gone,
# indexing columns[49] would raise IndexError, so the drop is skipped instead.
if len(df_sunbird_tweets.columns) > 49:
    df_sunbird_tweets = df_sunbird_tweets.drop(df_sunbird_tweets.columns[49])




In [43]:
# Display how many columns the tweets DataFrame has after the positional drop.
len(df_sunbird_tweets.columns)

49


In [None]:
# Preview rows of the tweets DataFrame.
df_sunbird_tweets.show()

In [37]:
# Show up to 10 tweets (untruncated) whose content contains the literal
# characters "/n " (forward slash, "n", space).
# NOTE(review): if the intent was to find embedded newlines, the pattern would
# need "\n" rather than "/n" - confirm.
contains_slash_n = col("tweet_content").like("%/n %")
(
    df_sunbird_tweets
    .filter(contains_slash_n)
    .select("tweet_id", "tweet_content")
    .show(n=10, truncate=False)
)

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tweet_id           |tweet_content                                                                                                                                                                                                                                                                                                        |
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|186

In [49]:
# Show the first 5 values of the column at position 48, i.e. the last column
# of the 49 that remain after the positional drop.
df_sunbird_tweets.select(df_sunbird_tweets.columns[48]).show(5)

+----------+
|place_type|
+----------+
|          |
|          |
|          |
|          |
|          |
+----------+
only showing top 5 rows


In [46]:
# Stamp both frames with the run date as a 'YYYY-MM-DD' string column; the
# sinks below partition on this column.
# The date is computed once so tweets and users always get the same value.
# With two separate datetime.today() calls, a run straddling midnight would
# put the two datasets in different partitions.
# NOTE(review): datetime.today() uses the local clock of the Python process,
# presumably UTC on Glue workers - confirm.
collection_date = datetime.today().strftime('%Y-%m-%d')

df_sunbird_tweets = df_sunbird_tweets.withColumn("collection_date", lit(collection_date))

df_sunbird_users = df_sunbird_users.withColumn("collection_date", lit(collection_date))




#### Write the data in the DynamicFrame to a location in Amazon S3 and create a table for it in the AWS Glue Data Catalog

In [47]:
# Wrap the tweets Spark DataFrame back into a Glue DynamicFrame, which is the
# type the Glue sink writes.
dyf_sunbird_tweets = DynamicFrame.fromDF(df_sunbird_tweets, glueContext)

# Bronze layer sink for tweets: snappy-compressed output under the bronze S3
# prefix, partitioned by collection_date, with Data Catalog updates enabled.
s3output = glueContext.getSink(
    connection_type="s3",
    path="s3://tgsn-bronze/sunbird/x/tweets/",
    partitionKeys=["collection_date"],
    compression="snappy",
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3output",
)

# Target catalog table: tgsn_bronze.sunbird_x_tweets
s3output.setCatalogInfo(
    catalogDatabase="tgsn_bronze",
    catalogTableName="sunbird_x_tweets",
)
s3output.setFormat("glueparquet")

# Kept as the last expression so the cell still displays the returned frame.
s3output.writeFrame(dyf_sunbird_tweets)

<awsglue.dynamicframe.DynamicFrame object at 0x7fd0c16622c0>


In [15]:
# Wrap the users Spark DataFrame back into a Glue DynamicFrame, which is the
# type the Glue sink writes.
dyf_sunbird_users = DynamicFrame.fromDF(df_sunbird_users, glueContext)

# Bronze layer sink for users: snappy-compressed output under the bronze S3
# prefix, partitioned by collection_date, with Data Catalog updates enabled.
# NOTE(review): transformation_ctx "s3output" is the same value the tweets sink
# uses - confirm whether each sink should have its own.
s3output = glueContext.getSink(
    connection_type="s3",
    path="s3://tgsn-bronze/sunbird/x/users/",
    partitionKeys=["collection_date"],
    compression="snappy",
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3output",
)

# Target catalog table: tgsn_bronze.sunbird_x_users
s3output.setCatalogInfo(
    catalogDatabase="tgsn_bronze",
    catalogTableName="sunbird_x_users",
)
s3output.setFormat("glueparquet")

# Kept as the last expression so the cell still displays the returned frame.
s3output.writeFrame(dyf_sunbird_users)

<awsglue.dynamicframe.DynamicFrame object at 0x7fd0c1663640>
