In [164]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Attach to the SparkContext / SparkSession that is already running in this
# kernel, or start new ones if none exist yet.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
# NOTE(review): `sqlContext` is not referenced in the cells visible here, and
# SQLContext is presumably superseded by SparkSession on recent Spark
# releases — confirm it is still needed for the Spark version in use.
sqlContext = SQLContext(sc)

In [165]:
# Input location: the "tweets" table of layer 1, relative to the notebook's
# working directory.
layer, table = 1, "tweets"
path = f"./layer{layer}/{table}/"

In [166]:
# Load the Parquet dataset found under `path` into a DataFrame.
tweets = spark.read.format("parquet").load(path)

In [167]:
# Sanity check: number of rows loaded into `tweets` (runs a Spark job).
tweets.count()

1497

In [168]:
# Show the column definitions (name, type, nullability) of `tweets`.
tweets.schema.fields

[StructField(tweet_id,StringType,true),
 StructField(created_at,StringType,true),
 StructField(text,StringType,true),
 StructField(hashtags,StringType,true),
 StructField(retweet_count,IntegerType,true),
 StructField(possibly_sensitive,BooleanType,true),
 StructField(lang,StringType,true),
 StructField(user_id,StringType,true),
 StructField(user_name,StringType,true),
 StructField(user_description,StringType,true),
 StructField(user_verification,BooleanType,true),
 StructField(user_followers_count,IntegerType,true),
 StructField(user_friends_count,IntegerType,true),
 StructField(user_created_at,StringType,true),
 StructField(user_location,StringType,true),
 StructField(etl_load,TimestampType,true),
 StructField(etl_load_partition_year,IntegerType,true),
 StructField(etl_load_partition_month,IntegerType,true),
 StructField(etl_load_partition_day,IntegerType,true),
 StructField(etl_load_partition_hour,IntegerType,true)]

In [209]:
# Output location: the "fact_tweet" table of layer 2.
# NOTE: this rebinds `layer`, `table` and `path`, which earlier in the notebook
# described the layer-1 input. Re-running an earlier cell that reads from
# `path` after this one would therefore read from layer 2 instead.
layer = 2
table = "fact_tweet"
path = f"./layer{layer}/{table}/"
# Echo the resolved path so the cell actually produces the output recorded
# for it on a fresh Restart & Run All.
path

'./layer2/fact_tweet/'

In [210]:
# Build the layer-2 fact table: one output row per row of `tweets`, keeping the
# tweet-level columns plus four partition columns derived from created_at.
fact_tweet = (
    tweets
    # Turn the created_at string into a timestamp:
    #   1. strip a leading three-letter token and the space after it
    #      (presumably the weekday name — confirm against the raw data),
    #   2. strip the literal "+0000 " offset,
    #   3. parse the remainder with the pattern "MMM dd HH:mm:ss yyyy",
    #   4. treat the parsed value as UTC and shift it to the fixed offset GMT-3.
    # The patterns are raw strings so that "\+" reaches the regex engine as
    # written instead of relying on Python's invalid-escape-sequence fallback.
    .withColumn(
        "created_at",
        F.from_utc_timestamp(
            F.to_timestamp(
                F.regexp_replace(
                    F.regexp_replace("created_at", r"^[A-Za-z]{3} ", ""),
                    r"\+0000 ",
                    "",
                ),
                "MMM dd HH:mm:ss yyyy",
            ),
            "GMT-3",
        ),
    )
    # Normalise hashtags: a non-empty value has a trailing "," + one whitespace
    # character removed; an empty string becomes null. A null input fails the
    # `!= ''` test as well and therefore stays null.
    .withColumn(
        "hashtags",
        F.when(
            F.col("hashtags") != "",
            F.regexp_replace("hashtags", r",\s$", ""),
        )
        .otherwise(None),
    )
    # Partition columns are formatted from the converted created_at above;
    # date_format yields strings (e.g. "MM" gives a two-digit month).
    .withColumn("created_at_partition_year", F.date_format("created_at", "yyyy"))
    .withColumn("created_at_partition_month", F.date_format("created_at", "MM"))
    .withColumn("created_at_partition_day", F.date_format("created_at", "dd"))
    .withColumn("created_at_partition_hour", F.date_format("created_at", "HH"))
    .select(
        "tweet_id",
        "created_at",
        "text",
        "hashtags",
        # Explicit casts pin the output types regardless of the input schema.
        F.col("retweet_count").cast(IntegerType()),
        F.col("possibly_sensitive").cast(BooleanType()),
        "lang",
        "user_id",
        "created_at_partition_year",
        "created_at_partition_month",
        "created_at_partition_day",
        "created_at_partition_hour",
    )
)

# Append fact_tweet to the Parquet dataset at `path`, partitioned on disk by
# the four created_at partition columns (year / month / day / hour).
# NOTE(review): mode("append") adds to whatever is already stored, so running
# this cell more than once writes the same rows again — confirm that is intended.
(
    fact_tweet.write
    .format("parquet")
    .mode("append")
    .partitionBy(
        "created_at_partition_year",
        "created_at_partition_month",
        "created_at_partition_day",
        "created_at_partition_hour",
    )
    .save(path)
)

In [29]:
# Project the user-level columns of `tweets` into the user dimension.
# NOTE(review): nothing is de-duplicated here, so the result has one row per
# row of `tweets` — confirm whether one row per user_id is expected.
dim_user = tweets.select(
    "user_id",
    "user_name",
    "user_description",
    "user_verification",
    "user_followers_count",
    "user_friends_count",
    "user_created_at",
    "user_location",
)