In [229]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Reuse the SparkContext that is already running in this process, or start one if none exists.
sc = SparkContext.getOrCreate()
# Same get-or-create pattern for the SparkSession; `spark` is what the loading cell reads from.
spark = SparkSession.builder.getOrCreate()
# SQLContext built on top of the SparkContext obtained above.
# NOTE(review): `sqlContext` is not referenced in the visible cells — confirm a later cell still needs it.
sqlContext = SQLContext(sc)

In [230]:
# Which dataset to load: the relative path is assembled as ./layer<layer>/<table>/.
layer, table = 1, "tweets"
path = f"./layer{layer}/{table}/"

# Read the Parquet data found at `path`; the cells below derive their tables from `tweets`.
tweets = spark.read.parquet(path)

In [232]:
# Build the tweet fact table from the raw `tweets` frame:
#   1. parse the textual `created_at` into a timestamp and shift it from UTC to GMT-3,
#   2. clean `hashtags` (strip a trailing ", " and turn the empty string into NULL),
#   3. derive year / month / day / hour partition columns from the shifted timestamp,
#   4. project the final column set, casting the numeric and boolean fields.
#
# All regex patterns are raw strings: `\+` and `\s` are not valid Python escape
# sequences, so writing them in a normal string literal triggers an
# invalid-escape warning (and is slated to become an error).
fact_tweet = (
    tweets
    .withColumn(
        "created_at",
        F.from_utc_timestamp(
            F.to_timestamp(
                F.regexp_replace(
                    # Drop the leading three-letter token and its trailing space ...
                    F.regexp_replace("created_at", r"^[A-Za-z]{3} ", ""),
                    # ... then drop the literal "+0000 " offset, leaving text that
                    # matches the parse pattern below. A value carrying any other
                    # offset is left untouched here and will not match the pattern.
                    r"\+0000 ", ""),
                "MMM dd HH:mm:ss yyyy"
            ),
            # The parsed value is treated as UTC and rendered in GMT-3.
            "GMT-3"
        )
    )
    .withColumn(
        "hashtags",
        F.when(
            F.col("hashtags") != '',
            # Remove a trailing comma followed by one whitespace character.
            F.regexp_replace("hashtags", r",\s$", "")
        )
        # Empty (or NULL) hashtags become NULL explicitly.
        .otherwise(None)
    )
    # Partition columns are formatted strings taken from the already-shifted
    # `created_at`, so they follow GMT-3 wall-clock time.
    .withColumn("created_at_partition_year", F.date_format("created_at", "yyyy"))
    .withColumn("created_at_partition_month", F.date_format("created_at", "MM"))
    .withColumn("created_at_partition_day", F.date_format("created_at", "dd"))
    .withColumn("created_at_partition_hour", F.date_format("created_at", "HH"))
    .select(
        "tweet_id",
        "created_at",
        "text",
        "hashtags",
        F.col("retweet_count").cast(IntegerType()),
        F.col("possibly_sensitive").cast(BooleanType()),
        "lang",
        "user_id",
        "created_at_partition_year",
        "created_at_partition_month",
        "created_at_partition_day",
        "created_at_partition_hour",
    )
)
# NOTE(review): toPandas() collects every row to the driver; fine for a small
# sample, consider `.limit(n)` before collecting if the input grows.
fact_tweet.toPandas()

Unnamed: 0,tweet_id,created_at,text,hashtags,retweet_count,possibly_sensitive,lang,user_id,created_at_partition_year,created_at_partition_month,created_at_partition_day,created_at_partition_hour
0,1358589255191248898,2021-02-07 22:31:34,"RT @mirianmirandas: Palmeiras sem mundial, e e...",,0,,pt,1320759564481011712,2021,02,07,22
1,1358589255782719490,2021-02-07 22:31:34,RT @Palmeiras: Ainda temos muita coisa pela fr...,,0,,pt,99367945,2021,02,07,22
2,1358589256218906625,2021-02-07 22:31:34,@LipoviskCR @ViniGod321 @feliped1s_ @VR_SEP @D...,,0,,pt,1276404290140151808,2021,02,07,22
3,1358589257548525571,2021-02-07 22:31:35,Abel Ferreira teve coragem de escalar o time t...,Palmeiras,0,,pt,1220008824880451584,2021,02,07,22
4,1358589259784028165,2021-02-07 22:31:35,RT @welingtondiogo: @gugachacra Não chora! Com...,,0,False,pt,1298780250030116869,2021,02,07,22
...,...,...,...,...,...,...,...,...,...,...,...,...
1552,1358928417023012866,2021-02-08 20:59:16,VOU CHORAAAR 😭,,0,,pt,1259958191502307333,2021,02,08,20
1553,1358928777359863822,2021-02-08 21:00:42,só falta o rica 😍😍,,0,,pt,1180475979212431360,2021,02,08,21
1554,1358928778588811264,2021-02-08 21:00:43,RT @arenasbt: 🔥 Cair na semifinal do Mundial d...,ArenaSBT,0,,pt,1319822403262959617,2021,02,08,21
1555,1358928773324881920,2021-02-08 21:00:41,As vão pro djabo kkkkkkkkkk,,0,,pt,608249788,2021,02,08,21


In [234]:
# Build the user dimension from the raw `tweets` frame:
#   1. parse the textual `user_created_at` into a timestamp and shift it from UTC to GMT-3,
#   2. project the user attributes, casting the boolean and count fields.
#
# The regex patterns are raw strings: `\+` is not a valid Python escape sequence,
# so a normal string literal triggers an invalid-escape warning (and is slated
# to become an error).
#
# NOTE(review): the projection is not de-duplicated, so the result has one row
# per input tweet rather than one row per `user_id` — confirm whether a
# `dropDuplicates(["user_id"])` (or similar) is expected downstream.
dim_user = (
    tweets
    .withColumn(
        "user_created_at",
        F.from_utc_timestamp(
            F.to_timestamp(
                F.regexp_replace(
                    # Drop the leading three-letter token and its trailing space ...
                    F.regexp_replace("user_created_at", r"^[A-Za-z]{3} ", ""),
                    # ... then drop the literal "+0000 " offset, leaving text that
                    # matches the parse pattern below.
                    r"\+0000 ", ""),
                "MMM dd HH:mm:ss yyyy"
            ),
            # The parsed value is treated as UTC and rendered in GMT-3.
            "GMT-3"
        )
    )
    .select(
        "user_id",
        "user_name",
        "user_description",
        F.col("user_verification").cast(BooleanType()),
        F.col("user_followers_count").cast(IntegerType()),
        F.col("user_friends_count").cast(IntegerType()),
        "user_created_at",
        "user_location",
    )
)
# NOTE(review): toPandas() collects every row to the driver; consider
# `.limit(n)` before collecting if the input grows.
dim_user.toPandas()

Unnamed: 0,user_id,user_name,user_description,user_verification,user_followers_count,user_friends_count,user_created_at,user_location
0,1320759564481011712,CRF🧜🏻‍♀️,•CRF 1985🔴⚫,False,74,76,2020-10-26 13:10:11,
1,99367945,Gabriella ~,,False,209,135,2009-12-25 19:04:18,Londrina
2,1276404290140151808,GUSTA ˢᶜᶜᵖ,ℂ𝕆ℝ𝕀𝕋ℍ𝕀𝔸ℕ𝕆 𝕄𝔸𝕃𝕆ℚ𝕌𝔼𝕀ℝ𝕆 𝕊𝕆𝔽ℝ𝔼𝔻𝕆ℝ 𝔾ℝ𝔸ℂ𝔸𝕊 𝔸 𝔻𝔼𝕌𝕊 !...,False,71,69,2020-06-26 03:40:44,code Gusta #ad
3,1220008824880451584,FORA BANANA GALIOTTE,"Torço pro Maior do Brasil. Fora Luiz Adriano, ...",False,44,230,2020-01-22 12:42:31,
4,1298780250030116869,KUSHINA COSPLAY,NÃO LIGO PARA O SEU CANCELAMENTO\nNÃO MUDA NAD...,False,3,22,2020-08-26 21:32:04,
...,...,...,...,...,...,...,...,...
1552,1259958191502307333,X caloteiro 🇮🇹,fodase bro 💚,False,707,805,2020-05-11 18:27:07,
1553,1180475979212431360,grandchamp ⓟ,@palmeiras💍,False,43,23,2019-10-05 10:33:25,🇮🇹
1554,1319822403262959617,Wilton Amaral,,False,20,111,2020-10-23 23:06:12,"São Paulo, Brasil"
1555,608249788,graciano,,False,1946,1000,2012-06-14 12:42:24,
