In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create (or reuse) a local Spark session that uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("user-comments")
    .getOrCreate()
)

In [3]:
# Load the raw parquet data; the glob matches entries two directory levels
# below ../data/raw/pq.
df = (
    spark.read
    .parquet("../data/raw/pq/*/*")
)

In [4]:
# Read the sampled-users CSV; the first row of each file supplies the
# column names.
df_authors = (
    spark.read
    .option("header", "true")
    .csv("../data/sampled/users")
)

In [5]:
# Prefix the sampling metadata with "source_" so these columns are
# distinguishable from the "year" / "subreddit_id" columns selected from the
# joined comment data further down.
df_authors = (
    df_authors
    .withColumnRenamed("year", "source_year")
    .withColumnRenamed("subreddit_id", "source_subreddit")
)

In [6]:
# Keep only rows whose author appears in the sampled-authors table, then
# drop the now-redundant join key coming from df_authors.
df_sampled = (
    df
    .join(
        df_authors,
        df.author == df_authors.sampled_authors,
        "inner",
    )
    .drop(df_authors.sampled_authors)
)

In [11]:
# Build the "permalink" column for every row:
#   https://www.reddit.com/r/{subreddit}/comments/{link_id without "t3_"}/_/{id}
# The pattern is anchored with "^" so that only a leading "t3_" prefix is
# stripped from link_id, never an occurrence further inside the value.
df_sampled = df_sampled.withColumn(
    "permalink",
    F.concat(
        F.lit("https://www.reddit.com/r/"),
        F.col("subreddit"),
        F.lit("/comments/"),
        F.regexp_replace(F.col("link_id"), "^t3_", ""),
        F.lit("/_/"),
        F.col("id"),
    ),
)

In [12]:
# Save dataframes:
# select the columns that make up the sampled authors' comment history.
df_comments_history_full = df_sampled.select(
    "author",
    "subreddit_id",
    "subreddit",
    "year",
    "month",
    "body",
    "created_utc_ts",
    "link_id",
    "permalink",
    "source_subreddit",
    "source_year",
)

# coalesce(1) collapses the data into a single partition before writing, and
# mode("overwrite") replaces any existing output at the target path.
# No "header" option is set: it is a CSV setting and has no effect on a
# parquet write.
(
    df_comments_history_full
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("../data/sampled/comments_history/")
)

In [11]:
# Permalink:
# https://www.reddit.com/r/{subreddit}/comments/{link_id_without_t3}/_/{comment_id}

# link id:
# t3_...: the post ID (submission ID)

# in parent id:
# t3_...: parent is a post
# t1_...: parent is a comment