In [7]:
import random
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType

In [8]:
# Get (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .appName("user-sampling")
    .master("local[*]")
    .getOrCreate()
)

In [9]:
# Glob over the two directory levels below the raw parquet root.
raw_comments_glob = "../data/raw/pq/*/*"
df = spark.read.parquet(raw_comments_glob)

In [8]:
# user sampling:
def sample_users(users, fraction=0.02, max_users=50):
    """Draw a random sample of authors, ignoring the "[deleted]" placeholder.

    The sample size is ``fraction`` of the remaining authors (truncated to an
    integer), but never fewer than 1 and never more than ``max_users``.

    Args:
        users: Iterable of author names, or ``None`` (a null array column is
            passed to a Python UDF as ``None``).
        fraction: Share of the authors to sample. Defaults to 0.02 (2%).
        max_users: Upper bound on the sample size. Defaults to 50.

    Returns:
        A list of sampled author names, drawn without replacement; an empty
        list when ``users`` is ``None``/empty or contains only "[deleted]".
    """
    if not users:
        # Covers both None (null column value) and an empty collection.
        return []
    candidates = [user for user in users if user != "[deleted]"]
    if not candidates:
        return []
    sample_size = min(max(1, int(len(candidates) * fraction)), max_users)
    # random.sample raises ValueError if asked for more items than exist
    # (only reachable with a non-default fraction > 1).
    sample_size = min(sample_size, len(candidates))
    return random.sample(candidates, sample_size)

# sample_users draws a random sample, so the UDF must be flagged as
# non-deterministic: Spark treats UDFs as deterministic by default and the
# optimizer may then duplicate or eliminate invocations, which would yield
# inconsistent samples for the same row.
sample_users_func = F.udf(sample_users, ArrayType(StringType())).asNondeterministic()

In [9]:
# One row per (year, subreddit_id) holding the distinct authors seen in that group.
# The "[deleted]" placeholder is still present here; sample_users filters it out.
unique_authors_col = F.collect_set("author").alias("unique_authors")
df_users_group = (
    df.select("year", "subreddit_id", "author")
    .groupBy("year", "subreddit_id")
    .agg(unique_authors_col)
)

In [10]:
# Apply the Python sampling UDF to each group's author set.
sampled_authors_col = sample_users_func("unique_authors")
df_users_sampled = df_users_group.withColumn("sampled_authors", sampled_authors_col)

In [11]:
# Preview the first 10 groups with their sampled authors.
df_users_sampled.show(n=10)

+----+------------+--------------------+----------------+
|year|subreddit_id|      unique_authors| sampled_authors|
+----+------------+--------------------+----------------+
|2007|     t5_22i0|[jan, alrond, bay...|        [alrond]|
|2007|     t5_21n6|[Princesss, Tanai...|       [dend395]|
|2007|     t5_21nj|[MalloyMan, anand...|   [brainburger]|
|2007|     t5_247d|[TamaraBergfeld, ...|[TamaraBergfeld]|
|2007|     t5_4od4|[martindurf, fr_l...|        [fr_lut]|
|2007|     t5_247c|[33rr, turghata, ...|      [turghata]|
|2007|      t5_vf2|[ragedlinker, but...|      [uoouoo35]|
|2007|     t5_247i|[kit1980, antariu...|   [litemanager]|
|2007|     t5_22i2|  [frntk, [deleted]]|         [frntk]|
|2007|     t5_2478|[hamlet_tk, shrin...|  [colletthabbo]|
+----+------------+--------------------+----------------+
only showing top 10 rows



In [10]:
# user sampling (optimal)
# Map the "[deleted]" placeholder to null (when() without otherwise() yields null);
# collect_set skips nulls, so the placeholder never enters the author set.
non_deleted_author = F.when(F.col("author") != "[deleted]", F.col("author"))
df_users_grouped = (
    df.select("year", "subreddit_id", "author")
    .groupBy("year", "subreddit_id")
    .agg(F.collect_set(non_deleted_author).alias("unique_authors"))
)

In [11]:
# Sample size per (year, subreddit_id) group: 2% of the distinct authors,
# clamped to the range [1, 50]. Same rule as the Python version:
#   min(max(1, int(len(users) * 0.02)), 50)
sample_size = F.least(
    F.greatest(
        F.floor(F.size("unique_authors") * 0.02).cast("int"),
        F.lit(1),  # at least one author per group
    ),
    F.lit(50),  # at most fifty authors per group
)

# Shuffle the author set and keep the first `sample_size` entries.
#
# F.shuffle is non-deterministic and DataFrames are lazy, so every action that
# touches this frame (the CSV export of the sampled users, the join against the
# comments, each per-year write) would otherwise re-run the shuffle and could
# draw a *different* sample. Caching materialises the sample on first use so
# the exported user list matches the users whose comments are kept.
# NOTE: cache() is best-effort; if cached partitions are evicted or lost Spark
# recomputes them. For a hard guarantee, write this frame out and read it back.
df_users_sampled = df_users_grouped.withColumn(
    "sampled_authors",
    F.slice(F.shuffle("unique_authors"), 1, sample_size),
).cache()

In [12]:
# Flatten the sample: one row per (year, subreddit_id, sampled author).
# Groups whose sample array is empty produce no rows.
sampled_author_col = F.explode("sampled_authors").alias("sampled_authors")
sampled_authors_flat = df_users_sampled.select("year", "subreddit_id", sampled_author_col)

In [13]:
# Export the sampled users as CSV with a header row, replacing any previous
# export. coalesce(1) funnels everything into a single partition so that only
# one part file is written.
users_csv_writer = (
    sampled_authors_flat
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
)
users_csv_writer.csv("../data/sampled/users")

In [14]:
# Keep only the comments whose author was sampled for the comment's own
# (year, subreddit_id) group.
#
# This is a pure filter on `df`, so a left-semi join is used: it returns only
# the columns of `df`, and each comment appears at most once regardless of how
# many rows match on the right side (replaces inner join + select(df['*'])).
df_final = df.join(
    df_users_sampled,
    (df.year == df_users_sampled.year)
    & (df.subreddit_id == df_users_sampled.subreddit_id)
    & F.array_contains(df_users_sampled.sampled_authors, df.author),
    how="left_semi",
)

In [15]:
# Inclusive range of years to export (despite the *_date names, these are years).
start_date = 2006
end_date = 2007

# Write the sampled comments of each year to its own parquet directory,
# replacing any previous export. coalesce(1) yields a single part file per year.
# The former .option("header", "true") was dropped: "header" is a CSV option
# and has no effect on a parquet write.
for year in range(start_date, end_date + 1):
    df_year = df_final.filter(F.col("year") == year)
    (
        df_year
        .coalesce(1)
        .write
        .mode("overwrite")
        .parquet(f"../data/sampled/comments/users_comments_{year}")
    )