In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time
from datetime import datetime

# Build (or reuse) the Spark session for this notebook, connecting to the
# standalone master named below.
spark_session = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Primary Test")
    # Dynamic allocation, using shuffle tracking with the external shuffle
    # service switched off; idle executors are released after 30 seconds.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.shuffle.service.enabled", False)
    # Fixed driver / block-manager ports.
    # NOTE(review): presumably pinned so they can be opened between the
    # notebook host and the cluster - confirm against the deployment.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Convenience handle on the session's underlying SparkContext.
sc = spark_session.sparkContext

In [None]:
# Count how often each curse word occurs per subreddit and time the job.
#
# Spark transformations are lazy: nothing below runs until an action is
# called. The timer is therefore stopped only after `result_list` has been
# materialised, so the measured time covers the actual computation
# (load + tokenise + filter + aggregate + sort) but not the CSV export.
start_time = time.time()

df = spark_session.read.json('hdfs://hdfs:9000/user/ubuntu/corpus-webis-tldr-17.json')

# Tokenise: one row per space-separated token of `content`.
df_split = df.withColumn("content_array", split(col("content"), " "))
df_exploded = df_split.select("id", "subreddit", explode("content_array").alias("word"))

# Lower-case every token except the exact token "I".
df_exploded_lower = df_exploded.withColumn(
    "word",
    when(col("word") == "I", col("word")).otherwise(lower(col("word"))),
)

# Keep only non-null, non-empty tokens made purely of ASCII letters/digits.
# Tokens with attached punctuation (e.g. "word,") are dropped, not stripped.
df_filtered = (
    df_exploded_lower
    .filter(~col("word").rlike("[^a-zA-Z0-9]"))
    .filter(col("word").isNotNull() & (col("word") != ""))
)

df_swear_list = ['anal', 'anus', 'arse', 'ass', 'balls', 'ballsack', 'bastard', 'biatch', 'bitch', 'bloody', 'blow job', 'blowjob', 'bollock', 'bollok', 'boner', 'boob', 'bugger', 'bum', 'butt', 'buttplug', 'clitoris', 'cock', 'coon', 'crap', 'cunt', 'damn', 'dick', 'dildo', 'dyke', 'f u c k', 'fag', 'feck', 'felching', 'fellate', 'fellatio', 'flange', 'fuck', 'fudge packer', 'fudgepacker', 'God damn', 'Goddamn', 'hell', 'homo', 'jerk', 'jizz', 'knob end', 'knobend', 'labia', 'lmao', 'lmfao', 'muff', 'nigga', 'nigger', 'omg', 'penis', 'piss', 'poop', 'prick', 'pube', 'pussy', 'queer', 's hit', 'scrotum', 'sex', 'sh1t', 'shit', 'slut', 'smegma', 'spunk', 'tit', 'tosser', 'turd', 'twat', 'vagina', 'wank', 'whore', 'wtf']

# Tokens were lower-cased above, so capitalised list entries such as
# 'Goddamn' could never match; compare against a lower-cased copy instead.
# NOTE: entries containing a space ('blow job', 'f u c k', ...) still cannot
# match, because each token is a single space-free word.
swear_words_lower = [entry.lower() for entry in df_swear_list]
df_filtered_swearWords = (
    df_filtered
    .filter(col("word").isin(swear_words_lower))
    .select("subreddit", "word")
)

# Legacy intermediates, kept only so any cell that still refers to them keeps
# working. Both are lazy and are no longer on the path to `result_list`:
# collecting every word of a subreddit into one array and exploding it again
# yields exactly the rows of `df_filtered_swearWords`, at the cost of an extra
# shuffle and one potentially huge array per subreddit.
word_frequencies = df_filtered_swearWords.groupBy("subreddit").agg(collect_list("word").alias("curse_words"))
df_exploded1 = df_filtered_swearWords

# Occurrences per (subreddit, word), most frequent first. Cached so that the
# timing action below and the later CSV export share a single computation.
result_list = (
    df_filtered_swearWords
    .groupBy("subreddit", "word")
    .count()
    .sort(desc("count"))
    .cache()
)

# Action: forces the whole pipeline to execute and fills the cache.
result_list.count()

end_time = time.time()

In [ ]:
# Report the duration captured by the timers of the processing cell.
elapsed_time = end_time - start_time
print(f"Processing time: {elapsed_time} seconds")

# A single timestamp tags both outputs of this run so they can be paired up.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Export the per-subreddit curse-word counts as CSV with a header row.
# NOTE(review): the output paths are relative, so where they land depends on
# the cluster's default filesystem / working directory - confirm.
(
    result_list.write
    .mode('overwrite')
    .option('header', True)
    .csv(f'curse_word_frequencies_{timestamp}.csv')
)

# Store the timing as a one-row DataFrame next to the results.
time_results = spark_session.createDataFrame(
    [(timestamp, elapsed_time)],
    ["timestamp", "processing_time"],
)
(
    time_results.write
    .mode('overwrite')
    .option('header', True)
    .csv(f'time_results_{timestamp}.csv')
)

In [None]:
# Shut down the SparkContext taken from `spark_session` in the first cell.
# Run this last: Spark work in any cell executed afterwards will fail until
# the session is created again.
sc.stop()