In [ ]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
import re
import time
from datetime import datetime

############################################################
# Setup session and files
############################################################

# Spark options handed to the session builder one by one, in this order.
_session_options = (
    ("spark.dynamicAllocation.enabled", True),
    ("spark.dynamicAllocation.shuffleTracking.enabled", True),
    ("spark.shuffle.service.enabled", False),
    ("spark.driver.port", 9999),
    ("spark.blockManager.port", 10005),
    ("spark.dynamicAllocation.executorIdleTimeout", "30s"),
)

# Point the builder at the standalone master and name the application
# before applying the individual options.
_builder = SparkSession.builder.master("spark://spark-master:7077").appName("Primary Test")
for _option, _setting in _session_options:
    _builder = _builder.config(_option, _setting)

spark_session = _builder.getOrCreate()

# Lower-level context backing the session; stopped at the end of the file.
sc = spark_session.sparkContext

In [ ]:
# Wall-clock timestamp taken right before the corpus is read.
start_time = time.time()

# Location of the JSON corpus on HDFS.
corpus_path = 'hdfs://hdfs:9000/user/ubuntu/corpus-webis-tldr-17.json'

reader = spark_session.read
df = reader.json(corpus_path)
# Fetch a single record to the driver as a quick look at the loaded data.
df.take(1)

In [ ]:
############################################################
# Process data
############################################################

import json

# Words that map_content searches for (as whole words) in each post body.
word_array = ["idiot", "dumbass", "asshole", "cunt"]


def _normalize_content(field):
    """Lower-case a post body and drop every character that is not kept.

    Kept characters are ASCII letters and digits, whitespace, and the
    letters å/ä/ö in either case.

    Args:
        field: The raw ``content`` value, or ``None`` when the column is
            null for a record (Spark hands SQL NULL to Python UDFs as None).

    Returns:
        The cleaned text. A null input yields ``""`` so that the regex
        matching performed later on this column always receives a string
        instead of failing the whole job on one missing body.
    """
    if field is None:
        return ""
    return re.sub(r'[^a-zA-Z0-9\s\nåäöÅÄÖ]', '', field.lower())


process_body_udf = udf(_normalize_content, StringType())

# Replace the raw body with its cleaned form, keeping the column name.
df = df.withColumn("content", process_body_udf("content"))

def map_content(row, words=None):
    """Return a ``(subreddit, word)`` pair for every searched word found in a post.

    Args:
        row: Record supporting ``row["subreddit"]`` and ``row["content"]``.
        words: Words to search for. Each one is placed between
            word-boundary anchors in a regular expression exactly as given,
            so it should not contain regex metacharacters. Defaults to the
            module-level ``word_array``.

    Returns:
        A list holding one ``(subreddit, word)`` tuple per word that occurs
        at least once in the content as a whole word, compared
        case-insensitively and listed in the order of ``words``. The list
        is empty when nothing matches or when the content is ``None``.
    """
    if words is None:
        words = word_array

    content = row["content"]
    if content is None:
        # A missing body cannot contain any word, and the re functions
        # raise TypeError when given None.
        return []

    subreddit = row["subreddit"]
    # re.search stops at the first hit; only presence matters here, not
    # the number of occurrences.
    return [
        (subreddit, word)
        for word in words
        if re.search(r'\b' + word + r'\b', content, re.IGNORECASE)
    ]

# One (subreddit, word) record for every searched word found in a post.
rdd = df.rdd.flatMap(map_content)

# NOTE(review): this value is not assigned or printed; in a notebook only a
# cell's last expression is displayed, so this sample is likely never seen.
rdd.take(10)

# Spell the schema out instead of passing only column names: name-only
# toDF infers the types from the first record, which costs an extra job
# and raises when the RDD has no records at all.
df_result = rdd.toDF("subreddit string, word string")

df_result.show()

In [ ]:
def _attach_unit_count(subreddit_word):
    """Turn ``(subreddit, word)`` into ``(subreddit, (word, 1))``."""
    subreddit = subreddit_word[0]
    word = subreddit_word[1]
    return (subreddit, (word, 1))


rdd_map = rdd.map(_attach_unit_count)
rdd_map.take(10)

In [ ]:
def _key_by_subreddit_and_word(record):
    """Re-key ``(subreddit, (word, n))`` as ``((subreddit, word), n)``."""
    subreddit = record[0]
    word_and_count = record[1]
    return ((subreddit, word_and_count[0]), word_and_count[1])


rdd_pair = rdd_map.map(_key_by_subreddit_and_word)
rdd_pair.take(10)

In [ ]:
from operator import add

# Add up the values of all records that share the same (subreddit, word)
# key, giving one total per key.
rdd_counts = rdd_pair.reduceByKey(add)
# Bring up to 10 of the aggregated records to the driver for inspection.
rdd_counts.take(10)

In [ ]:
def _flatten_count(entry):
    """Flatten ``((subreddit, word), total)`` into ``(subreddit, word, total)``."""
    key = entry[0]
    total = entry[1]
    return (key[0], key[1], total)


rdd_subreddit = rdd_counts.map(_flatten_count)
rdd_subreddit.take(10)

In [ ]:
# NOTE: this rebinds `df`, which until here referred to the corpus DataFrame.
# The schema is given explicitly so no inference job runs and an RDD without
# records still converts instead of raising.
df = rdd_subreddit.toDF("subreddit string, word string, count bigint")

# One row per subreddit and one column per searched word. Passing the pivot
# values up front spares Spark a separate job that would scan the data just
# to discover the distinct words, and gives a fixed set of columns: a word
# that never matched shows up as a column of zeros instead of being absent.
# The values are sorted to keep the alphabetical column order that pivot
# produces when it discovers the values itself.
pivoted_df = (
    df.groupBy("subreddit")
    .pivot("word", sorted(word_array))
    .sum("count")
    .na.fill(0)
)

pivoted_df.show()

In [None]:
# Shut down the SparkContext obtained from the session during setup.
sc.stop()