In [1]:
from pyspark.sql import SparkSession


############################################################
# Setup session and files
############################################################

# Configure the session against the standalone master. Dynamic allocation is
# switched on with shuffle tracking (the external shuffle service is disabled),
# idle executors are released after 30s, and the driver / block-manager ports
# are pinned to fixed values.
session_builder = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("tessst")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
)
spark_session = session_builder.getOrCreate()

# Handle on the underlying SparkContext, used later to shut the application down.
sc = spark_session.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/11 12:11:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/11 12:11:25 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


In [2]:
# Load the JSON corpus from HDFS into a DataFrame.
corpus_reader = spark_session.read
df = corpus_reader.json('hdfs://hdfs:9000/user/ubuntu/corpus-webis-tldr-17.json')
# Fetch a single row to inspect what a record looks like.
df.take(1)

                                                                                

[Row(author='raysofdarkmatter', body="I think it should be fixed on either UTC standard or UTC+1 year around, with the current zone offsets.\n\nMoving timescales add a lot of complexity to the implementation of timekeeping systems and have [dubious value]( \n\nI think seasonal shifting time made sense in the pre-electric past, when timekeeping was more flexible and artificial light was inefficient and often dangerous. \n\nNow we have machines that work easily with simple timekeeping rules, and it's more beneficial to spend a small amount on energy for lighting, and save the larger cost of engineering things to work with the complex timekeeping rules, as well as saving the irritation to humans.\n\nLighting has gotten much more efficient over time; we can squeeze out a lot more photons per unit of energy from a 2012 CFL or LED than a candle could in 1780, or a lightbulb could in 1950. \n\nThere's a lot of room for improvement in how we use lights as well; as lighting control gets more in

In [11]:

############################################################
# Process data
############################################################

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re
import json

# Words whose per-subreddit occurrences are counted.
word_array = ["idiot", "dumbass", "asshole", "cunt"]


def _normalize_text(field):
    """Lowercase ``field`` and strip every character that is not kept.

    Kept characters are ASCII letters, digits, whitespace and the letters
    å/ä/ö (either case). A null input is passed through as ``None`` instead of
    raising, so a single record with missing text cannot abort the Spark job.
    """
    if field is None:
        return None
    return re.sub(r'[^a-zA-Z0-9\s\nåäöÅÄÖ]', '', field.lower())


process_body_udf = udf(_normalize_text, StringType())

# Replace the "content" column with its normalized form.
df = df.withColumn("content", process_body_udf("content"))

def map_content(row, words=None):
    """Return ``(subreddit, word)`` pairs for each tracked word found in a row.

    Args:
        row: Record supporting ``row["subreddit"]`` and ``row["content"]``.
        words: Optional iterable of words to look for. When omitted, the
            module-level ``word_array`` is used.

    Returns:
        A list with one ``(subreddit, word)`` tuple per word that occurs at
        least once in the content as a whole word (case-insensitive). The list
        is empty when the content is ``None`` or no word matches.
    """
    content = row["content"]
    if content is None:
        # Nothing to search; avoids a TypeError from the regex engine.
        return []
    if words is None:
        words = word_array
    subreddit = row["subreddit"]
    return [
        (subreddit, word)
        for word in words
        # re.escape keeps regex metacharacters in a word literal; re.search
        # stops at the first hit instead of collecting every match.
        if re.search(r'\b' + re.escape(word) + r'\b', content, re.IGNORECASE)
    ]

# Expand every record into zero or more (subreddit, word) pairs.
rdd = df.rdd.flatMap(map_content)

# Give the schema explicitly rather than only column names: with names alone
# the column types are inferred from the first element, which raises when the
# RDD is empty. (The previous mid-cell rdd.take(10) was dropped: its result was
# never displayed or used, so it only cost an extra Spark job.)
df_result = rdd.toDF("subreddit: string, word: string")

df_result.show()

[Stage 11:>                                                         (0 + 1) / 1]

+-----------------+-------+
|        subreddit|   word|
+-----------------+-------+
|        AskReddit|  idiot|
|       technology|asshole|
|        AskReddit|asshole|
|              NAU|dumbass|
|        AskReddit|dumbass|
|       Torchlight|  idiot|
|        AskReddit|dumbass|
|       minimalism|  idiot|
|        AskReddit|  idiot|
|            trees|asshole|
|        Minecraft|asshole|
|        AskReddit|  idiot|
|        AskReddit|asshole|
|             halo|   cunt|
|            funny|  idiot|
|          running|  idiot|
|  TalesFromRetail|asshole|
|             tifu|dumbass|
|        AskReddit|asshole|
|explainlikeimfive|asshole|
+-----------------+-------+
only showing top 20 rows



                                                                                

In [14]:
def _attach_unit_count(pair):
    """Turn ``(subreddit, word)`` into ``(subreddit, (word, 1))``."""
    subreddit, word = pair
    return subreddit, (word, 1)


# Key each hit by subreddit and attach an initial count of one.
rdd_map = rdd.map(_attach_unit_count)
rdd_map.take(10)

                                                                                

[('AskReddit', ('idiot', 1)),
 ('technology', ('asshole', 1)),
 ('AskReddit', ('asshole', 1)),
 ('NAU', ('dumbass', 1)),
 ('AskReddit', ('dumbass', 1)),
 ('Torchlight', ('idiot', 1)),
 ('AskReddit', ('dumbass', 1)),
 ('minimalism', ('idiot', 1)),
 ('AskReddit', ('idiot', 1)),
 ('trees', ('asshole', 1))]

In [17]:
def _rekey_by_subreddit_and_word(entry):
    """Turn ``(subreddit, (word, n))`` into ``((subreddit, word), n)``."""
    subreddit, (word, count) = entry
    return (subreddit, word), count


# Move the word into the key so counts can be reduced per (subreddit, word).
rdd_pair = rdd_map.map(_rekey_by_subreddit_and_word)
rdd_pair.take(10)

                                                                                

[(('AskReddit', 'idiot'), 1),
 (('technology', 'asshole'), 1),
 (('AskReddit', 'asshole'), 1),
 (('NAU', 'dumbass'), 1),
 (('AskReddit', 'dumbass'), 1),
 (('Torchlight', 'idiot'), 1),
 (('AskReddit', 'dumbass'), 1),
 (('minimalism', 'idiot'), 1),
 (('AskReddit', 'idiot'), 1),
 (('trees', 'asshole'), 1)]

In [19]:
from operator import add

# Sum the per-hit ones for every (subreddit, word) key.
rdd_counts = rdd_pair.reduceByKey(func=add)
# Preview ten of the aggregated pairs.
rdd_counts.take(10)

                                                                                

[(('circlebroke', 'asshole'), 30),
 (('fantasyfootball', 'asshole'), 28),
 (('IAmA', 'asshole'), 360),
 (('steroids', 'asshole'), 13),
 (('TrueTrueReddit', 'idiot'), 1),
 (('Braveryjerk', 'asshole'), 6),
 (('dating_advice', 'asshole'), 204),
 (('books', 'idiot'), 26),
 (('religion', 'asshole'), 5),
 (('RedvsBlue', 'asshole'), 1)]

In [36]:
# Flatten ((subreddit, word), count) into a plain (subreddit, word, count) triple.
rdd_subreddit = rdd_counts.map(lambda keyed_total: (*keyed_total[0], keyed_total[1]))
rdd_subreddit.take(10)

[('circlebroke', 'asshole', 30),
 ('fantasyfootball', 'asshole', 28),
 ('IAmA', 'asshole', 360),
 ('steroids', 'asshole', 13),
 ('TrueTrueReddit', 'idiot', 1),
 ('Braveryjerk', 'asshole', 6),
 ('dating_advice', 'asshole', 204),
 ('books', 'idiot', 26),
 ('religion', 'asshole', 5),
 ('RedvsBlue', 'asshole', 1)]

In [37]:
# Bind the counts to their own name instead of rebinding `df`. The processing
# cell calls df.withColumn("content", ...), which fails on re-run if `df` has
# been replaced by this three-column (subreddit, word, count) frame.
word_counts_df = rdd_subreddit.toDF(["subreddit", "word", "count"])

# One row per subreddit and one column per word; (subreddit, word)
# combinations that never occurred are shown as 0 rather than null.
pivoted_df = (
    word_counts_df
    .groupBy("subreddit")
    .pivot("word")
    .sum("count")
    .na.fill(0)
)

pivoted_df.show()



+----------------+-------+----+-------+-----+
|       subreddit|asshole|cunt|dumbass|idiot|
+----------------+-------+----+-------+-----+
|      MensRights|     98|  38|     15|   97|
|   RSBottingGoal|      0|   0|      0|    1|
|      MLBTheShow|      0|   0|      1|    3|
|  TrueOffMyChest|     17|   7|      3|   12|
|       bookshelf|      0|   1|      0|    0|
|       FrozenFun|      1|   0|      0|    0|
|    couchsurfing|      0|   0|      0|    1|
|          travel|     15|   1|      2|   30|
|    marvelheroes|      2|   0|      0|    1|
|      QuotesPorn|      3|   0|      0|    3|
|        lacrosse|      5|   0|      0|    1|
|           HPMOR|      0|   0|      0|    5|
|   ThisWarofMine|      0|   0|      0|    1|
|     battlefront|      0|   0|      0|    1|
|           anime|     40|  11|     13|   54|
|freelanceWriters|      0|   0|      0|    2|
|    SaltLakeCity|      3|   0|      0|    0|
|       metro2033|      0|   0|      0|    1|
|DarkSouls2League|      0|   0|   

                                                                                

                                                                                

[('Documentaries', 'idiot', 9),
 ('Documentaries', 'asshole', 7),
 ('Documentaries', 'cunt', 1),
 ('Documentaries', 'dumbass', 1),
 ('freebies', 'idiot', 2),
 ('restorethefourth', 'idiot', 1),
 ('BatmanArkham', 'idiot', 3),
 ('BatmanArkham', 'asshole', 1),
 ('SanctionedSuicide', 'asshole', 4),
 ('SanctionedSuicide', 'idiot', 2)]

In [None]:
# Shut down the SparkContext behind the session to end the application.
spark_session.sparkContext.stop()