In [2]:
import sys
assert sys.version_info >= (3, 8) # make sure we have Python 3.8+
from pyspark.sql import SparkSession, functions, types

# Explicit schema for the Reddit comment JSON records, kept as
# (field name, Spark type class) pairs in the order they appear in the StructType.
_comment_fields = [
    ('archived', types.BooleanType),
    ('author', types.StringType),
    ('author_flair_css_class', types.StringType),
    ('author_flair_text', types.StringType),
    ('body', types.StringType),
    ('controversiality', types.LongType),
    ('created_utc', types.StringType),
    ('distinguished', types.StringType),
    ('downs', types.LongType),
    ('edited', types.StringType),
    ('gilded', types.LongType),
    ('id', types.StringType),
    ('link_id', types.StringType),
    ('name', types.StringType),
    ('parent_id', types.StringType),
    ('retrieved_on', types.LongType),
    ('score', types.LongType),
    ('score_hidden', types.BooleanType),
    ('subreddit', types.StringType),
    ('subreddit_id', types.StringType),
    ('ups', types.LongType),
    # Disabled fields, kept for reference:
    # ('year', types.IntegerType),
    # ('month', types.IntegerType),
]

comments_schema = types.StructType([
    types.StructField(field_name, field_type())
    for field_name, field_type in _comment_fields
])


def main(in_directory, out_directory):
    """Set up the comments DataFrame; analysis and output are not implemented yet.

    in_directory: path handed to Spark's JSON reader, parsed with comments_schema.
    out_directory: not used by the current body (the write below is disabled).

    Expects a module-level `spark` session to exist when called.
    """
    reddit_comments = spark.read.schema(comments_schema).json(in_directory)

    # TODO: compute best_author from reddit_comments, then enable the write:
    #best_author.write.json(out_directory, mode='overwrite')


# if __name__=='__main__':
#     in_directory = sys.argv[1]
#     out_directory = sys.argv[2]
#     spark = SparkSession.builder.appName('Reddit Relative Scores').getOrCreate()
#     assert spark.version >= '3.2' # make sure we have Spark 3.2+
#     spark.sparkContext.setLogLevel('WARN')

#     main(in_directory, out_directory)


In [16]:
# Notebook configuration: input data location and (currently unused) output path.
in_directory = 'reddit-1'
out_directory = 'output'

spark = SparkSession.builder.appName('Reddit Relative Scores').getOrCreate()

# Check the version numerically. Comparing the raw strings is lexicographic,
# so e.g. '3.10' or '10.0' would sort before '3.2' and wrongly fail the check.
spark_major_minor = tuple(int(part) for part in spark.version.split('.')[:2])
assert spark_major_minor >= (3, 2) # make sure we have Spark 3.2+
spark.sparkContext.setLogLevel('WARN')

# Read the JSON comments with the explicit schema instead of inferring one.
comments = spark.read.json(in_directory, schema=comments_schema)


In [17]:
# NOTE: this cell rebinds `comments`; re-running it without first re-loading the
# data joins the averages a second time and duplicates the 'avg(score)' column.

# Mean score per subreddit. Only subreddits with a positive mean are kept, so the
# inner join below also drops the comments of every other subreddit.
averages = comments.groupBy(comments['subreddit']).avg('score')
averages = averages.filter(averages['avg(score)']>0)

# `averages` holds one row per subreddit, so hint Spark to broadcast it to the
# executors rather than shuffling the much larger comments data for the join.
comments = comments.join(functions.broadcast(averages), 'subreddit')

# Relative score = a comment's score divided by its subreddit's mean score.
comments = comments.withColumn("rel_score", comments['score']/comments['avg(score)'])

+---------+------------------+-----+-----------------+
|subreddit|            author|score|       avg(score)|
+---------+------------------+-----+-----------------+
|     xkcd|         [deleted]|    1|5.272939881689366|
|    scala|             gtani|    1|1.928939237899073|
|    scala|            ohdeno|    2|1.928939237899073|
|    scala|            ohdeno|    2|1.928939237899073|
|     xkcd|         A_Simpson|   10|5.272939881689366|
|    scala|             gtani|    1|1.928939237899073|
|Genealogy|          Starly24|    2|1.871313672922252|
|Genealogy|   StabMasterArson|    1|1.871313672922252|
|     xkcd|         PirateMud|    1|5.272939881689366|
|     xkcd|        DerFrycook|    2|5.272939881689366|
|     xkcd|            whtrbt|    1|5.272939881689366|
|     xkcd|           gfixler|    3|5.272939881689366|
|     xkcd|genericusername123|    7|5.272939881689366|
|     xkcd|        Canteloupe|   17|5.272939881689366|
|     xkcd|    AgentConundrum|    2|5.272939881689366|
|     xkcd

In [22]:
# Highest relative score within each subreddit (one row per subreddit), attached
# to every comment as the 'max(rel_score)' column.
max_score = comments.groupBy(comments['subreddit']).max('rel_score')

# Small per-subreddit aggregate: broadcast it instead of shuffling for the join.
comments = comments.join(functions.broadcast(max_score), 'subreddit')
comments.show()

+---------+------------------+-----+-----------------+------------------+-----------------+
|subreddit|            author|score|       avg(score)|         rel_score|   max(rel_score)|
+---------+------------------+-----+-----------------+------------------+-----------------+
|     xkcd|         [deleted]|    1|5.272939881689366|0.1896475253724334|63.15262594902032|
|     xkcd|         A_Simpson|   10|5.272939881689366| 1.896475253724334|63.15262594902032|
|     xkcd|         PirateMud|    1|5.272939881689366|0.1896475253724334|63.15262594902032|
|     xkcd|        DerFrycook|    2|5.272939881689366|0.3792950507448668|63.15262594902032|
|     xkcd|            whtrbt|    1|5.272939881689366|0.1896475253724334|63.15262594902032|
|     xkcd|           gfixler|    3|5.272939881689366|0.5689425761173001|63.15262594902032|
|     xkcd|genericusername123|    7|5.272939881689366|1.3275326776070338|63.15262594902032|
|     xkcd|        Canteloupe|   17|5.272939881689366| 3.224007931331368|63.1526

In [23]:
# Keep only the comments whose relative score equals their subreddit's maximum;
# ties within a subreddit all survive the filter.
is_subreddit_best = comments['rel_score'] == comments['max(rel_score)']
comments = comments.filter(is_subreddit_best)
comments.show()

+---------+-----------+-----+------------------+------------------+------------------+
|subreddit|     author|score|        avg(score)|         rel_score|    max(rel_score)|
+---------+-----------+-----+------------------+------------------+------------------+
|     xkcd|  shigawire|  333| 5.272939881689366| 63.15262594902032| 63.15262594902032|
|    scala|   TheSmoke|   17| 1.928939237899073| 8.813134009610252| 8.813134009610252|
|optometry|Klinefelter|    6|1.4701986754966887| 4.081081081081082| 4.081081081081082|
|  Cameras|  [deleted]|    2|1.2222222222222223|1.6363636363636362|1.6363636363636362|
|  Cameras|TogOfStills|    2|1.2222222222222223|1.6363636363636362|1.6363636363636362|
|Genealogy|  ackbar420|   12| 1.871313672922252| 6.412607449856734| 6.412607449856734|
+---------+-----------+-----+------------------+------------------+------------------+



In [24]:
# Final projection: just the columns that identify the top author(s) per subreddit.
result_columns = ["subreddit", "author", "rel_score"]
best_author = comments.select(result_columns)
best_author.show()

                                                                                

+---------+-----------+------------------+
|subreddit|     author|         rel_score|
+---------+-----------+------------------+
|     xkcd|  shigawire| 63.15262594902032|
|    scala|   TheSmoke| 8.813134009610252|
|optometry|Klinefelter| 4.081081081081082|
|  Cameras|  [deleted]|1.6363636363636362|
|  Cameras|TogOfStills|1.6363636363636362|
|Genealogy|  ackbar420| 6.412607449856734|
+---------+-----------+------------------+

