In [1]:
import json
import pandas as pd
import numpy as np
import os
from glob import glob
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.ml as M
import pyspark.sql.functions as F
import pyspark.sql.types as T
# Build (or reuse) one local Spark session for the whole notebook.
#
# Going through the builder's getOrCreate() keeps this cell re-runnable:
# constructing SparkContext(...) directly raises if a context already exists
# in the kernel. All settings are supplied before the context is created,
# because options such as spark.ui.showConsoleProgress are read at context
# start-up and cannot be switched on reliably afterwards via spark.conf.set.
#
# NOTE(review): with master "local" everything runs inside the driver JVM, so
# spark.executor.memory is probably not what bounds memory here -- confirm
# whether spark.driver.memory should be configured at launch instead.
spark = (
    SparkSession.builder
    .master("local")
    .appName("App Name")
    .config("spark.executor.memory", "32g")
    .config("spark.ui.showConsoleProgress", "true")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Keep the `sc` handle available for any code that uses the context directly.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [2]:
# Load every per-subreddit posts CSV into one pandas DataFrame.
#
# glob() returns matches in arbitrary (filesystem) order, so the paths are
# sorted to make the concatenated row order -- and the head() preview below --
# reproducible across runs and machines.
post_paths = sorted(
    glob('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/data/raw/posts/*.csv')
)
if not post_paths:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError('no posts CSV files matched the raw posts directory')

# NOTE(review): the files seem to carry a saved index column (it surfaces as
# "Unnamed: 0" in the preview); it is kept as-is here -- confirm whether it
# should be dropped.
posts = pd.concat([pd.read_csv(path) for path in post_paths], ignore_index=True)
posts.head()

Unnamed: 0,id,author,title,selftext,num_comments,created_utc,full_link,subreddit,score
0,fsogd7,da-doggo2468,Some diamond armor perler beads I made,,8,1585699169,https://www.reddit.com/r/Minecraft/comments/fs...,Minecraft,24
1,fsog50,PsaPanic,Xbox to PC cross play not working?,Me and my girlfriend are tryna play on my Xbox...,17,1585699150,https://www.reddit.com/r/Minecraft/comments/fs...,Minecraft,1
2,fsofzi,rubychoco99,Are there any shaders just for lighting?,I don’t want shaders that change what blocks l...,6,1585699135,https://www.reddit.com/r/Minecraft/comments/fs...,Minecraft,3
3,fsofsb,greengamer01,Had a dream that minecraft came out on the psp...,,5,1585699116,https://www.reddit.com/r/Minecraft/comments/fs...,Minecraft,5
4,fsofom,starBURST312,My iron farm was looking pretty basic... so I ...,,12,1585699106,https://www.reddit.com/r/Minecraft/comments/fs...,Minecraft,17


In [3]:
# Number of distinct post ids across all loaded subreddit files.
posts['id'].nunique()

250000

In [4]:
# Read every file in the raw comments directory into a single Spark DataFrame,
# treating the first line of each file as the column header. No schema is
# given, so the columns are loaded with Spark's default CSV typing.
comments = spark.read.csv(
    glob('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/data/raw/comments/*'),
    header=True,
)

In [5]:
# Total number of rows (comments) in the loaded Spark DataFrame.
comments.count()

2770517

In [6]:
# Print a preview of the first rows of the comments DataFrame to inspect its columns.
comments.show()

+-------+-------------------+-----------+------------+-------------+---------+------------+
|     id|             author|created_utc|is_submitter|    subreddit|  link_id|send_replies|
+-------+-------------------+-----------+------------+-------------+---------+------------+
|fm2b5i7|     LesserPyrenees| 1585693985|       False|todayilearned|t3_fsmwuu|        True|
|fm2b6h8|clearliquidclearjar| 1585693999|       False|todayilearned|t3_fsmwuu|        True|
|fm2bb71|       PorkfatWilly| 1585694072|       False|todayilearned|t3_fsmwuu|        True|
|fm2bmo3|        TippsAttack| 1585694253|       False|todayilearned|t3_fsn07n|        True|
|fm2btww|          agent3dev| 1585694366|       False|todayilearned|t3_fsmwuu|        True|
|fm2c6eb|      iAmH3r3ToH3lp| 1585694560|       False|todayilearned|t3_fsmwuu|        True|
|fm2cdqh|      AutoModerator| 1585694673|       False|todayilearned|t3_fsn65j|       False|
|fm2cs79|      iAmH3r3ToH3lp| 1585694899|       False|todayilearned|t3_fsmu6g|  

In [7]:
# Count how many different link_id values appear among the comments.
comments.select('link_id').dropDuplicates().count()

177908

In [8]:
# Tally comments by the value of the is_submitter flag and print the table.
comments.groupBy('is_submitter').count().show()

+------------+-------+
|is_submitter|  count|
+------------+-------+
|        True| 276309|
|       False|2494208|
+------------+-------+



In [9]:
# Count how many different author values appear among the comments.
comments.select('author').dropDuplicates().count()

695560

In [6]:
# Root directory of the raw data; later cells append sub-directories such as '/posts' to it.
filepath = "/datasets/dsc180a-wi20-public/Malware/group_data/group_02/data/raw"

In [34]:
# Build per-subreddit comment statistics from the posts CSVs, one file per
# subreddit. The result is a dict of equal-length column lists:
#   subreddit        file name without its extension
#   num_post         number of rows in the file
#   total_comment    sum of the num_comments column
#   avg_comment      total_comment / num_post, truncated to an int
#   top_num_comment  largest num_comments value in the file
subreddit_info = {'subreddit':[],'num_post':[],'total_comment':[],'avg_comment':[],'top_num_comment':[]}
posts_dir = os.path.join(filepath, 'posts')

# os.listdir() order is arbitrary; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(posts_dir)):
    if not filename.endswith(".csv"):
        continue

    subreddit_posts = pd.read_csv(os.path.join(posts_dir, filename))
    num_posts = len(subreddit_posts)
    # Computed once and reused for both the total and the average.
    total_comments = subreddit_posts['num_comments'].sum()

    subreddit_info['subreddit'].append(os.path.splitext(filename)[0])
    subreddit_info['num_post'].append(num_posts)
    subreddit_info['total_comment'].append(total_comments)
    # A file with a header but no rows would otherwise divide by zero (or feed
    # NaN to int()); report zeros for it instead of aborting the whole loop.
    subreddit_info['avg_comment'].append(int(total_comments / num_posts) if num_posts else 0)
    subreddit_info['top_num_comment'].append(
        subreddit_posts['num_comments'].max() if num_posts else 0
    )

In [36]:
# Show the per-subreddit summary as a table, with the most-commented subreddits first.
(
    pd.DataFrame(subreddit_info)
    .sort_values(by='total_comment', ascending=False)
)

Unnamed: 0,subreddit,num_post,total_comment,avg_comment,top_num_comment
19,todayilearned,5000,194798,38,4142
42,amitheasshole,5000,158348,31,2561
35,classicwow,5000,148426,29,1585
45,twoxchromosomes,5000,126261,25,4609
8,dndnext,5000,126085,25,1272
21,space,5000,122510,24,5349
16,ffxiv,5000,103658,20,892
36,tifu,5000,99814,19,3649
30,unpopularopinion,5000,87949,17,6736
26,sysadmin,5000,87706,17,3532
