In [73]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a reddit page you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item#4, recall how to use the aggregation functions in `pandas` like count, value_counts, sum, etc. For getting the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column...)_

In [148]:
def to_utc(date):
    #This function converts an object to UTC. This is to automate the conversion 
    #of dates instead of going to https://www.unixtimeconverter.io/ 
    return int(date.replace(tzinfo=dt.timezone.utc).timestamp())
    
def to_readable_date(timestamp):
    #This function converts the UTC format to a Year-Month-Day format 
    return dt.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

#Declare start and end of reddit posts to extract 
start_date = dt.datetime.strptime("2020-08-10", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-08-15", "%Y-%m-%d")

date_range = (pd.date_range(
                start_date, 
                periods=(end_date - start_date).days + 2)
              .tolist())

#prepare the parameters needed to call the API
sort_type="score"
sort="desc"
fields=["author","subreddit","created_utc","num_comments","score","title","selftext"]
subreddit = 'ProRevenge'
url = "https://api.pushshift.io/reddit/submission/search/"
results = []
#loop through the dates 
for i, s_date in enumerate(date_range):
    #prevents us from getting an index out of range error
    if i != len(date_range)-1:
        #declare end date 
        e_date = date_range[i+1]
        #call the API
        r = requests.get(url = url, params={
            'after': to_utc(s_date),
            'before': to_utc(e_date),
            'sort_type': sort_type,
            'sort': sort,
            'subreddit': subreddit,
            'fields': fields,
            "size": 500
        })

        #add logs 
        print(f"Doing {s_date.strftime('%Y-%m-%d')} to {e_date.strftime('%Y-%m-%d')}")
        if r.status_code == 200:
            results.append(r.json()['data'])
            print("=====Done")
        else:
            print("=====Skipped")
        #so that we dont get blocked from abusing the API we call it after pausing for 1 second
        time.sleep(1)

Doing 2020-08-10 to 2020-08-11
=====Done
Doing 2020-08-11 to 2020-08-12
=====Done
Doing 2020-08-12 to 2020-08-13
=====Done
Doing 2020-08-13 to 2020-08-14
=====Done
Doing 2020-08-14 to 2020-08-15
=====Done
Doing 2020-08-15 to 2020-08-16
=====Done


In [149]:
flat_list = []
#loop through the reddit results
for sublist in results:
    #check if sublist is not empty. The reason we have empty lists is because there are days wherein there are no submissions
    if sublist is not None:
        #for each dictionary in the sublist add it to the flat list 
        for item in sublist:
            flat_list.append(item)

#pandas has a useful function called from_dict which will convert a list of dictionary objects into a dataframe
df = pd.DataFrame.from_dict(flat_list)
display(df.head())
df.to_csv("reddit_ProRevenge.csv")

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,Picachu-I-Choose-You,1597031922,21,1,"I tried to post this on Pettyrevenge, but it w...",ProRevenge,I can’t walk my dog? Then say hello to my litt...
1,Chickennuggies79,1597025467,6,1,So this happened years ago. I used to live out...,ProRevenge,Treat me like dirt
2,DSGyoyo,1597048325,2,1,[removed],ProRevenge,Repot my dad for false allegation and call me ...
3,_Book_Wurm,1597033674,6,1,"Not sure of this counts as ""pro"". You be the j...",ProRevenge,HVAC company fired me without cause. The E.P.A...
4,LisaSKadel,1597091579,19,1,Okay so I've been friends with this woman for ...,ProRevenge,Revenge on an ex friend


In [150]:
#Pandas Datafram
df

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,Picachu-I-Choose-You,1597031922,21,1,"I tried to post this on Pettyrevenge, but it w...",ProRevenge,I can’t walk my dog? Then say hello to my litt...
1,Chickennuggies79,1597025467,6,1,So this happened years ago. I used to live out...,ProRevenge,Treat me like dirt
2,DSGyoyo,1597048325,2,1,[removed],ProRevenge,Repot my dad for false allegation and call me ...
3,_Book_Wurm,1597033674,6,1,"Not sure of this counts as ""pro"". You be the j...",ProRevenge,HVAC company fired me without cause. The E.P.A...
4,LisaSKadel,1597091579,19,1,Okay so I've been friends with this woman for ...,ProRevenge,Revenge on an ex friend
...,...,...,...,...,...,...,...
91,TheMissingChapstick,1597507372,2,1,[removed],ProRevenge,Girlfriend airdrops guys nudes in to everyone ...
92,Any-League382,1597511282,2,1,[removed],ProRevenge,I pushed my ex friend who betrayed me off a sk...
93,Illustrious_Tennis_7,1597499289,2,1,[removed],ProRevenge,Tried getting revenge for rape
94,greenlady3,1597498254,2,1,[removed],ProRevenge,Made lax brownies


In [151]:
#To conver UTC to proper date formatting
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [152]:
df

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,Picachu-I-Choose-You,2020-08-10 03:58:42,21,1,"I tried to post this on Pettyrevenge, but it w...",ProRevenge,I can’t walk my dog? Then say hello to my litt...
1,Chickennuggies79,2020-08-10 02:11:07,6,1,So this happened years ago. I used to live out...,ProRevenge,Treat me like dirt
2,DSGyoyo,2020-08-10 08:32:05,2,1,[removed],ProRevenge,Repot my dad for false allegation and call me ...
3,_Book_Wurm,2020-08-10 04:27:54,6,1,"Not sure of this counts as ""pro"". You be the j...",ProRevenge,HVAC company fired me without cause. The E.P.A...
4,LisaSKadel,2020-08-10 20:32:59,19,1,Okay so I've been friends with this woman for ...,ProRevenge,Revenge on an ex friend
...,...,...,...,...,...,...,...
91,TheMissingChapstick,2020-08-15 16:02:52,2,1,[removed],ProRevenge,Girlfriend airdrops guys nudes in to everyone ...
92,Any-League382,2020-08-15 17:08:02,2,1,[removed],ProRevenge,I pushed my ex friend who betrayed me off a sk...
93,Illustrious_Tennis_7,2020-08-15 13:48:09,2,1,[removed],ProRevenge,Tried getting revenge for rape
94,greenlady3,2020-08-15 13:30:54,2,1,[removed],ProRevenge,Made lax brownies


In [153]:
# To find most submissions
df.author.mode()

0    giganticfuel
dtype: object

In [154]:
a = df.num_comments.max()
a

348

In [155]:
most_comments = df[df.num_comments == 348]
most_comments.head()
most_comments.title

7    Don't scam a Bot Developer
Name: title, dtype: object

In [157]:
s = pd.date_range('2020-08-09', '2020-08-15', freq='D').to_series()

In [158]:
s.dt.dayofweek

2020-08-09    6
2020-08-10    0
2020-08-11    1
2020-08-12    2
2020-08-13    3
2020-08-14    4
2020-08-15    5
Freq: D, dtype: int64

In [159]:
df['created_utc'] = pd.to_datetime(df['created_utc'].dt.date)

In [160]:
df

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,Picachu-I-Choose-You,2020-08-10,21,1,"I tried to post this on Pettyrevenge, but it w...",ProRevenge,I can’t walk my dog? Then say hello to my litt...
1,Chickennuggies79,2020-08-10,6,1,So this happened years ago. I used to live out...,ProRevenge,Treat me like dirt
2,DSGyoyo,2020-08-10,2,1,[removed],ProRevenge,Repot my dad for false allegation and call me ...
3,_Book_Wurm,2020-08-10,6,1,"Not sure of this counts as ""pro"". You be the j...",ProRevenge,HVAC company fired me without cause. The E.P.A...
4,LisaSKadel,2020-08-10,19,1,Okay so I've been friends with this woman for ...,ProRevenge,Revenge on an ex friend
...,...,...,...,...,...,...,...
91,TheMissingChapstick,2020-08-15,2,1,[removed],ProRevenge,Girlfriend airdrops guys nudes in to everyone ...
92,Any-League382,2020-08-15,2,1,[removed],ProRevenge,I pushed my ex friend who betrayed me off a sk...
93,Illustrious_Tennis_7,2020-08-15,2,1,[removed],ProRevenge,Tried getting revenge for rape
94,greenlady3,2020-08-15,2,1,[removed],ProRevenge,Made lax brownies


In [163]:
#For day of the week counts
a = df['created_utc'].value_counts()

In [164]:
a

2020-08-15    20
2020-08-10    20
2020-08-12    17
2020-08-13    13
2020-08-11    13
2020-08-14    13
Name: created_utc, dtype: int64

In [None]:
# How many submissions were you able to gather?
# - 117 Submissions gathered
# Who has the most submissions?
# - 2 modes are found with most submissions, each with 3 submissions.
# Which submission has the highest score?
# -Can't seem to understand why scores are registering as "1"s, tried to sort it by score, but everything is still at 1.

# Which submission has the highest number of comments?
# - Highest number of comments is the post "Don't scam a Bot developer"

# Which day of the week has the most submissions?
# -There was a tie in day of the most submissions on day of the week that are on August 15 and August 10, with reference to the day of the week being converted, it is on day 0 and 5 which is Monday and Saturday