In [1]:
import pandas as pd
import datetime as dt
import time
import requests


# Pushshift and cleaning

#### Takeaways from prelim testing:
* Can only get 100 posts per request (lit says 500)
* 77% of posts from pettyrevenge were selftext:'removed' , can filter during request
* pettyrevenge NO UNDERSCORE
* relationship_advice is way more active, make sure to sample from similar date ranges (so not all 7,000 r_a posts are from 2021 while pr crosses years - diction evolves).
    * pettyrevenge: 100 posts from 3/30-4/23 = 24 days
    * relat_adv can get 100 per day, but set day_window same as pettyrevenge
    
---

Exporting to csv (familiar) and json (not).  Jennifer recommended jq prettyprint, and it may be a useful format when checking the actual text.

---

Cleaning in this notebook:
* worked filtering of 'removed' posts into for loop
* created timestampe from utc in for loop (code from class review)
    * this project has taught me I ALWAYS type timestampe with a final E like ye olde timestampe what the heck.
* dropping 'deleted' and is_self=False
* confirming date distrib is equivalent
* pulling out 'year' from timestamp (ended up dropping later, but felt like a good idea...)

Features to retain from each post
* **selftext**  --->  actual text
* **subreddit**  --->  target
* created_utc
* is_self  ---> can filter by is_self==True during scraping but all were True during test pulls
* score (upvotes, popularity metric; not necessarily up to date)
* title
* author
* num_comments

In [1]:
# Columns kept from each scraped post (see "Features to retain from each post").
retain = [
    'selftext',
    'subreddit',
    'created_utc',
    'is_self',
    'score',
    'title',
    'author',
    'num_comments',
]

In [112]:
def pushshift_go(subreddit, day_window, n, max_posts=7000, pause=3):
    """Scrape submissions from one subreddit through the Pushshift search API.

    Sends up to ``n`` requests of 100 posts each. Request ``i`` uses
    ``after='<day_window * i>d'``, so every request starts ``day_window`` days
    further back than the previous one; overlapping results are removed with
    ``drop_duplicates`` at the end. Posts whose selftext is '[removed]' are
    excluded by the query itself.

    A failed request (network error, non-200 status, unparsable or incomplete
    payload) is reported and skipped, so one bad response does not discard the
    batches that were already collected.

    Parameters
    ----------
    subreddit : str
        Subreddit name as Pushshift expects it (e.g. 'pettyrevenge').
    day_window : int
        Number of days each successive request steps further back.
    n : int
        Maximum number of requests to send.
    max_posts : int, default 7000
        Stop requesting once more than this many rows (counted before
        de-duplication) have been collected.
    pause : float, default 3
        Seconds to sleep after every request, successful or not.

    Returns
    -------
    pandas.DataFrame
        De-duplicated posts holding the ``retain`` columns plus ``timestamp``
        (a ``datetime.date`` derived from ``created_utc``) on a fresh 0..N-1
        index. Empty, with the same columns, when no request returned posts.
    """
    base = 'https://api.pushshift.io/reddit/search/submission'

    retain = ['selftext', 'subreddit', 'created_utc', 'is_self', 'score', 'title', 'author', 'num_comments']

    posts = []

    for i in range(1, n + 1):

        params = {
            'subreddit': subreddit,
            'size': 100,
            'selftext:not': '[removed]',
            'after': '{}d'.format(day_window * i)  # after = searching forward, dupes will be deleted
        }

        # `except Exception` (not a bare `except`) keeps the scrape best-effort while
        # still letting Ctrl-C / KeyboardInterrupt stop a long run.
        try:
            # The timeout keeps one stalled connection from hanging the whole scrape.
            res = requests.get(base, params=params, timeout=30)
            if res.status_code != 200:
                print(f'Error: Status Code: {res.status_code}')
            else:
                batch = res.json()['data']
                if batch:
                    posts.append(pd.DataFrame(batch)[retain])
                else:
                    print(f"No posts returned for after={params['after']}")
        except Exception as err:
            # Report the real cause (connection error, bad JSON, missing column, ...)
            # instead of a status code that may well have been 200.
            print(f'Error: request {i} skipped: {err!r}')

        # Pause after every attempt so failed or rate-limited calls also back off.
        time.sleep(pause)

        total_scraped = sum(len(x) for x in posts)
        print(total_scraped)
        if total_scraped > max_posts:
            break

    if not posts:
        # pd.concat raises ValueError on an empty list; hand back an empty frame instead.
        print("Query Complete")
        return pd.DataFrame(columns=retain + ['timestamp'])

    alldata = pd.concat(posts, sort=False).drop_duplicates()
    # NOTE: date.fromtimestamp converts with the machine's local timezone, so the
    # calendar date can differ from the UTC date for posts made near midnight.
    alldata['timestamp'] = alldata['created_utc'].map(dt.date.fromtimestamp)

    print("Query Complete")

    return alldata.reset_index(drop=True)

In [2]:
# %%time   #45min
# pr3 = pushshift_go('pettyrevenge', 25, 70)
# NOTE(review): the scrape above is commented out, so `pr3` is not defined on a fresh
# kernel; re-enable the call (or load the frame another way) before running the cells below.

In [54]:
# Number of scraped pettyrevenge rows.
pr3.shape[0]

6592

In [74]:
# Derive the calendar year of every post from its `timestamp` date.
pr3['year'] = pd.to_datetime(pr3['timestamp']).dt.year
# Oldest posts first. len 6592, earliest 2016-07-08, latest 2021-04-23
pr3.sort_values(by='created_utc').head()

Unnamed: 0,selftext,subreddit,created_utc,is_self,score,title,author,num_comments,timestamp,year
6492,This one's short and sweet.\n\nA long time ago...,pettyrevenge,1468039320,True,1333,"Waste my time, I waste yours.",trampabroad,26,2016-07-08,2016
6493,https://imgur.com/gallery/PGl2D,pettyrevenge,1468041094,True,30,"Found this on Instagram, thought you guys migh...",Spontaneous_Mullet,13,2016-07-08,2016
6494,[deleted],pettyrevenge,1468072482,True,0,"Really? Steal *my* office? Okay, dothead....",[deleted],6,2016-07-09,2016
6495,So I am sleeping on this Saturday morning (pro...,pettyrevenge,1468080947,True,339,Happily sleeping and the phone rings,harssk,49,2016-07-09,2016
6496,[deleted],pettyrevenge,1468087635,True,131,Bad driver gets what's coming,[deleted],19,2016-07-09,2016


In [77]:
# Share of posts per year.
pr3['year'].value_counts(normalize=True)

2020    0.222694
2019    0.222239
2017    0.220874
2018    0.185983
2016    0.086924
2021    0.061286
Name: year, dtype: float64

In [120]:
# How many rows have is_self True vs False before filtering.
pr3['is_self'].value_counts()

True     6590
False       2
Name: is_self, dtype: int64

#### Remove 'is_self' == False (2), and any selftexts that were 'deleted' (~300)

In [121]:
# Keep only the rows where is_self is True.
pr3 = pr3.loc[pr3['is_self'].eq(True)]

In [122]:
# Confirm that only is_self == True rows remain.
pr3['is_self'].value_counts()

True    6590
Name: is_self, dtype: int64

In [135]:
# Drop rows whose selftext is exactly '[deleted]'.
pr3 = pr3.loc[pr3['selftext'].ne('[deleted]')]

In [136]:
# Write the cleaned pettyrevenge frame to CSV without the index column.
pr3.to_csv(path_or_buf='../data/pettyrev.csv', index=False)

In [137]:
# Same frame as JSON, one object per post.
pr3.to_json(path_or_buf='../data/json_pr.json', orient='records')

## On to Relationship_advice

In [3]:
# %%time   # 7min
# ra1 = pushshift_go('relationship_advice', 25, 70)
# NOTE(review): the scrape above is commented out, so `ra1` is not defined on a fresh
# kernel; re-enable the call (or load the frame another way) before running the cells below.

In [118]:
# Add the calendar year of every post, then report the row count and preview the frame.
ra1['year'] = pd.to_datetime(ra1['timestamp']).dt.year
print(ra1.shape[0])
ra1.head()

7000


Unnamed: 0,selftext,subreddit,created_utc,is_self,score,title,author,num_comments,timestamp,year
0,We’re not like publicly dating but we get down...,relationship_advice,1617083162,True,1,Desperately need help with my stepsister,BananaForScale69420,7,2021-03-29,2021
1,How should I tell my partner who plays games o...,relationship_advice,1617083390,True,1,How should I?,user_9012021,3,2021-03-29,2021
2,I found out on Saturday that my grandfather ha...,relationship_advice,1617083473,True,1,I don’t want to go into work.,pastabake101,10,2021-03-29,2021
3,So my girlfriends friend told me that i remind...,relationship_advice,1617083711,True,1,Advice please,RevolutionaryAD21,5,2021-03-29,2021
4,ThrowRA\nMy boyfriend(17M) and I(17F) had a ta...,relationship_advice,1617083749,True,1,How do I remember changes I want to make?,Bubbly-Appointment74,7,2021-03-29,2021


In [117]:
# Number of posts per year.
ra1.value_counts(subset='year')

year
2020    1500
2018    1500
2019    1400
2017    1400
2016     800
2021     400
dtype: int64

In [119]:
# How many rows have is_self True vs False before filtering.
ra1['is_self'].value_counts()

True     6980
False      20
Name: is_self, dtype: int64

#### Remove is_self==False (20), and any 'deleted' posts (833)

In [123]:
# Keep only the rows where is_self is True.
ra1 = ra1.loc[ra1['is_self'].eq(True)]

In [124]:
# Confirm that only is_self == True rows remain.
ra1['is_self'].value_counts()

True    6980
Name: is_self, dtype: int64

In [140]:
# Drop rows whose selftext is exactly '[deleted]'.
ra1 = ra1.loc[ra1['selftext'].ne('[deleted]')]

In [145]:
# (rows, columns) left after dropping is_self == False and '[deleted]' rows.
ra1.shape
# ra1.loc[ra1.selftext=='[deleted]']  #confirmed

(6147, 10)

In [146]:
# Write the cleaned relationship_advice frame to CSV without the index column.
ra1.to_csv(path_or_buf='../data/relatadv.csv', index=False)

In [147]:
# Same frame as JSON, one object per post.
ra1.to_json(path_or_buf='../data/json_ra.json', orient='records')