In [1]:
import pandas as pd
import requests 
import time
import datetime as dt

In [2]:
# Define fuction to pull subreddit submissions and pull fields from 'subfield'
def query_pushshift(subreddit, kind='submission', skip=30, times=10, 
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):

    # Create stem of PushShift API URL + kind and subreddit name
    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    mylist = []
    
    # Create for loop from 1 to times (specified in )
    for x in range(1, times):
        
        URL = "{}&after={}d".format(stem, skip * x)
        print(URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(2)
    # Compile all posts into full list    
    full = pd.concat(mylist, sort=False)
    
    if kind == "submission":
        
        full = full[subfield]
        
        full = full.drop_duplicates()
        
        full = full.loc[full['is_self'] == True]
    # Transform UTC into date    
    def get_date(created):
        return dt.date.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp'] = _timestamp

    print(full.shape)
    
    return full 

In [3]:
socialsm = query_pushshift('Socialism', skip = 60, times = 30)
socialsm.shape

https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=360d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=420d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=480d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=540d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=600d
https://api.pushshift.io/reddit/search/submission/?subreddit=Socialism&size=500&after=660d


(3912, 9)

In [4]:
communism = query_pushshift('Communism', skip = 30, times = 20)
communism.shape

https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=150d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=210d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=270d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=Communism&size=500&after=330d
ht

(3602, 9)

In [5]:
# Concat seperate df pulls
communism_socialism_combined = pd.concat([communism, socialsm])
# Display proportions
communism_socialism_combined['subreddit'].value_counts(normalize = True)

socialism    0.520628
communism    0.479372
Name: subreddit, dtype: float64

In [6]:
communism_socialism_combined.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
1,Why do communists hate the government?,,communism,1543448391,Akimbo24,1,1,True,2018-11-28
2,Survival advice,Maybe if yall got rid of that yee yee ass ideo...,communism,1543449286,authisthicc,0,1,True,2018-11-28
5,Does communist country music exist?,,communism,1543451424,zzzergling,6,1,True,2018-11-28
8,Chinese voting,"To preface, I don't really hold any strong opi...",communism,1543455393,TheBeninator,1,1,True,2018-11-28
11,What's up with Vietnam and Laos?,"We hear so much talk about China, and rightful...",communism,1543462674,DoctorWasdarb,20,1,True,2018-11-28


In [7]:
# Save combined dataframes to csv
communism_socialism_combined.to_csv('./communism_socialism.csv')