In [1]:
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from praw.models import Submission
from prawcore.exceptions import Forbidden,NotFound
#from psaw import PushshiftAPI
from pmaw import PushshiftAPI
import csv
import os
from urllib.error import HTTPError
import glob
import requests
import json
from json import JSONDecodeError
import datetime
import pickle
from collections import defaultdict
import datetime as dt

In [2]:
import re

_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def strip_whitespace(text):
    return _RE_COMBINE_WHITESPACE.sub(" ", text).strip()

In [3]:
with open('../keywords_high_precision.txt','r') as f:
    KEYWORDS_HI_PREC = f.read().splitlines()

KEYWORDS_SHORT = set(["climate change","global warming","carbon","co2","methane",
                  "green","environment","fossil fuel"])

In [4]:
KEYWORDS_HI_PREC

['global warming',
 'climate change',
 'carbon dioxide',
 'co2',
 'methane',
 'fossil fuel',
 'climate crisis',
 'climate emergency',
 'extreme weather',
 'clean energy',
 'renewable energy',
 'cap and trade',
 'sea level rise',
 'IPCC',
 'deforestation',
 'permafrost',
 'greenhouse gas',
 'greenhouse effect',
 'green new deal',
 'environmentalism',
 'EPA']

There are two ways of getting Reddit data illustrated in this notebook:
* 1. [PRAW (Python Reddit API Wrapper)](#https://praw.readthedocs.io/en/latest/)
* 2. [Pushshift API](#https://pushshift.io/api-parameters/)
    
The main advantage of Pushshift is that it accesses data from an archive, so even posts from currently banned subreddits (e.g., r/The_Donald) are accessible.

# Using PRAW

You need to first create a reddit instance to use the PRAW API. Fill in the 3 fields (left blank) according to the instructions [here](#https://praw.readthedocs.io/en/latest/getting_started/authentication.html). Note: you will need to have a Reddit account and to register a developer app [here](#https://www.reddit.com/prefs/apps/).

In [27]:
reddit = praw.Reddit(client_id='ACEORGSlEeJyQhPyNRAUiA',
                     client_secret='xGN33fztXu_4jiOek_RHUtMGhHXAcQ',
                     user_agent='me',
                    password='redditC0bintr@sena',
                    username='Western-Wishbone573')

In [28]:
print(reddit.user.me())

Western-Wishbone573


In [35]:
# Replace with the subreddits you're interested in
non_niche_subs = set(['AskReddit','environment','politics','worldnews','climateskeptics',
                     'Showerthoughts','climate','askscience','The_Donald','science','EcoInternet',
                     'collapse','explainlikeimfive','conspiracy','NoStupidQuestions','australia',
                     'unpopularopinion','climatechange','news','energy','canada','Conservative',
                     'skeptic','todayilearned','shittyaskscience','ChapoTrapHouse','CanadaPolitics',
                     'EverythingScience','worldpolitics','europe','AskScienceDiscussion',
                     'ClimateOffensive','changemyview','ClimateActionPlan','AskTrumpSupporters',
                     'GlobalWarming','GlobalClimateChange','esist','Green'])

In [16]:
def get_praw_submissions(reddit_instance,subreddit_str):
    
    if not os.path.exists('praw_output'):
        os.mkdir('praw_output')
    
    subreddit = reddit_instance.subreddit(subreddit_str)
    print('Getting submissions and comments from: {} ...'.format(subreddit.display_name))  
    try:
        title = subreddit.title
        desc = subreddit.description

        if not os.path.exists('subreddits.tsv'):
            with open('subreddits.tsv','w') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])
        else:
            with open('subreddits.tsv','a') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])

        # Write header
        with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['title','author','date','is_video','id','num_downs','num_ups','upvote_ratio',
                               'num_comments','score','text','subreddit'])

        # Write body
        for submission in subreddit.new(limit=None):
            sub_title = submission.title
            sub_author = submission.author.name if submission.author is not None else -1
            sub_date = submission.created
            sub_is_vid = submission.is_video
            sub_id = submission.id
            sub_downvotes = submission.downs
            sub_upvotes = submission.ups
            sub_upvote_ratio = submission.upvote_ratio
            sub_num_comments = submission.num_comments
            sub_score = submission.score
            sub_text = submission.selftext.strip().replace('\t','').replace('\n','')
            sub_subreddit = submission.subreddit.display_name
            with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_title,sub_author,sub_date,sub_is_vid,sub_id,sub_downvotes,
                                   sub_upvotes,sub_upvote_ratio,sub_num_comments,sub_score,sub_text,sub_subreddit])
        
        print('Wrote output to:', os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)))
        
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')
            
def get_praw_submission_comments(reddit_instance,subreddit,submission_id):
    
    submission = Submission(reddit_instance,id=submission_id)
    
    try:
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        # Write header
        with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['submission_id','author','text','date','id','controversiality','num_downs','num_ups',
                               'num_likes','score','subreddit'])

        # Write body
        for comment in all_comments:
            sub_id = comment._submission.id
            assert sub_id == submission_id
            author_name = comment.author.name if comment.author is not None else -1
            comment_body = comment.body.strip().replace('\t','').replace('\n','')
            date_created = comment.created
            comment_id = comment.id
            controversiality = comment.controversiality
            num_downs = comment.downs
            num_ups = comment.ups
            num_likes = comment.likes
            score = comment.score
            subreddit_name = comment.subreddit.display_name
            #print(subreddit_name,subreddit)
            assert subreddit_name == subreddit
            
            with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_id,author_name,comment_body,date_created,comment_id,controversiality,
                                   num_downs,num_ups,num_likes,score,subreddit_name])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')

## Get a post by its post id

In [108]:
p_id = '2yrvob'
post = reddit.submission(id=p_id)

In [147]:
#[x.split('_')[-1] for x in post.__dict__['_comments_by_id'].keys()]

In [7]:
post.url

'http://www.sciencedaily.com/releases/2010/01/100114081543.htm'

In [8]:
post.is_self

False

In [9]:
post.selftext

'[deleted]'

## Get posts from a subreddit (e.g. r/spambotwatch)

In [17]:
get_praw_submissions(reddit,'spambotwatch')

Getting submissions and comments from: spambotwatch ...
Wrote output to: praw_output/spambotwatch.tsv


In [15]:
spambotwatch_df = pd.read_csv('praw_output/spambotwatch.tsv',sep='\t',header=0)
spambotwatch_df.head()

Unnamed: 0,title,author,date,is_video,id,num_downs,num_ups,upvote_ratio,num_comments,score,text,subreddit
0,JEEVAN BOBY (u/jeevanbobyvallickad) - Reddit,TheGeorge,1566348000.0,False,ct1svp,0,1,1.0,1,1,,spambotwatch
1,overview for funnynova,BuckRowdy,1555999000.0,False,bg7w8w,0,1,1.0,0,1,,spambotwatch
2,Spams Dating Tips Websites all with Stolen Con...,TheGeorge,1554842000.0,False,bb7e57,0,1,1.0,0,1,,spambotwatch
3,overview for poopcake5,ActionScripter9109,1501372000.0,False,6qbmgl,0,1,1.0,1,1,,spambotwatch
4,overview for sutei_m,ActionScripter9109,1501372000.0,False,6qbmar,0,1,1.0,1,1,,spambotwatch


## Get posts from list of subreddits

In [None]:
for sub in non_niche_subs:
    get_submissions(reddit,sub)

## Inspect output: tsv of subreddits and meta

In [86]:
pd.read_csv('subreddits.tsv',sep='\t',header=None).drop_duplicates(0,keep='first')

Unnamed: 0,0,1,2
0,climateskeptics,Climate Skeptics: Trying to see through the al...,"Seeing past hyperbole, alarmism and environmen..."
1,skeptic,skeptic,## [Click this link to Read the Rules](http://...
2,climatechange,A place for a rational discussion on a divisiv...,This is a place for the rational discussion of...
3,climate,Information about the world's climate,Real and accurate data about the Earth's clima...
4,science,Reddit Science,# [Submission Rules](https://www.reddit.com/r/...
...,...,...,...
63,republicans,Republicans - RNC - GOP: Grand Old Party,"Republican, RNC and GOP news, issues, gossip, ..."
64,askaconservative,Ask A Conservative: Ask Conservatives And Repu...,#[Ask a Conservative](/r/askaconservative)\n\n...
65,Conservative,Conservative,#####\n**[Join us on discord.](https://discord...
66,conservatives,conservatives,"Conservatism (from, conservare, ""to preserve"")..."


## Inspect output: tsv of one subreddit's posts

In [88]:
df = pd.read_csv('praw_output/350.tsv',sep='\t',header=0)

In [89]:
df.columns

Index(['title', 'author', 'date', 'is_video', 'id', 'num_downs', 'num_ups',
       'upvote_ratio', 'num_comments', 'score', 'text', 'subreddit'],
      dtype='object')

In [90]:
df.is_video.value_counts()

False    127
Name: is_video, dtype: int64

In [91]:
df.subreddit.value_counts()

350    127
Name: subreddit, dtype: int64

In [160]:
#df.head(15)

## Get comments for all posts with non-zero num comments

In [156]:
for subreddit_tsv in glob.glob('praw_output/posts/*.tsv'):
    if os.path.exists('praw_output/post_comments/{}_COMMENTS.tsv'.format(subreddit_tsv.split('/')[-1][:-4])):
        print('Already got comments for subreddit {}'.format(subreddit_tsv))
    else:
        subreddit_posts = pd.read_csv(subreddit_tsv,sep='\t',header=0)
        if len(subreddit_posts) > 0:
            subreddit = str(subreddit_posts.iloc[0]['subreddit'])
            posts_with_comments = subreddit_posts.loc[subreddit_posts.num_comments > 0]
            if len(posts_with_comments) > 0:
                print('Getting comments from posts in subreddit: {}'.format(subreddit))
                for ix,row in posts_with_comments.iterrows():
                    get_submission_comments(reddit,subreddit,row['id'])
            else:
                print('0 comments among all posts in subreddit: {}'.format(subreddit))
        else:
            print('Subreddit {} has no posts'.format(subreddit_tsv))

Already got comments for subreddit praw_output/posts/350.tsv
Already got comments for subreddit praw_output/posts/350ppm.tsv
Already got comments for subreddit praw_output/posts/askaconservative.tsv
Already got comments for subreddit praw_output/posts/AskTrumpSupporters.tsv
Already got comments for subreddit praw_output/posts/carboncapture.tsv
Already got comments for subreddit praw_output/posts/carbontax.tsv
Already got comments for subreddit praw_output/posts/ccfunding.tsv
Already got comments for subreddit praw_output/posts/climate.tsv
Already got comments for subreddit praw_output/posts/climate_activism.tsv
Already got comments for subreddit praw_output/posts/climate_discussion.tsv
Already got comments for subreddit praw_output/posts/climate_science.tsv
Already got comments for subreddit praw_output/posts/ClimateActionPlan.tsv
Already got comments for subreddit praw_output/posts/climatechange.tsv
Already got comments for subreddit praw_output/posts/ClimateChangeCancer.tsv
Already g

# Use Pushshift API

The Pushshift API uses the requests library to make requests to 3 possible endpoints:
    
* /reddit/comment/search (corresponding to a comment)
* /reddit/submission/search (corresponding to a post)
* /reddit/subreddit/search (corresponding to a subreddit)

The script below submits a request for a given query (i.e. keyword or set of keywords contained), a before and after date, and a given datatype (one of the 3 endpoints). 

In [5]:
api = PushshiftAPI()

def getPushshiftData(query, after_str, before_str, datatype):
    before_date = datetime.datetime.strptime(before_str, "%m-%d-%Y")
    after_date = datetime.datetime.strptime(after_str, "%m-%d-%Y")
    before_timestamp = int(datetime.datetime.timestamp(before_date))
    after_timestamp = int(datetime.datetime.timestamp(after_date))
    
    query_prefix = 'title' if datatype == 'submission' else 'q'
    url = 'https://api.pushshift.io/reddit/search/'+datatype+'/?'+\
            query_prefix+'='+str(query)+'&size=1000&after='+str(after_date)+\
            '&before='+str(before_date)
    #print(url)
    r = requests.get(url)
    data = json.loads(r.text)
    
    return data['data']

def getPushshiftDataForSub(subreddit, query, y2, m2, d2, y1, m1, d1, datatype, limit=10000):
    
    before = int(dt.datetime(y1,m1,d1,0,0).timestamp())
    after = int(dt.datetime(y2,m2,d2,0,0).timestamp())
    
    if datatype=='submissions':
        out = api.search_submissions(subreddit=subreddit, q=query, limit=limit, 
                             before=before, after=after)
    else:
        out = api.search_comments(subreddit=subreddit, q=query, limit=limit, 
                             before=before, after=after)
        
    return out

E.g., to get all posts containing 'climate change' between Jan. 1, 2020 and Feb. 1, 2020, we run:

In [211]:
comments = getPushshiftDataForSub('changemyview',None,2014,1,1,2014,12,31,
                                  'comments')

Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Total:: Success Rate: 100.00% - Requests: 101 - Batches: 11 - Items Remaining: 0


In [212]:
comments_df = pd.DataFrame(comments)
# preview the comments data
comments_df.head(5)

Unnamed: 0,author,author_flair_css_class,author_flair_text,body,controversiality,created_utc,distinguished,gilded,id,link_id,...,reply_delay,retrieved_on,score,score_hidden,subreddit,subreddit_id,author_created_utc,author_fullname,user_removed,edited
0,[deleted],,,Why are your stance that women can't enjoy dar...,0,1397998076,,0,cgx96ag,t3_23erso,...,119858,1433438423,0,False,changemyview,t5_2w2s8,,,,
1,UncharminglyWitty,,,What about a kid walking home from school?,0,1397997927,,0,cgx957p,t3_23hgnt,...,22288,1433438410,2,False,changemyview,t5_2w2s8,1366138000.0,t2_bc9qn,,
2,garnteller,,108Δ,A few thoughts:\n\n1. **Ownership of a sub** F...,0,1397997921,,0,cgx956g,t3_23i0xv,...,13103,1433438409,2,False,changemyview,t5_2w2s8,1386191000.0,t2_e7e7z,,
3,theivesinthenight,,,Yes they do have to change their actions. You ...,0,1397997882,,0,cgx94vw,t3_23ev1v,...,919,1433438404,1,False,changemyview,t5_2w2s8,,,,
4,-moose-,,,http://www.reddit.com/r/moosearchive/comments/...,0,1397997836,,0,cgx94jp,t3_23dc1g,...,132438,1433438401,1,False,changemyview,t5_2w2s8,1369286000.0,t2_brzdj,,


In [None]:
#comments_df['selftext'].values

In [8]:
KEYWORDS_HI_PREC

['global warming',
 'climate change',
 'carbon dioxide',
 'co2',
 'methane',
 'fossil fuel',
 'climate crisis',
 'climate emergency',
 'extreme weather',
 'clean energy',
 'renewable energy',
 'cap and trade',
 'sea level rise',
 'IPCC',
 'deforestation',
 'permafrost',
 'greenhouse gas',
 'greenhouse effect',
 'green new deal',
 'environmentalism',
 'EPA']

In [None]:
# Collect all posts w/ climate change keyword from CMV  

# for start_year in range(2013,2022,1):
#     for keyword in KEYWORDS_HI_PREC:
#         save_dir = os.path.join('pmaw_output','submissions','changemyview',
#                                 '1-1-{}_to_12-31-{}'.format(start_year,start_year))
#         if not os.path.exists(save_dir):
#             os.makedirs(save_dir)
#         save_path = os.path.join(save_dir,'{}.csv'.format(keyword))
#         if not os.path.exists(save_path):
#             print("Missing {}, {}".format(keyword,start_year))
#             posts = getPushshiftDataForSub('changemyview',keyword,
#                                            start_year,1,1,start_year,12,31,
#                                            'submissions')
#             posts_df = pd.DataFrame(posts)
#             posts_df.to_csv(save_path)

In [None]:
# Collect all posts from CMV as background--comments seem incomplete

for start_year in range(2013,2022,1):
    #for keyword in KEYWORDS_HI_PREC:
        #save_dir = os.path.join('pmaw_output','submissions','changemyview_background',
        #                        '1-1-{}_to_12-31-{}'.format(start_year,start_year))
        #if not os.path.exists(save_dir):
        #    os.makedirs(save_dir)
    save_path = os.path.join('pmaw_output','posts','changemyview_background',
                                '1-1-{}_to_12-31-{}.csv'.format(start_year,start_year))
    if not os.path.exists(save_path):
        print("Missing background posts for {}".format(start_year))
        posts = getPushshiftDataForSub('changemyview',None,
                                       start_year,1,1,start_year,12,31,
                                       'ps')
        posts_df = pd.DataFrame(posts)
        posts_df.to_csv(save_path)

In [217]:
# Collect all comments from CMV as background

for start_year in range(2013,2022,1):
    #for keyword in KEYWORDS_HI_PREC:
        #save_dir = os.path.join('pmaw_output','submissions','changemyview_background',
        #                        '1-1-{}_to_12-31-{}'.format(start_year,start_year))
        #if not os.path.exists(save_dir):
        #    os.makedirs(save_dir)
    save_path = os.path.join('pmaw_output','post_comments','changemyview_background',
                                '1-1-{}_to_12-31-{}.csv'.format(start_year,start_year))
    if not os.path.exists(save_path):
        print("Missing background posts for {}".format(start_year))
        posts = getPushshiftDataForSub('changemyview',None,
                                       start_year,1,1,start_year,12,31,
                                       'comments')
        posts_df = pd.DataFrame(posts)
        posts_df.to_csv(save_path)
        
# Append individual dataframes into one large one to share

df = pd.DataFrame(columns=list(COLUMNS)+['keyword'])

for start_year in range(2013,2022,1):
    save_path = os.path.join('pmaw_output','post_comments','changemyview_background',
                                '1-1-{}_to_12-31-{}.csv'.format(start_year,start_year))
    df_ = pd.read_csv(save_path,index_col=0)
    if len(df_) > 0:
        missing_cols = set(df.columns).difference(set(df_.columns))
        for missing_col in missing_cols:
            df_[missing_col] = [None]*len(df_)
        df = pd.concat([df,df_],ignore_index=True,axis=0)
        
# Deduplicate by ID 
print('Size of df, pre-deduplication:',len(df))
df.drop_duplicates(subset='id',inplace=True)
print('Size of df, post-deduplication:',len(df))

# Annotate with whether comment awarded delta or not (if ∆ is present in `author_flair_text`)
df['changed_view'] = df['author_flair_text'].apply(
                            lambda x: '∆' in x if type(x) == str else False
)
print(df['changed_view'].value_counts())

# UPDATE TO CURRENT DATE
from datetime import date
today = date.today()
str_today = today.strftime("%m-%d-%Y")
print('\nSaving deduplicated df of posts to: pmaw_output/post_comments/changemyview_background/\
        1-1-2010_to_{}.csv'.format(str_today))

df.to_csv('pmaw_output/post_comments/changemyview_background/1-1-2010_to_{}.csv'.format(str_today))

Size of df, pre-deduplication: 90016
Size of df, post-deduplication: 90001
False    52075
True     37926
Name: changed_view, dtype: int64

Saving deduplicated df of posts to: pmaw_output/post_comments/changemyview_background/        1-1-2010_to_09-24-2021.csv


In [215]:
df = pd.read_csv(
    'pmaw_output/post_comments/changemyview_background/1-1-2015_to_12-31-2015.csv',
index_col=0)
COLUMNS = df.columns
COLUMNS

Index(['author', 'author_created_utc', 'author_flair_css_class',
       'author_flair_text', 'author_fullname', 'body', 'controversiality',
       'created_utc', 'distinguished', 'gilded', 'id', 'link_id', 'nest_level',
       'parent_id', 'reply_delay', 'retrieved_on', 'score', 'score_hidden',
       'subreddit', 'subreddit_id', 'edited', 'user_removed', 'mod_removed',
       'stickied'],
      dtype='object')

In [17]:
# Append individual dataframes into one large one to share

df = pd.DataFrame(columns=list(COLUMNS)+['keyword'])

for start_year in range(2013,2022,1):
    for keyword in KEYWORDS_HI_PREC:
        save_dir = os.path.join('pmaw_output','submissions','changemyview',
                                '1-1-{}_to_12-31-{}'.format(start_year,start_year))
        save_path = os.path.join(save_dir,'{}.csv'.format(keyword))
        df_ = pd.read_csv(save_path,index_col=0)
        if len(df_) > 0:
            df_['keyword'] = [keyword]*len(df_)
            missing_cols = set(df.columns).difference(set(df_.columns))
            for missing_col in missing_cols:
                df_[missing_col] = [None]*len(df_)
            df = pd.concat([df,df_],ignore_index=True,axis=0)
        
# Deduplicate by ID 
print('Size of df, pre-deduplication:',len(df))
df.drop_duplicates(subset='id',inplace=True)
print('Size of df, post-deduplication:',len(df))

# Annotate with whether author awarded delta or not (if ∆ is present in `author_flair_text`)
df['delta_history'] = df['author_flair_text'].apply(
                            lambda x: '∆' in x if type(x) == str else False
)
print(df['delta_history'].value_counts())

# UPDATE TO CURRENT DATE
from datetime import date
today = date.today()
str_today = today.strftime("%m-%d-%Y")
print('\nSaving deduplicated df of posts to: pmaw_output/submissions/changemyview/1-1-2010_to_{}.csv'.format(str_today))

df.to_csv('pmaw_output/submissions/changemyview/1-1-2010_to_{}.csv'.format(str_today))

False    2784
True      392
Name: delta_history, dtype: int64

Saving deduplicated df of posts to: pmaw_output/submissions/changemyview/1-1-2010_to_09-22-2021.csv


In [293]:
# # Inspect whether some keywords are low precision
# for keyword in KEYWORDS_LONG:
#     print(keyword)
#     print(df.loc[df['keyword']==keyword]['url'].values[:10])
#     print('=='*10+'\n')

In [18]:
print(df['delta_history'].value_counts())

False    2784
True      392
Name: delta_history, dtype: int64


In [20]:
df.loc[df['delta_history']]

Unnamed: 0,author,author_created_utc,author_flair_css_class,author_flair_text,author_fullname,created_utc,domain,edited,full_link,gilded,...,steward_reports,og_description,og_title,removed_by_category,removed_by,media_metadata,is_created_from_ads_ui,author_is_blocked,awarded_delta,delta_history
22,patval,1.30124e+09,points,6∆,t2_50pzq,1404878570,self.changemyview,,https://www.reddit.com/r/changemyview/comments...,0,...,,,,,,,,,True,True
24,ianw19,1.36656e+09,points,1∆,t2_bei5p,1404765167,self.changemyview,1.40656e+09,https://www.reddit.com/r/changemyview/comments...,0,...,,,,,,,,,True,True
25,TEmpTom,1.32434e+09,points,3∆,t2_6h1tr,1404185778,self.changemyview,1.40419e+09,https://www.reddit.com/r/changemyview/comments...,0,...,,,,,,,,,True,True
27,Abstract_Atheist,1.3975e+09,points,1∆,t2_g4ixf,1403979277,self.changemyview,,https://www.reddit.com/r/changemyview/comments...,0,...,,,,,,,,,True,True
28,durutticolumn,1.39077e+09,points,7∆,t2_ezn09,1403215525,self.changemyview,1.40322e+09,https://www.reddit.com/r/changemyview/comments...,0,...,,,,,,,,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4429,-SeeMeNoMore-,,,3∆,t2_5oiov2cv,1621084882,self.changemyview,,https://www.reddit.com/r/changemyview/comments...,,...,,,,,,,,,True,True
4457,Recognizant,,,11∆,t2_cyo9v,1631375392,self.changemyview,,https://www.reddit.com/r/changemyview/comments...,,...,,,,,,,False,False,True,True
4464,TheFakeChiefKeef,,,67∆,t2_3zbj5lrr,1612146828,self.changemyview,1.61215e+09,https://www.reddit.com/r/changemyview/comments...,,...,,,,,,,,,True,True
4509,Polar_Roid,,,5∆,t2_6nje1aoq,1620689019,self.changemyview,,https://www.reddit.com/r/changemyview/comments...,,...,,,,,,,,,True,True


In [76]:
# # Check API rate limiting
# import datetime
# start = datetime.datetime.now()
# for req_no in range(0,400):
#     post = reddit.submission(id=df['id'].values[req_no])
#     c_ids = [c for c in post.comments]
#     print('Post comments:',c_ids)
#     elapsed = (datetime.datetime.now()-start).total_seconds() / 60
#     print('Elapsed minutes:', elapsed)

In [206]:
# Get all comments on posts within big posts df

# define FIELDS; initialize comments dataframe
# FIELDS = set(['author_flair_text',
#               'treatment_tags',
#               'collapsed',
#               'subreddit_name_prefixed',
#               'controversiality',
#               'collapsed_because_crowd_control',
#               'mod_reports',
#               'subreddit_type',
#               'ups',
#              '_replies',
#              'id',
#              'total_awards_received',
#              'approved_at_utc',
#              'author_is_blocked',
#              'comment_type',
#              'edited',
#              'mod_reason_by',
#              'banned_by',
#              'author_flair_type',
#              'removal_reason',
#              'link_id',
#              'likes',
#              'author_fullname',
#              'banned_at_utc',
#              'mod_reason_title',
#              'gilded',
#              'archived',
#              'collapsed_reason_code',
#              'no_follow',
#              'can_mod_post',
#              'created_utc',
#              'send_replies',
#              'parent_id',
#              'score',
#              'approved_by',
#              'author_premium',
#              'mod_note',
#              'all_awardings',
#              'subreddit_id',
#              'body',
#              'awarders',
#               'user_reports',
#               'name',
#               'downs',
#               'author_flair_richtext',
#               'is_submitter',
#               'collapsed_reason',
#               'distinguished',
#               'associated_award',
#               'stickied',
#               'can_gild',
#               'top_awarded_type',
#               'score_hidden',
#               'permalink',
#               'num_reports',
#               'locked',
#               'report_reasons',
#               'created'
#              ])
# with open('reddit_comment_fields.txt','w') as f:
#     for field in FIELDS:
#         f.write(field+'\n')
# comments_dict = defaultdict(list)

# for n,p_id in enumerate(df['id'].values[2476:]):
#     #print(p_id)
#     post = reddit.submission(id=p_id)
#     c_ids = [c.id for c in post.comments]
#     #print('Post comments:',c_ids)
#     for c_id in c_ids:
#         comment = reddit.comment(c_id)
#         try:
#             comments_dict['author_flair_text'].append(comment.author_flair_text)
#         except AttributeError:
#             comments_dict['author_flair_text'].append(None)
#         try:
#             comments_dict['treatment_tags'].append(comment.treatment_tags)
#         except AttributeError:
#             comments_dict['treatment_tags'].append(None)
#         try:
#             comments_dict['collapsed'].append(comment.collapsed)
#         except AttributeError:
#             comments_dict['collapsed'].append(None)
#         try:
#             comments_dict['subreddit_name_prefixed'].append(comment.subreddit_name_prefixed)
#         except AttributeError:
#             comments_dict['subreddit_name_prefixed'].append(None)
#         try:
#             comments_dict['controversiality'].append(comment.controversiality)
#         except AttributeError:
#             comments_dict['controversiality'].append(None)
#         try:
#             comments_dict['collapsed_because_crowd_control'].append(comment.collapsed_because_crowd_control)
#         except AttributeError:
#             comments_dict['collapsed_because_crowd_control'].append(None)
#         try:
#             comments_dict['mod_reports'].append(comment.mod_reports)
#         except AttributeError:
#             comments_dict['mod_reports'].append(None)
#         try:
#             comments_dict['subreddit_type'].append(comment.subreddit_type)
#         except AttributeError:
#             comments_dict['subreddit_type'].append(None)
#         try:
#             comments_dict['ups'].append(comment.ups)
#         except AttributeError:
#             comments_dict['ups'].append(None)
#         try:
#             comments_dict['_replies'].append(comment._replies)
#         except AttributeError:
#             comments_dict['_replies'].append(None)
#         try:
#             comments_dict['id'].append(comment.id)
#         except AttributeError:
#             comments_dict['id'].append(None)
#         try:
#             comments_dict['total_awards_received'].append(comment.total_awards_received)
#         except AttributeError:
#             comments_dict['total_awards_received'].append(None)
#         try:
#             comments_dict['approved_at_utc'].append(comment.approved_at_utc)
#         except AttributeError:
#             comments_dict['approved_at_utc'].append(None)
#         try:
#             comments_dict['author_is_blocked'].append(comment.author_is_blocked)
#         except AttributeError:
#             comments_dict['author_is_blocked'].append(None)
#         try:
#             comments_dict['comment_type'].append(comment.comment_type)
#         except AttributeError:
#             comments_dict['comment_type'].append(None)
#         try:
#             comments_dict['edited'].append(comment.edited)
#         except AttributeError:
#             comments_dict['edited'].append(None)
#         try:
#             comments_dict['mod_reason_by'].append(comment.mod_reason_by)
#         except AttributeError:
#             comments_dict['mod_reason_by'].append(None)
#         try:
#             comments_dict['banned_by'].append(comment.banned_by)
#         except AttributeError:
#             comments_dict['banned_by'].append(None)
#         try:
#             comments_dict['author_flair_type'].append(comment.author_flair_type)
#         except AttributeError:
#             comments_dict['author_flair_type'].append(None)
#         try:
#             comments_dict['removal_reason'].append(comment.removal_reason)
#         except AttributeError:
#             comments_dict['removal_reason'].append(None)
#         try:
#             comments_dict['link_id'].append(comment.link_id)
#         except AttributeError:
#             comments_dict['link_id'].append(None)
#         try:
#             comments_dict['likes'].append(comment.likes)
#         except AttributeError:
#             comments_dict['likes'].append(None)
#         try:
#             comments_dict['author_fullname'].append(comment.author_fullname)
#         except AttributeError:
#             comments_dict['author_fullname'].append(None)
#         try:
#             comments_dict['banned_at_utc'].append(comment.banned_at_utc)
#         except AttributeError:
#             comments_dict['banned_at_utc'].append(None)
#         try:
#             comments_dict['mod_reason_title'].append(comment.mod_reason_title)
#         except AttributeError:
#             comments_dict['mod_reason_title'].append(None)
#         try:
#             comments_dict['gilded'].append(comment.gilded)
#         except AttributeError:
#             comments_dict['gilded'].append(None)
#         try:
#             comments_dict['archived'].append(comment.archived)
#         except AttributeError:
#             comments_dict['archived'].append(None)
#         try:
#             comments_dict['collapsed_reason_code'].append(comment.collapsed_reason_code)
#         except AttributeError:
#             comments_dict['collapsed_reason_code'].append(None)
#         try:
#             comments_dict['no_follow'].append(comment.no_follow)
#         except AttributeError:
#             comments_dict['no_follow'].append(None)
#         try:
#             comments_dict['can_mod_post'].append(comment.can_mod_post)
#         except AttributeError:
#             comments_dict['can_mod_post'].append(None)
#         try:
#             comments_dict['created_utc'].append(comment.created_utc)
#         except AttributeError:
#             comments_dict['created_utc'].append(None)
#         try:
#             comments_dict['send_replies'].append(comment.send_replies)
#         except AttributeError:
#             comments_dict['send_replies'].append(None)
#         try:
#             comments_dict['parent_id'].append(comment.parent_id)
#         except AttributeError:
#             comments_dict['parent_id'].append(None)
#         try:
#             comments_dict['score'].append(comment.score)
#         except AttributeError:
#             comments_dict['score'].append(None)
#         try:
#             comments_dict['approved_by'].append(comment.approved_by)
#         except AttributeError:
#             comments_dict['approved_by'].append(None)
#         try:
#             comments_dict['author_premium'].append(comment.author_premium)
#         except AttributeError:
#             comments_dict['author_premium'].append(None)
#         try:
#             comments_dict['mod_note'].append(comment.mod_note)
#         except AttributeError:
#             comments_dict['mod_note'].append(None)
#         try:
#             comments_dict['all_awardings'].append(comment.all_awardings)
#         except AttributeError:
#             comments_dict['all_awardings'].append(None)
#         try:
#             comments_dict['subreddit_id'].append(comment.subreddit_id)
#         except AttributeError:
#             comments_dict['subreddit_id'].append(None)
#         try:
#             comments_dict['body'].append(comment.body)
#         except AttributeError:
#             comments_dict['body'].append(None)
#         try:
#             comments_dict['awarders'].append(comment.awarders)
#         except AttributeError:
#             comments_dict['awarders'].append(None)
#         try:
#             comments_dict['user_reports'].append(comment.user_reports)
#         except AttributeError:
#             comments_dict['user_reports'].append(None)
#         try:
#             comments_dict['name'].append(comment.name)
#         except AttributeError:
#             comments_dict['name'].append(None)
#         try:
#             comments_dict['downs'].append(comment.downs)
#         except AttributeError:
#             comments_dict['downs'].append(None)
#         try:
#             comments_dict['author_flair_richtext'].append(comment.author_flair_richtext)
#         except AttributeError:
#             comments_dict['author_flair_richtext'].append(None)
#         try:
#             comments_dict['is_submitter'].append(comment.is_submitter)
#         except AttributeError:
#             comments_dict['is_submitter'].append(None)
#         try:
#             comments_dict['collapsed_reason'].append(comment.collapsed_reason)
#         except AttributeError:
#             comments_dict['collapsed_reason'].append(None)
#         try:
#             comments_dict['distinguished'].append(comment.distinguished)
#         except AttributeError:
#             comments_dict['distinguished'].append(None)
#         try:
#             comments_dict['associated_award'].append(comment.associated_award)
#         except AttributeError:
#             comments_dict['associated_award'].append(None)
#         try:
#             comments_dict['stickied'].append(comment.stickied)
#         except AttributeError:
#             comments_dict['stickied'].append(None)
#         try:
#             comments_dict['can_gild'].append(comment.can_gild)
#         except AttributeError:
#             comments_dict['can_gild'].append(None)
#         try:
#             comments_dict['top_awarded_type'].append(comment.top_awarded_type)
#         except AttributeError:
#             comments_dict['top_awarded_type'].append(None)
#         try:
#             comments_dict['score_hidden'].append(comment.score_hidden)
#         except AttributeError:
#             comments_dict['score_hidden'].append(None)
#         try:
#             comments_dict['permalink'].append(comment.permalink)
#         except AttributeError:
#             comments_dict['permalink'].append(None)
#         try:
#             comments_dict['num_reports'].append(comment.num_reports)
#         except AttributeError:
#             comments_dict['num_reports'].append(None)
#         try:
#             comments_dict['locked'].append(comment.locked)
#         except AttributeError:
#             comments_dict['locked'].append(None)
#         try:
#             comments_dict['report_reasons'].append(comment.report_reasons)
#         except AttributeError:
#             comments_dict['report_reasons'].append(None)
#         try:
#             comments_dict['created'].append(comment.created)
#         except AttributeError:
#             comments_dict['created'].append(None)

#     if n % 10 == 0:
#         print(n)

# comments_df = pd.DataFrame(comments_dict)
# print(comments_df.shape)

In [199]:
# And annotate with whether comment received delta or not
# (look at `author_flair_text`)
comments_df['changed_view'] = comments_df['author_flair_text'].apply(
    lambda x: '∆' in x if type(x) == str else False
)
comments_df['changed_view'].value_counts()

False    21604
True     15630
Name: changed_view, dtype: int64

In [205]:
comments_df.shape

(37234, 59)

In [203]:
comments_df.head()

Unnamed: 0,author_flair_text,treatment_tags,collapsed,subreddit_name_prefixed,controversiality,collapsed_because_crowd_control,mod_reports,subreddit_type,ups,_replies,...,stickied,can_gild,top_awarded_type,score_hidden,permalink,num_reports,locked,report_reasons,created,changed_view
0,,[],False,r/changemyview,0,,[],public,3,[],...,False,True,,False,/r/changemyview/comments/1vxyf3/i_believe_that...,,False,,1390489000.0,False
1,,[],False,r/changemyview,0,,[],public,3,[],...,False,True,,False,/r/changemyview/comments/1vxyf3/i_believe_that...,,False,,1390493000.0,False
2,,[],False,r/changemyview,0,,[],public,2,[],...,False,True,,False,/r/changemyview/comments/1vxyf3/i_believe_that...,,False,,1390494000.0,False
3,20∆,[],False,r/changemyview,0,,[],public,2,[],...,False,True,,False,/r/changemyview/comments/1vxyf3/i_believe_that...,,False,,1390499000.0,True
4,,[],False,r/changemyview,0,,[],public,4,[],...,False,True,,False,/r/changemyview/comments/1vps5i/i_am_an_18_yea...,,False,,1390268000.0,False


In [204]:
#os.makedirs('comments_output/changemyview')
comments_df.to_csv('comments_output/changemyview/from_posts_1-1-2010_to_9-22-2021.csv',
                  index=False)

# OLD

However, the API only returns 100 results at a time, so we need some additional wrapper scripts to iteratively retrieve all results. 

`collectSubData` gets specific fields we want from a post and `collectCommData` does the same but for a comment. Check out [this doc page](#https://pushshift.io/api-parameters/) for details on how you can modify these 2 scripts to change the fields you might be interested in.

In [21]:
def collectSubData(subm,subs_dict):
    try:
        title = subm['title']
    except KeyError:
        title = None
    try:
        url = subm['url']
    except KeyError:
        url = None
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"   
    try:
        author = subm['author']
    except KeyError:
        author = None
    sub_id = subm['id']
    try:
        score = subm['score']
    except KeyError:
        score = None
    try:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    except KeyError:
        created = None
    try:
        numComms = subm['num_comments']
    except KeyError:
        numComms = None
    try:
        permalink = subm['permalink']
    except KeyError:
        permalink = None
    try:
        is_vid = subm['is_video']
    except KeyError:
        is_vid = None
    try:
        upvote_ratio = subm['upvote_ratio']
    except KeyError:
        upvote_ratio = None
    try:
        text = subm['selftext'].strip().replace('\t','').replace('\n','')
    except KeyError:
        text = ""
    try:
        subreddit = subm['subreddit']
    except KeyError:
        subreddit = None
    subData = {'id':sub_id,'title':title,'url':url,'author':author,'score':score,'date':created,
                    'num_comments':numComms,'permalink':permalink,'flair':flair,'is_video':is_vid,
                    'upvote_ratio':upvote_ratio,'text':text,'subreddit':subreddit}
    subs_dict[sub_id] = subData
    
def collectCommData(subm,subs_dict): 
    try:
        author = subm['author']
    except KeyError:
        author = None
    sub_id = subm['id']
    try:
        link_id = subm['link_id']
    except KeyError:
        link_id = None
    try:
        score = subm['score']
    except KeyError:
        score = None
    try:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    except KeyError:
        created = None
    try:
        permalink = subm['permalink']
    except KeyError:
        permalink = None
    try:
        text = subm['body'].strip().replace('\t','').replace('\n','')
    except KeyError:
        text = ""
    try:
        subreddit = subm['subreddit']
    except KeyError:
        subreddit = None
    subData = {'id':sub_id,'link_id':link_id,'author':author,'score':score,'date':created,
                    'permalink':permalink,'text':text,'subreddit':subreddit}
    subs_dict[sub_id] = subData

Then, we can call `pushshift_wrapper` to run until all data has been gathered.

In [70]:
def pushshift_wrapper(after_str, before_str, datatype, 
                      subreddit=None, query=None, keywords=None):
    failed_requests = []
    
    if query is None:
        query = '|'.join(keywords)
    
    subCount = 0
    subStats = {}
    print("Getting all submissions with query '{}' within subreddit {} from {} to {}".format(
        query,subreddit,after_str,before_str))
    try:
        data = getPushshiftDataForSub(subreddit, query, after_str, before_str, datatype)
        # Will run until all posts have been gathered 
        # from the 'after' date up until before date
        while len(data) > 0:
            for submission in data:
                if datatype == "submission":
                    collectSubData(submission,subStats)
                else:
                    collectCommData(submission,subStats)
                subCount+=1
            # Calls getPushshiftData() with the created date of the last submission
            #print(len(data))
#             print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
#             after_timestamp = data[-1]['created_utc']
            
            new_after_str = str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])).\
                            split()[0]
            split_after_str = new_after_str.split('-')
            new_after_str = '{}-{}-{}'.format(split_after_str[1],split_after_str[2],
                                             split_after_str[0])
            print(new_after_str)
            try:
                data = getPushshiftDataForSub(subreddit, query, 
                                              new_after_str, before_str, datatype)
            except JSONDecodeError:
                failed_requests.append((subreddit,query,after_str,before_str,datatype))

        print('Num submissions:',subCount,len(subStats))

        interim_df = pd.DataFrame(list(subStats.values()))
        #print(interim_df)

        datatype_prefix = 'posts' if datatype == 'submission' else 'post_comments'
        out_dir = os.path.join('pushshift_output',datatype_prefix,subreddit,'{}_to_{}'.format(after_str,before_str))
        failed_reqs_out_dir = os.path.join('pushshift_output','failed_requests',subreddit,
                                                      '{}_{}'.format(after_str,before_str))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        if not os.path.exists(failed_reqs_out_dir):
            os.makedirs(failed_reqs_out_dir)
            
        if keywords is None:
            interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format(query)))
            print('Saved query submissions to {}!'.format(os.path.join(out_dir,'{}.pkl'.format(query))))
            pickle.dump(failed_requests,open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format(query)),'wb'))
        else:
            interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format('keywords_long')))
            print('Saved query submissions to {}!'.format(os.path.join(out_dir,'{}.pkl'.format('keywords_long'))))
            pickle.dump(failed_requests,open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format('keywords_long')),'wb'))
            
    except JSONDecodeError:
        failed_requests.append((query,after_str,before_str,datatype))
        print("First request failed")

In [71]:
pushshift_wrapper('01-01-2014','1-5-2014','submission',subreddit='changemyview',
                 query='climate change')

Getting all submissions with query 'climate change' within subreddit changemyview from 01-01-2014 to 1-5-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014
01-03-2014


KeyboardInterrupt: 

In [55]:
# Collect all posts w/ climate change keyword from CMV  

for start_year in range(2010,2015,1):
    end_year = start_year+1
    for keyword in KEYWORDS_LONG:
        if not os.path.exists(os.path.join('pushshift_output','posts','changemyview'
                                           '1-1-{}_to_12-31-{}'.format(start_year,start_year),
                                           '{}.pkl'.format(keyword))):
            print("Missing {}, {}".format(keyword,start_year))
            pushshift_wrapper('1-1-{}'.format(start_year),'12-31-{}'.format(start_year),
                              'submission',subreddit='changemyview',query=keyword)
            

Missing global warming, 2010
Getting all submissions with query 'global warming' within subreddit changemyview from 1-1-2010 to 12-31-2010
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2010_to_12-31-2010/global warming.pkl!
Missing climate change, 2010
Getting all submissions with query 'climate change' within subreddit changemyview from 1-1-2010 to 12-31-2010
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2010_to_12-31-2010/climate change.pkl!
Missing carbon, 2010
Getting all submissions with query 'carbon' within subreddit changemyview from 1-1-2010 to 12-31-2010
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2010_to_12-31-2010/carbon.pkl!
Missing fossil fuel, 2010
Getting all submissions with query 'fossil fuel' within subreddit changemyview from 1-1-2010 to 12-31-2010
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2010_

Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2011_to_12-31-2011/2 degree.pkl!
Missing sustainable, 2011
Getting all submissions with query 'sustainable' within subreddit changemyview from 1-1-2011 to 12-31-2011
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2011_to_12-31-2011/sustainable.pkl!
Missing clean energy, 2011
Getting all submissions with query 'clean energy' within subreddit changemyview from 1-1-2011 to 12-31-2011
First request failed
Missing renewable, 2011
Getting all submissions with query 'renewable' within subreddit changemyview from 1-1-2011 to 12-31-2011
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2011_to_12-31-2011/renewable.pkl!
Missing cap and trade, 2011
Getting all submissions with query 'cap and trade' within subreddit changemyview from 1-1-2011 to 12-31-2011
First request failed
Missing sea level rise, 2011
Getting all submissions with

First request failed
Missing greenhouse effect, 2012
Getting all submissions with query 'greenhouse effect' within subreddit changemyview from 1-1-2012 to 12-31-2012
First request failed
Missing green new deal, 2012
Getting all submissions with query 'green new deal' within subreddit changemyview from 1-1-2012 to 12-31-2012
First request failed
Missing EPA, 2012
Getting all submissions with query 'EPA' within subreddit changemyview from 1-1-2012 to 12-31-2012
First request failed
Missing global warming, 2013
Getting all submissions with query 'global warming' within subreddit changemyview from 1-1-2013 to 12-31-2013
Num submissions: 0 0
Saved query submissions to pushshift_output/posts/changemyview/1-1-2013_to_12-31-2013/global warming.pkl!
Missing climate change, 2013
Getting all submissions with query 'climate change' within subreddit changemyview from 1-1-2013 to 12-31-2013
First request failed
Missing carbon, 2013
Getting all submissions with query 'carbon' within subreddit changem

KeyboardInterrupt: 

In [36]:
out = getPushshiftDataForSub('changemyview', 'the', '1-1-{}'.format(start_year),
                       '12-31-{}'.format(start_year), 'submission')

In [37]:
len(out)

0

In [38]:
'1-1-{}'.format(start_year)

'1-1-2010'

In [31]:
# for start_year in range(2015,2021,1):
#     end_year = start_year+1
#     pushshift_wrapper('1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'submission',
#                       query=None,keywords=keywords_missing)

## Get all comments attached to a post

## PRAW

### Get IDs

In [410]:
posts_with_comments_ids = pickle.load(open('output/posts_with_comments_ids.pkl','rb'))
print(len(posts_with_comments_ids))
sub_ids_to_fetch = list(posts_with_comments_ids)

419100

In [420]:
# with open('output/comment_ids_per_post.tsv','w') as f:
#     f.write("{}\t{}\n".format('post_id','comment_ids'))

In [439]:
def praw_get_comments(sub_id):
    try:
        post = reddit.submission(id=sub_id)
        post_author = post.author
        post_title = post.title
        post_comms = list(post.__dict__['_comments_by_id'].keys())
        #print(len(post_comms))
        comments_per_post[sub_id] = post_comms

        with open('output/comment_ids_per_post.tsv','a') as f:
            f.write("{}\t{}\n".format(sub_id,','.join(post_comms)))
    except Forbidden:
        pass

In [461]:
for ix_sub_id in range(174623,len(sub_ids_to_fetch)):
    sub_id = sub_ids_to_fetch[ix_sub_id]
    praw_get_comments(sub_id)
    
    if ix_sub_id % 1000 == 0:
        print(ix_sub_id)

175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
227000
228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
240000
241000
242000
243000
244000
245000
246000
247000
248000
249000
250000
251000
252000
253000
254000
255000
256000
257000
258000
259000
260000
261000
262000
263000
264000
265000
266000
267000
268000
269000
270000
271000
272000
273000
274000
275000
276000
277000
278000
279000
280000
281000
282000
283000
284000
285000
286000
287000
288000
289000
290000
291000
292000
293000
294000
295000
296000
297000
298000
299000
300000
301000
302000
303000
304000
305000
306000
307000
308000
309000
310000
311000
312000
313000
314000
315000
316000
317000

In [460]:
ix_sub_id,sub_id

(174623, '92efdf')

In [447]:
len(comments_per_post)

11720

In [462]:
pd.read_csv('output/comment_ids_per_post.tsv',sep='\t')

Unnamed: 0,post_id,comment_ids
0,anbg0,"t1_c0ignhg,t1_c0ignz0,t1_c0igq3n"
1,apn8b,"t1_c0irhej,t1_c0is8sw"
2,aprgq,"t1_c0isfnb,t1_c0is90g,t1_c0iscd2,t1_c0isbol,t1..."
3,aprgv,"t1_c0is3y4,t1_c0ism1k,t1_c0isyft,t1_c0isvkx,t1..."
4,aprie,"t1_c0is4by,t1_c0islml,t1_c0ismfv,t1_c0isk7q,t1..."
...,...,...
403412,3yuldj,"t1_cygrqj0,t1_cygrhs5,t1_cygtcrx,t1_cygtrvf,t1..."
403413,3yulsl,"t1_cylxj3h,t1_cygqso3"
403414,3yup0j,"t1_cyhjhnw,t1_cylxiyc"
403415,3yuytd,"t1_cygu9uo,t1_cygx6g5,t1_cyhpqys,t1_cygxgig,t1..."


### Get text

In [405]:
comment = reddit.comment("c0st842")
comment.body

'Personally I am more concerned about the fact that methane is an extremely effective greenhouse gas.  '

In [None]:
comment

In [None]:
comment_ids_per_post = pd.read_csv('output/comment_ids_per_post.tsv',sep='\t')
comment_ids = comment_ids_per_post['comment_ids']
all_comment_ids = [x.split(',') for x in comment_ids]
all_comment_ids = [item for sublist in all_comment_ids for item in sublist]
unique_comment_ids = set(all_comment_ids)

In [None]:
print(len(all_comment_ids),len(unique_comment_ids))

In [None]:
# Dataframe of 
# comment_id | text

with open('output/text_per_comment.tsv','w') as f:
    f.write('{}\t{}\n'.format('comment_id','text'))
    
unique_comment_ids = list(unique_comment_ids)

In [None]:
for c_id_index in range(len(unique_comment_ids)):
    c_id = unique_comment_ids[c_id_index]
    comm = reddit.comment(c_id)
    comment_body = strip_whitespace(comm.body)
    
    with open('output/text_per_comment.tsv','a') as f:
        f.write('{}\t{}\n'.format(c_id,comment_body))