In [1]:
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from praw.models import Submission
from psaw import PushshiftAPI
from prawcore.exceptions import Forbidden
from praw.exceptions import ClientException
import csv
import os
from urllib.error import HTTPError
import glob
import requests
import json
from json import JSONDecodeError
import datetime
import pickle
from collections import defaultdict

  import pandas.util.testing as tm


In [2]:
import re

_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def strip_whitespace(text):
    return _RE_COMBINE_WHITESPACE.sub(" ", text).strip()

In [3]:
def get_submissions(reddit_instance,subreddit_str):
    
    if not os.path.exists('praw_output'):
        os.mkdir('praw_output')
    
    subreddit = reddit_instance.subreddit(subreddit_str)
    print('Getting submissions and comments from: {}'.format(subreddit.display_name))  
    try:
        title = subreddit.title
        desc = subreddit.description

        if not os.path.exists('subreddits.tsv'):
            with open('subreddits.tsv','w') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])
        else:
            with open('subreddits.tsv','a') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])

        # Write header
        with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['title','author','date','is_video','id','num_downs','num_ups','upvote_ratio',
                               'num_comments','score','text','subreddit'])

        # Write body
        for submission in subreddit.new(limit=None):
            sub_title = submission.title
            sub_author = submission.author.name if submission.author is not None else -1
            sub_date = submission.created
            sub_is_vid = submission.is_video
            sub_id = submission.id
            sub_downvotes = submission.downs
            sub_upvotes = submission.ups
            sub_upvote_ratio = submission.upvote_ratio
            sub_num_comments = submission.num_comments
            sub_score = submission.score
            sub_text = submission.selftext.strip().replace('\t','').replace('\n','')
            sub_subreddit = submission.subreddit.display_name
            with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_title,sub_author,sub_date,sub_is_vid,sub_id,sub_downvotes,
                                   sub_upvotes,sub_upvote_ratio,sub_num_comments,sub_score,sub_text,sub_subreddit])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')
            
def get_submission_comments(reddit_instance,subreddit,submission_id):
    
    submission = Submission(reddit_instance,id=submission_id)
    
    try:
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        # Write header
        with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['submission_id','author','text','date','id','controversiality','num_downs','num_ups',
                               'num_likes','score','subreddit'])

        # Write body
        for comment in all_comments:
            sub_id = comment._submission.id
            assert sub_id == submission_id
            author_name = comment.author.name if comment.author is not None else -1
            comment_body = comment.body.strip().replace('\t','').replace('\n','')
            date_created = comment.created
            comment_id = comment.id
            controversiality = comment.controversiality
            num_downs = comment.downs
            num_ups = comment.ups
            num_likes = comment.likes
            score = comment.score
            subreddit_name = comment.subreddit.display_name
            #print(subreddit_name,subreddit)
            assert subreddit_name == subreddit
            
            with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_id,author_name,comment_body,date_created,comment_id,controversiality,
                                   num_downs,num_ups,num_likes,score,subreddit_name])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')

# Load sets of keywords, subreddits

In [4]:
with open('../keywords_long.txt','r') as f:
    KEYWORDS_LONG = f.read().splitlines()
KEYWORDS_SHORT = set(["climate change","global warming","carbon","co2","methane",
                  "green","environment","fossil fuel"])
#my_keywords = set(['climate change', 'global warming', 'fossil fuel', 'methane', 'carbon', 'co2'])

Read in climate-related subreddits:

In [64]:
SUBREDDITS = pd.read_csv('CLIMATE_SUBREDDITS.txt',sep='\t',header=0)
SUBREDDITS.stance.value_counts()

pro     42
anti    20
neut    10
Name: stance, dtype: int64

In [65]:
SUBREDDITS.loc[SUBREDDITS.stance=='neut']

Unnamed: 0,subreddit,stance
4,science,neut
19,energy,neut
52,climatecmv,neut
65,Republican,neut
66,republicans,neut
67,askaconservative,neut
68,Conservative,neut
69,conservatives,neut
70,TrueConservativism,neut
71,AskTrumpSupporters,neut


In [66]:
len(set(SUBREDDITS['subreddit'])),len(SUBREDDITS['subreddit'])

(72, 72)

In [159]:
SUBREDDITS_LIST = list(SUBREDDITS['subreddit'])

In [6]:
most_common_subs_df = pd.read_csv('most_common_subreddits.tsv',sep='\t',header=0)
most_common_subs_df.head()

Unnamed: 0,subreddit_name,initials,What is the subreddit about?,Unnamed: 3,Can you infer a general political leaning of the subreddit?,cc_stance,Unnamed: 6,What other impressions do you have about the subreddit?,Unnamed: 8,Unnamed: 9,Unnamed: 10,Other notes
0,r/politics,yiwei,general political news (in the US?),,"pretty liberal--stuff defending AOC, in favor ...",p,,a fair amount of sarcasm in the comment thread...,,,,posts mostly just links to news articles
1,r/worldnews,yiwei,world news excluding US events,,possibly liberal--stuff about COVID is aligned...,p,,some debate and opposing views in comment thre...,,,,
2,r/AskReddit,yiwei,"place to ask and answer ""thought-provoking"" qu...",,there are definitely liberals here (https://ww...,p,,main purpose may be for entertainment/humor?,,,,
3,r/Futurology,yiwei,"discussion about the future of humanity, civil...",,there is posting from liberal news outlets (th...,p,,a lot of AI-related posts,,,,
4,r/news,yiwei,"a lot of general news, primarily US but also t...",,"pretty liberal I think, see https://www.reddit...",p,,,,,,


In [7]:
most_common_subs = [x[2:] for x in most_common_subs_df.subreddit_name]
most_common_subs

['politics',
 'worldnews',
 'AskReddit',
 'Futurology',
 'news',
 'science',
 'The_Donald',
 'todayilearned',
 'collapse',
 'canada',
 'environment',
 'pics',
 'climateskeptics',
 'explainlikeimfive',
 'PlantedTank',
 'Homebrewing',
 'australia',
 'conspiracy',
 'europe',
 'EcoInternet',
 'TalkativePeople',
 'climate ',
 'askscience',
 'AutoNewspaper',
 'newsbotbot',
 'Showerthoughts',
 'POLITIC',
 'energy',
 'bicycling',
 'climatechange',
 'thinkpad',
 'mechmarket',
 'autotldr',
 'spacex',
 'Aquariums',
 'Conservative',
 'skeptic',
 'space',
 'technology',
 'EvolveSustain',
 'NoStupidQuestions',
 'unpopularopinion',
 'EverythingScience',
 'BreakingNews24hr',
 'changemyview',
 'neoliberal',
 'atheism',
 'Libertarian',
 'ChapoTrapHouse',
 'IAmA',
 'dataisbeautiful',
 'PoliticalDiscussion',
 'AdviceAnimals',
 'PoliticalHumor',
 'TrueReddit']

# Create PRAW reddit instance to get posts and comments

In [11]:
reddit = praw.Reddit(client_id='1sbu376RCBiWRw',
                     client_secret='NbqiHMPiKicBXvgfrID-xVNktZM',
                     user_agent='mac:cc_framing:v1 (by /u/emma_cc_research)')

In [132]:
# Can it know if a user is a bot?
vars(reddit.redditor("DragonFireDon"))

{'_reddit': <praw.reddit.Reddit at 0x11c21d630>,
 '_fetched': False,
 '_listing_use_sort': True,
 'name': 'DragonFireDon'}

In [16]:
reddit.redditor("AutoModerator").link_karma

13230

In [18]:
reddit.redditor("AutoModerator").__dict__

{'_reddit': <praw.reddit.Reddit at 0x1177ea240>,
 '_fetched': False,
 '_listing_use_sort': True,
 'name': 'AutoModerator'}

In [13]:
reddit.redditor("emma_cc_research")

Redditor(name='emma_cc_research')

## Get posts from r/spambotwatch

In [5]:
get_submissions(reddit,'spambotwatch')

Getting submissions and comments from: spambotwatch


In [None]:
#get_submission_comments(reddit,'spambotwatch')

In [7]:
spambotwatch_df = pd.read_csv('praw_output/spambotwatch.tsv',sep='\t',header=0)
spambotwatch_df.title

0           JEEVAN BOBY (u/jeevanbobyvallickad) - Reddit
1                                 overview for funnynova
2      Spams Dating Tips Websites all with Stolen Con...
3                                 overview for poopcake5
4                                   overview for sutei_m
                             ...                        
190                               overview for royboy204
191                                 overview for mlg7732
192                            overview for 0wned_alover
193                                 overview for xrox333
194                                overview for frede933
Name: title, Length: 195, dtype: object

## Get posts from all subreddits

In [84]:
for i in range(len(SUBREDDITS['subreddit'])-1,len(SUBREDDITS['subreddit'])):
    SUBREDDIT = SUBREDDITS_LIST[i]
    get_submissions(reddit,SUBREDDIT)

Getting submissions and comments from: AskTrumpSupporters


## Inspect output: tsv of subreddits and meta

In [86]:
pd.read_csv('subreddits.tsv',sep='\t',header=None).drop_duplicates(0,keep='first')

Unnamed: 0,0,1,2
0,climateskeptics,Climate Skeptics: Trying to see through the al...,"Seeing past hyperbole, alarmism and environmen..."
1,skeptic,skeptic,## [Click this link to Read the Rules](http://...
2,climatechange,A place for a rational discussion on a divisiv...,This is a place for the rational discussion of...
3,climate,Information about the world's climate,Real and accurate data about the Earth's clima...
4,science,Reddit Science,# [Submission Rules](https://www.reddit.com/r/...
...,...,...,...
63,republicans,Republicans - RNC - GOP: Grand Old Party,"Republican, RNC and GOP news, issues, gossip, ..."
64,askaconservative,Ask A Conservative: Ask Conservatives And Repu...,#[Ask a Conservative](/r/askaconservative)\n\n...
65,Conservative,Conservative,#####\n**[Join us on discord.](https://discord...
66,conservatives,conservatives,"Conservatism (from, conservare, ""to preserve"")..."


## Inspect output: tsv of one subreddit's posts

In [88]:
df = pd.read_csv('praw_output/350.tsv',sep='\t',header=0)

In [89]:
df.columns

Index(['title', 'author', 'date', 'is_video', 'id', 'num_downs', 'num_ups',
       'upvote_ratio', 'num_comments', 'score', 'text', 'subreddit'],
      dtype='object')

In [90]:
df.is_video.value_counts()

False    127
Name: is_video, dtype: int64

In [91]:
df.subreddit.value_counts()

350    127
Name: subreddit, dtype: int64

In [160]:
#df.head(15)

## Get comments for all posts with non-zero num comments

In [156]:
for subreddit_tsv in glob.glob('praw_output/posts/*.tsv'):
    if os.path.exists('praw_output/post_comments/{}_COMMENTS.tsv'.format(subreddit_tsv.split('/')[-1][:-4])):
        print('Already got comments for subreddit {}'.format(subreddit_tsv))
    else:
        subreddit_posts = pd.read_csv(subreddit_tsv,sep='\t',header=0)
        if len(subreddit_posts) > 0:
            subreddit = str(subreddit_posts.iloc[0]['subreddit'])
            posts_with_comments = subreddit_posts.loc[subreddit_posts.num_comments > 0]
            if len(posts_with_comments) > 0:
                print('Getting comments from posts in subreddit: {}'.format(subreddit))
                for ix,row in posts_with_comments.iterrows():
                    get_submission_comments(reddit,subreddit,row['id'])
            else:
                print('0 comments among all posts in subreddit: {}'.format(subreddit))
        else:
            print('Subreddit {} has no posts'.format(subreddit_tsv))

Already got comments for subreddit praw_output/posts/350.tsv
Already got comments for subreddit praw_output/posts/350ppm.tsv
Already got comments for subreddit praw_output/posts/askaconservative.tsv
Already got comments for subreddit praw_output/posts/AskTrumpSupporters.tsv
Already got comments for subreddit praw_output/posts/carboncapture.tsv
Already got comments for subreddit praw_output/posts/carbontax.tsv
Already got comments for subreddit praw_output/posts/ccfunding.tsv
Already got comments for subreddit praw_output/posts/climate.tsv
Already got comments for subreddit praw_output/posts/climate_activism.tsv
Already got comments for subreddit praw_output/posts/climate_discussion.tsv
Already got comments for subreddit praw_output/posts/climate_science.tsv
Already got comments for subreddit praw_output/posts/ClimateActionPlan.tsv
Already got comments for subreddit praw_output/posts/climatechange.tsv
Already got comments for subreddit praw_output/posts/ClimateChangeCancer.tsv
Already g

# Use Pushshift API

In [18]:
multiple_search_term = '|'.join(KEYWORDS_LONG)
multiple_search_term

'global warming|climate change|carbon|fossil fuel|methane|environment|co2|climate crisis|climate emergency|extreme weather|2 degree|sustainable|clean energy|renewable|cap and trade|sea level rise|environmental justice|climate justice|COP|IPCC|deforestation|permafrost|glacier|drought|ecosystem|greenhouse gas|greenhouse effect|green new deal|EPA'

In [23]:
before_date = datetime.datetime.strptime("07-22-2020", "%m-%d-%Y")
after_date = datetime.datetime.strptime("07-20-2020", "%m-%d-%Y")
before_timestamp = int(datetime.datetime.timestamp(before_date))
after_timestamp = int(datetime.datetime.timestamp(after_date))

In [26]:
test_data = getPushshiftData(multiple_search_term,after_timestamp,before_timestamp,"comment")

In [61]:
test_data = getPushshiftData(multiple_search_term,after_timestamp,before_timestamp,"submission")

In [62]:
len(test_data)

0

In [67]:
# test_data[0
#         ]

In [24]:
def getPushshiftData(query, after, before, datatype):
    query_prefix = 'title' if datatype == 'submission' else 'q'
    url = 'https://api.pushshift.io/reddit/search/'+datatype+'/?'+\
    query_prefix+'='+str(query)+'&size=1000&after='+str(after)+'&before='+str(before)
    #print(url)
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']


def getPushshiftSubreddit(subreddit, after, before, datatype):
    url = 'https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=1000&after={}&before={}'.format(
        datatype,subreddit,after,before)
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']


def collectSubData(subm,subs_dict):
    try:
        title = subm['title']
    except KeyError:
        title = None
    try:
        url = subm['url']
    except KeyError:
        url = None
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"   
    try:
        author = subm['author']
    except KeyError:
        author = None
    sub_id = subm['id']
    try:
        score = subm['score']
    except KeyError:
        score = None
    try:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    except KeyError:
        created = None
    try:
        numComms = subm['num_comments']
    except KeyError:
        numComms = None
    try:
        permalink = subm['permalink']
    except KeyError:
        permalink = None
    try:
        is_vid = subm['is_video']
    except KeyError:
        is_vid = None
    try:
        upvote_ratio = subm['upvote_ratio']
    except KeyError:
        upvote_ratio = None
    try:
        text = subm['selftext'].strip().replace('\t','').replace('\n','')
    except KeyError:
        text = ""
    try:
        subreddit = subm['subreddit']
    except KeyError:
        subreddit = None
    subData = {'id':sub_id,'title':title,'url':url,'author':author,'score':score,'date':created,
                    'num_comments':numComms,'permalink':permalink,'flair':flair,'is_video':is_vid,
                    'upvote_ratio':upvote_ratio,'text':text,'subreddit':subreddit}
    subs_dict[sub_id] = subData
    
    
def collectCommData(subm,subs_dict): 
    try:
        author = subm['author']
    except KeyError:
        author = None
    sub_id = subm['id']
    try:
        link_id = subm['link_id']
    except KeyError:
        link_id = None
    try:
        score = subm['score']
    except KeyError:
        score = None
    try:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    except KeyError:
        created = None
    try:
        permalink = subm['permalink']
    except KeyError:
        permalink = None
    try:
        text = subm['body'].strip().replace('\t','').replace('\n','')
    except KeyError:
        text = ""
    try:
        subreddit = subm['subreddit']
    except KeyError:
        subreddit = None
    subData = {'id':sub_id,'link_id':link_id,'author':author,'score':score,'date':created,
                    'permalink':permalink,'text':text,'subreddit':subreddit}
    subs_dict[sub_id] = subData

In [19]:
def pushshift_wrapper(after_str, before_str, datatype, query=None, keywords=None):
    failed_requests = []
    
    if query == 'ALL':
        query = '|'.join(keywords)
    
    subCount = 0
    subStats = {}
    before_date = datetime.datetime.strptime(before_str, "%m-%d-%Y")
    after_date = datetime.datetime.strptime(after_str, "%m-%d-%Y")
    before_timestamp = int(datetime.datetime.timestamp(before_date))
    after_timestamp = int(datetime.datetime.timestamp(after_date))
#     print("Getting all submissions with query '{}' from {} to {}".format(query,
#                                                                          after_str,before_str))
    print("Getting all submissions from subreddit '{}' from {} to {}".format(query,
                                                                         after_str,before_str))
    try:
        #data = getPushshiftData(query, after_timestamp, before_timestamp, datatype)
        data = getPushshiftSubreddit(query, after_timestamp, before_timestamp, datatype)
        # Will run until all posts have been gathered 
        # from the 'after' date up until before date
        while len(data) > 0:
            for submission in data:
                if datatype == "submission":
                    collectSubData(submission,subStats)
                else:
                    collectCommData(submission,subStats)
                subCount+=1
            # Calls getPushshiftData() with the created date of the last submission
            #print(len(data))
            #print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
            after_timestamp = data[-1]['created_utc']
            #print(after_timestamp)
            try:
                #data = getPushshiftData(query, after_timestamp, before_timestamp, datatype)
                data = getPushshiftSubreddit(query, after_timestamp, before_timestamp, datatype)
            except JSONDecodeError:
                failed_requests.append((query,after_timestamp,before_timestamp,datatype))

        print('Num submissions:',subCount,len(subStats))

        interim_df = pd.DataFrame(list(subStats.values()))
        #print(interim_df)

        datatype_prefix = 'posts' if datatype == 'submission' else 'post_comments'
        out_dir = os.path.join('output','pushshift_output_background',datatype_prefix,'{}_to_{}'.format(after_str,before_str))
        failed_reqs_out_dir = os.path.join('output','pushshift_output_background','failed_requests',
                                                      '{}_{}'.format(after_str,before_str))
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        if not os.path.exists(failed_reqs_out_dir):
            os.mkdir(failed_reqs_out_dir)
            
        if True:#keywords is None:
            interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format(query)))
            print('Saved subreddit posts to {}!'.format(os.path.join(out_dir,'{}.pkl'.format(query))))
            pickle.dump(failed_requests,open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format(query)),'wb'))
        else:
            interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format('keywords_long')))
            print('Saved subreddit posts to {}!'.format(os.path.join(out_dir,'{}.pkl'.format('keywords_long'))))
            pickle.dump(failed_requests,open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format('keywords_long')),'wb'))
            
    except JSONDecodeError:
        failed_requests.append((query,after_timestamp,before_timestamp,datatype))
        print("First request failed")

In [17]:
len(most_common_subs)

55

In [None]:
for start_year in range(2010,2015,1):
    end_year = start_year+1
    for sub in most_common_subs:
        if not os.path.exists(os.path.join('output','pushshift_output_background','posts',
                                           '1-1-{}_to_12-31-{}'.format(start_year,start_year),
                                           '{}.pkl'.format(sub))):
            print("Missing {}, {}".format(sub,start_year))
            pushshift_wrapper('1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'submission',
            query=sub)
print('************')    
# for start_year in range(2010,2015,1):
#     end_year = start_year+1
#     #for keyword in KEYWORDS_LONG:
#     for sub in most_common_subs:
#         if not os.path.exists(os.path.join('output','pushshift_output_background','post_comments',
#                                            '1-1-{}_to_12-31-{}'.format(start_year,start_year),
#                                            '{}.pkl'.format(sub))):
#             print("Missing {}, {}".format(sub,start_year))
#             #pushshift_wrapper(keyword,'1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'comment')

Missing politics, 2010
Getting all submissions from subreddit 'politics' from 1-1-2010 to 12-31-2010


In [31]:
# for start_year in range(2015,2021,1):
#     end_year = start_year+1
#     pushshift_wrapper('1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'submission',
#                       query=None,keywords=keywords_missing)

# Get all comments attached to a post

## PRAW

### Get IDs

In [410]:
posts_with_comments_ids = pickle.load(open('output/posts_with_comments_ids.pkl','rb'))
print(len(posts_with_comments_ids))
sub_ids_to_fetch = list(posts_with_comments_ids)

419100

In [420]:
# with open('output/comment_ids_per_post.tsv','w') as f:
#     f.write("{}\t{}\n".format('post_id','comment_ids'))

In [439]:
def praw_get_comments(sub_id):
    try:
        post = reddit.submission(id=sub_id)
        post_author = post.author
        post_title = post.title
        post_comms = list(post.__dict__['_comments_by_id'].keys())
        #print(len(post_comms))
        comments_per_post[sub_id] = post_comms

        with open('output/comment_ids_per_post.tsv','a') as f:
            f.write("{}\t{}\n".format(sub_id,','.join(post_comms)))
    except Forbidden:
        pass

In [None]:
for ix_sub_id in range(11950,len(sub_ids_to_fetch)):
    sub_id = sub_ids_to_fetch[ix_sub_id]
    praw_get_comments(sub_id)
    
    if ix_sub_id % 1000 == 0:
        print(ix_sub_id)

12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000


In [452]:
ix_sub_id,sub_id

(11950, '4q96ti')

In [447]:
len(comments_per_post)

11720

### Get text

In [7]:
comment_ids_per_post = pd.read_csv('output/comment_ids_per_post.tsv',sep='\t')
comment_ids_per_post = comment_ids_per_post.loc[comment_ids_per_post.comment_ids.apply(lambda x: 
                                                                                      isinstance(x,str))]
comment_ids_per_post = comment_ids_per_post.loc[comment_ids_per_post.comment_ids.apply(lambda x: 
                                                                                      len(x) > 0)]
comment_ids = comment_ids_per_post['comment_ids']
all_comment_ids = [x.split(',') for x in comment_ids]
all_comment_ids = [item for sublist in all_comment_ids for item in sublist]
unique_comment_ids = set(all_comment_ids)

In [8]:
print(len(all_comment_ids),len(unique_comment_ids))

393227 393227


In [15]:
# Dataframe of 
# comment_id | text

with open('output/text_per_comment.tsv','w') as f:
    f.write('{}\t{}\n'.format('comment_id','text'))
    
unique_comment_ids = list(unique_comment_ids)

In [28]:
c_id_index

112577

In [None]:
for c_id_index in range(112577,len(unique_comment_ids)):
    
    c_id = unique_comment_ids[c_id_index].split('_')[-1]
    comm = reddit.comment(c_id)
    try:
        comment_body = strip_whitespace(comm.body)

        with open('output/text_per_comment.tsv','a') as f:
            f.write('{}\t{}\n'.format(c_id,comment_body))
    except ClientException:
        pass

    if c_id_index % 1000 == 0:
        print(c_id_index)

113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000


In [18]:
test_df = pd.read_csv('output/text_per_comment.tsv',sep='\t',header=0)

In [19]:
test_df.comment_id.apply(lambda x: type(x)).value_counts()

<class 'str'>    100
Name: comment_id, dtype: int64

In [20]:
test_df.comment_id.apply(lambda x: len(x)).value_counts()

7    100
Name: comment_id, dtype: int64

In [21]:
test_df.text.apply(lambda x: type(x)).value_counts()

<class 'str'>    100
Name: text, dtype: int64

## Pushshift -- doesn't seem to work >:(

In [290]:
def getComments(sub_id):
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(sub_id)
    r = requests.get(url)
    sub_comm_data = json.loads(r.text)
    return sub_comm_data['data']

In [45]:
def pushshift_comment_wrapper(link_id, after_str, before_str, datatype):
    failed_requests = []
    
    subCount = 0
    subStats = {}
    before_date = datetime.datetime.strptime(before_str, "%m-%d-%Y")
    after_date = datetime.datetime.strptime(after_str, "%m-%d-%Y")
    before_timestamp = int(datetime.datetime.timestamp(before_date))
    after_timestamp = int(datetime.datetime.timestamp(after_date))
    print("Getting all comments with link_id '{}' from {} to {}".format(link_id,after_str,before_str))
    try:
        data = getComments(link_id, after_timestamp, before_timestamp, datatype)
        # Will run until all posts have been gathered 
        # from the 'after' date up until before date
        while len(data) > 0:
            for submission in data:
                if datatype == "submission":
                    collectSubData(submission,subStats)
                else:
                    collectCommData(submission,subStats)
                subCount+=1
            # Calls getPushshiftData() with the created date of the last submission
            #print(len(data))
            #print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
            after_timestamp = data[-1]['created_utc']
            #print(after_timestamp)
            try:
                data = getComments(link_id, after_timestamp, before_timestamp, datatype)
            except JSONDecodeError:
                failed_requests.append((link_id,after_timestamp,before_timestamp,datatype))

        print('Num submissions:',subCount,len(subStats))

        interim_df = pd.DataFrame(list(subStats.values()))
        #print(interim_df)

        datatype_prefix = 'posts' if datatype == 'submission' else 'linked_post_comments'
        out_dir = os.path.join('output','pushshift_output',datatype_prefix,'{}_to_{}'.format(after_str,before_str))
        failed_reqs_out_dir = os.path.join('pushshift_output','failed_requests',
                                                      '{}_{}'.format(after_str,before_str))
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        if not os.path.exists(failed_reqs_out_dir):
            os.mkdir(failed_reqs_out_dir)
            
        interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format(link_id)))
        print('Saved query submissions to {}!'.format(os.path.join(out_dir,'{}.pkl'.format(link_id))))
        pickle.dump(failed_requests,open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format(link_id)),'wb'))
            
    except JSONDecodeError:
        failed_requests.append((query,after_timestamp,before_timestamp,datatype))
        print("First request failed")

In [390]:
getComments(sub_id)

[]

In [391]:
url = 'https://api.pushshift.io/reddit/submission/search/?ids={}&limit=1000'.format(sub_id)
r = requests.get(url)
sub_data = json.loads(r.text)

In [392]:
len(sub_data['data'])

1

In [None]:
#sub_data['data'][0]

In [394]:
url = 'https://api.pushshift.io/reddit/comment/search/?id=43u5cw'
r = requests.get(url)
comm_data = json.loads(r.text)

In [None]:
#comm_data

In [395]:
url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(sub_id)
r = requests.get(url)
sub_comm_data = json.loads(r.text)

In [396]:
sub_comm_data['data']

[]

In [397]:
url = 'https://api.pushshift.io/reddit/comment/search/?link_id={}'.format(sub_id)
r = requests.get(url)
comm_from_sub_data = json.loads(r.text)

In [398]:
comm_from_sub_data['data']

[]

In [301]:
comm_ids = [x['id'] for x in comm_from_sub_data['data']]
comm_ids

[]

In [286]:
set(comm_ids).issubset(set(sub_comm_data['data']))

True

In [287]:
len(comm_ids),len(sub_comm_data['data'])

(25, 71)