In [3]:
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from praw.models import Submission
from psaw import PushshiftAPI
import csv
import os
from urllib.error import HTTPError
import glob

  import pandas.util.testing as tm


In [4]:
def get_submissions(reddit_instance,subreddit_str):
    
    if not os.path.exists('praw_output'):
        os.mkdir('praw_output')
    
    subreddit = reddit_instance.subreddit(subreddit_str)
    print('Getting submissions and comments from: {}'.format(subreddit.display_name))  
    try:
        title = subreddit.title
        desc = subreddit.description

        if not os.path.exists('subreddits.tsv'):
            with open('subreddits.tsv','w') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])
        else:
            with open('subreddits.tsv','a') as f:
                csvwriter = csv.writer(f, delimiter='\t')
                csvwriter.writerow([subreddit.display_name,title,desc])

        # Write header
        with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['title','author','date','is_video','id','num_downs','num_ups','upvote_ratio',
                               'num_comments','score','text','subreddit'])

        # Write body
        for submission in subreddit.new(limit=None):
            sub_title = submission.title
            sub_author = submission.author.name if submission.author is not None else -1
            sub_date = submission.created
            sub_is_vid = submission.is_video
            sub_id = submission.id
            sub_downvotes = submission.downs
            sub_upvotes = submission.ups
            sub_upvote_ratio = submission.upvote_ratio
            sub_num_comments = submission.num_comments
            sub_score = submission.score
            sub_text = submission.selftext.strip().replace('\t','').replace('\n','')
            sub_subreddit = submission.subreddit.display_name
            with open(os.path.join('praw_output','{}.tsv'.format(subreddit.display_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_title,sub_author,sub_date,sub_is_vid,sub_id,sub_downvotes,
                                   sub_upvotes,sub_upvote_ratio,sub_num_comments,sub_score,sub_text,sub_subreddit])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')
            
def get_submission_comments(reddit_instance,subreddit,submission_id):
    
    submission = Submission(reddit_instance,id=submission_id)
    
    try:
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        # Write header
        with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit)), 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['submission_id','author','text','date','id','controversiality','num_downs','num_ups',
                               'num_likes','score','subreddit'])

        # Write body
        for comment in all_comments:
            sub_id = comment._submission.id
            assert sub_id == submission_id
            author_name = comment.author.name if comment.author is not None else -1
            comment_body = comment.body.strip().replace('\t','').replace('\n','')
            date_created = comment.created
            comment_id = comment.id
            controversiality = comment.controversiality
            num_downs = comment.downs
            num_ups = comment.ups
            num_likes = comment.likes
            score = comment.score
            subreddit_name = comment.subreddit.display_name
            #print(subreddit_name,subreddit)
            assert subreddit_name == subreddit
            
            with open(os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(subreddit_name)), 'a', newline='\n') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t',
                                    quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow([sub_id,author_name,comment_body,date_created,comment_id,controversiality,
                                   num_downs,num_ups,num_likes,score,subreddit_name])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')

Read in climate-related subreddits:

In [5]:
SUBREDDITS = pd.read_csv('CLIMATE_SUBREDDITS.txt',sep='\t',header=0)
SUBREDDITS.stance.value_counts()

pro     42
anti    20
neut    10
Name: stance, dtype: int64

In [6]:
SUBREDDITS.loc[SUBREDDITS.stance=='neut']

Unnamed: 0,subreddit,stance
4,science,neut
19,energy,neut
52,climatecmv,neut
65,Republican,neut
66,republicans,neut
67,askaconservative,neut
68,Conservative,neut
69,conservatives,neut
70,TrueConservativism,neut
71,AskTrumpSupporters,neut


In [158]:
len(set(SUBREDDITS['subreddit'])),len(SUBREDDITS['subreddit'])

(72, 72)

In [159]:
SUBREDDITS_LIST = list(SUBREDDITS['subreddit'])

# Create PRAW reddit instance to get posts and comments

In [2]:
reddit = praw.Reddit(client_id='1sbu376RCBiWRw',
                     client_secret='NbqiHMPiKicBXvgfrID-xVNktZM',
                     user_agent='mac:cc_framing:v1 (by /u/emma_cc_research)')

## Get posts from all subreddits

In [84]:
for i in range(len(SUBREDDITS['subreddit'])-1,len(SUBREDDITS['subreddit'])):
    SUBREDDIT = SUBREDDITS_LIST[i]
    get_submissions(reddit,SUBREDDIT)

Getting submissions and comments from: AskTrumpSupporters


## Inspect output: tsv of subreddits and meta

In [86]:
pd.read_csv('subreddits.tsv',sep='\t',header=None).drop_duplicates(0,keep='first')

Unnamed: 0,0,1,2
0,climateskeptics,Climate Skeptics: Trying to see through the al...,"Seeing past hyperbole, alarmism and environmen..."
1,skeptic,skeptic,## [Click this link to Read the Rules](http://...
2,climatechange,A place for a rational discussion on a divisiv...,This is a place for the rational discussion of...
3,climate,Information about the world's climate,Real and accurate data about the Earth's clima...
4,science,Reddit Science,# [Submission Rules](https://www.reddit.com/r/...
...,...,...,...
63,republicans,Republicans - RNC - GOP: Grand Old Party,"Republican, RNC and GOP news, issues, gossip, ..."
64,askaconservative,Ask A Conservative: Ask Conservatives And Repu...,#[Ask a Conservative](/r/askaconservative)\n\n...
65,Conservative,Conservative,#####\n**[Join us on discord.](https://discord...
66,conservatives,conservatives,"Conservatism (from, conservare, ""to preserve"")..."


## Inspect output: tsv of one subreddit's posts

In [88]:
df = pd.read_csv('praw_output/350.tsv',sep='\t',header=0)

In [89]:
df.columns

Index(['title', 'author', 'date', 'is_video', 'id', 'num_downs', 'num_ups',
       'upvote_ratio', 'num_comments', 'score', 'text', 'subreddit'],
      dtype='object')

In [90]:
df.is_video.value_counts()

False    127
Name: is_video, dtype: int64

In [91]:
df.subreddit.value_counts()

350    127
Name: subreddit, dtype: int64

In [160]:
#df.head(15)

## Get comments for all posts with non-zero num comments

In [156]:
for subreddit_tsv in glob.glob('praw_output/posts/*.tsv'):
    if os.path.exists('praw_output/post_comments/{}_COMMENTS.tsv'.format(subreddit_tsv.split('/')[-1][:-4])):
        print('Already got comments for subreddit {}'.format(subreddit_tsv))
    else:
        subreddit_posts = pd.read_csv(subreddit_tsv,sep='\t',header=0)
        if len(subreddit_posts) > 0:
            subreddit = str(subreddit_posts.iloc[0]['subreddit'])
            posts_with_comments = subreddit_posts.loc[subreddit_posts.num_comments > 0]
            if len(posts_with_comments) > 0:
                print('Getting comments from posts in subreddit: {}'.format(subreddit))
                for ix,row in posts_with_comments.iterrows():
                    get_submission_comments(reddit,subreddit,row['id'])
            else:
                print('0 comments among all posts in subreddit: {}'.format(subreddit))
        else:
            print('Subreddit {} has no posts'.format(subreddit_tsv))

Already got comments for subreddit praw_output/posts/350.tsv
Already got comments for subreddit praw_output/posts/350ppm.tsv
Already got comments for subreddit praw_output/posts/askaconservative.tsv
Already got comments for subreddit praw_output/posts/AskTrumpSupporters.tsv
Already got comments for subreddit praw_output/posts/carboncapture.tsv
Already got comments for subreddit praw_output/posts/carbontax.tsv
Already got comments for subreddit praw_output/posts/ccfunding.tsv
Already got comments for subreddit praw_output/posts/climate.tsv
Already got comments for subreddit praw_output/posts/climate_activism.tsv
Already got comments for subreddit praw_output/posts/climate_discussion.tsv
Already got comments for subreddit praw_output/posts/climate_science.tsv
Already got comments for subreddit praw_output/posts/ClimateActionPlan.tsv
Already got comments for subreddit praw_output/posts/climatechange.tsv
Already got comments for subreddit praw_output/posts/ClimateChangeCancer.tsv
Already g

# Use PSAW

In [163]:
my_keywords = set(['climate change', 'global warming', 'fossil fuel', 'methane', 'carbon', 'co2'])

In [162]:
api = PushshiftAPI()

In [169]:
cc_results = api.search_submissions(q='climate change') # search_submissions(), search_comments()

In [173]:
list_cc_results = []
for res in cc_results:
    list_cc_results.append(res)



KeyboardInterrupt: 

In [171]:
len(list_cc_results)

TypeError: 'generator' object is not subscriptable

In [166]:
len(list(reddit.subreddit('all').stream.comments()))

KeyboardInterrupt: 