In [10]:
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from praw.models import Submission
from psaw import PushshiftAPI
import csv
import os
from urllib.error import HTTPError
import glob
import requests
import json
from json import JSONDecodeError
import datetime
import pickle

In [2]:
def get_submissions(reddit_instance,subreddit_str):
    """Dump the newest submissions of one subreddit to a TSV file.

    Side effects (all paths relative to the current working directory):
      * creates the ``praw_output`` directory if needed;
      * appends one ``[display_name, title, description]`` row to
        ``subreddits.tsv`` (so repeated calls add repeated rows);
      * overwrites ``praw_output/<display_name>.tsv`` with a header row followed
        by one row per submission yielded by ``subreddit.new(limit=None)``.

    Args:
        reddit_instance: object exposing ``.subreddit(name)``; in this notebook
            a ``praw.Reddit`` instance.
        subreddit_str: name of the subreddit to crawl.

    Raises:
        HTTPError: re-raised for any status other than 403, so that a failed
            crawl is not mistaken for a complete one. A 403 is reported as a
            private subreddit and the function returns normally.

    NOTE(review): recent PRAW versions report HTTP failures through prawcore
    exceptions, so this urllib ``HTTPError`` handler may never fire -- confirm
    against the installed PRAW version.
    """
    os.makedirs('praw_output', exist_ok=True)

    subreddit = reddit_instance.subreddit(subreddit_str)
    print('Getting submissions and comments from: {}'.format(subreddit.display_name))
    try:
        title = subreddit.title
        desc = subreddit.description

        # Append mode creates the file when it does not exist yet, so no
        # separate "first write" branch is needed.
        with open('subreddits.tsv','a') as f:
            csv.writer(f, delimiter='\t').writerow([subreddit.display_name,title,desc])

        out_path = os.path.join('praw_output','{}.tsv'.format(subreddit.display_name))
        # Open the output once for the header and every row instead of
        # reopening the file for each submission.
        with open(out_path, 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                   quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['title','author','date','is_video','id','num_downs','num_ups','upvote_ratio',
                                'num_comments','score','text','subreddit'])

            for submission in subreddit.new(limit=None):
                # A missing author object is recorded as -1.
                sub_author = submission.author.name if submission.author is not None else -1
                # Tabs and newlines are removed so the body stays on one TSV line.
                sub_text = submission.selftext.strip().replace('\t','').replace('\n','')
                csvwriter.writerow([submission.title,sub_author,submission.created,submission.is_video,
                                    submission.id,submission.downs,submission.ups,submission.upvote_ratio,
                                    submission.num_comments,submission.score,sub_text,
                                    submission.subreddit.display_name])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')
        else:
            # Anything else is unexpected; do not hide it.
            raise
            
def get_submission_comments(reddit_instance,subreddit,submission_id):
    """Append all comments of one submission to the subreddit's comments TSV.

    Output goes to ``praw_output/post_comments/<subreddit>_COMMENTS.tsv``. The
    header row is written only when that file is missing or empty; comments of
    later submissions are appended. (Previously every call reopened the file in
    'w' mode for the header, which discarded the comments of all submissions
    processed earlier for the same subreddit.)

    Because rows are appended, calling this twice for the same submission
    writes its comments twice.

    Args:
        reddit_instance: passed through to ``Submission``; in this notebook a
            ``praw.Reddit`` instance.
        subreddit: expected display name of the subreddit; asserted against
            each comment's own subreddit.
        submission_id: id of the submission whose comments are fetched.

    Raises:
        AssertionError: if a comment belongs to a different submission or
            subreddit than the one requested.
        HTTPError: re-raised for any status other than 403.

    NOTE(review): recent PRAW versions report HTTP failures through prawcore
    exceptions, so this urllib ``HTTPError`` handler may never fire -- confirm.
    """
    submission = Submission(reddit_instance,id=submission_id)

    out_dir = os.path.join('praw_output','post_comments')
    out_path = os.path.join(out_dir,'{}_COMMENTS.tsv'.format(subreddit))

    try:
        # limit=0 drops the "load more comments" placeholders instead of expanding them.
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        os.makedirs(out_dir, exist_ok=True)
        needs_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0

        with open(out_path, 'a', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t',
                                   quotechar='|', quoting=csv.QUOTE_MINIMAL)
            if needs_header:
                csvwriter.writerow(['submission_id','author','text','date','id','controversiality','num_downs','num_ups',
                                    'num_likes','score','subreddit'])

            for comment in all_comments:
                sub_id = comment._submission.id
                assert sub_id == submission_id
                # A missing author object is recorded as -1.
                author_name = comment.author.name if comment.author is not None else -1
                # Tabs and newlines are removed so the body stays on one TSV line.
                comment_body = comment.body.strip().replace('\t','').replace('\n','')
                subreddit_name = comment.subreddit.display_name
                assert subreddit_name == subreddit

                csvwriter.writerow([sub_id,author_name,comment_body,comment.created,comment.id,
                                    comment.controversiality,comment.downs,comment.ups,comment.likes,
                                    comment.score,subreddit_name])
    except HTTPError as e:
        if e.code == 403:
            print('Forbidden: private subreddit.')
        else:
            # Anything else is unexpected; do not hide it.
            raise

In [20]:
# Long keyword list: one keyword per line of ../keywords_long.txt.
with open('../keywords_long.txt','r') as f:
    KEYWORDS_LONG = f.read().splitlines()
KEYWORDS_SHORT = set(["climate change","global warming","carbon","co2","methane",
                  "green","environment","fossil fuel"])
# Keywords used by the yearly Pushshift backfill loop further down. The loop
# references `my_keywords`, so the definition must be live for the notebook to
# survive Restart Kernel -> Run All.
my_keywords = set(['climate change', 'global warming', 'fossil fuel', 'methane', 'carbon', 'co2'])

Read in climate-related subreddits:

In [5]:
# Load the tab-separated list of subreddits (first line is the header) and
# show how many subreddits carry each `stance` label.
SUBREDDITS = pd.read_csv('CLIMATE_SUBREDDITS.txt',sep='\t',header=0)
SUBREDDITS.stance.value_counts()

pro     42
anti    20
neut    10
Name: stance, dtype: int64

In [6]:
# List the subreddits whose stance is labelled 'neut'.
SUBREDDITS.loc[SUBREDDITS.stance=='neut']

Unnamed: 0,subreddit,stance
4,science,neut
19,energy,neut
52,climatecmv,neut
65,Republican,neut
66,republicans,neut
67,askaconservative,neut
68,Conservative,neut
69,conservatives,neut
70,TrueConservativism,neut
71,AskTrumpSupporters,neut


In [158]:
# Sanity check: (number of distinct subreddit names, number of rows).
# Equal values mean no subreddit is listed twice.
len(set(SUBREDDITS['subreddit'])),len(SUBREDDITS['subreddit'])

(72, 72)

In [159]:
# Plain Python list of subreddit names, in file order; indexed by the crawl loop below.
SUBREDDITS_LIST = list(SUBREDDITS['subreddit'])

# Create a PRAW Reddit instance to fetch posts and comments

In [11]:
# Credentials are read from the environment so they never end up in the
# notebook or its version history. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError here rather than failing later with an opaque authentication error.
# NOTE: any secret that was previously committed in this cell should be
# revoked and regenerated in the Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='mac:cc_framing:v1 (by /u/emma_cc_research)')

In [14]:
# Exploration: does a Redditor object expose anything that identifies a bot?
# vars() lists only the attributes currently stored on the instance.
vars(reddit.redditor("DragonFireDon"))

{'_reddit': <praw.reddit.Reddit at 0x1177ea240>,
 '_fetched': False,
 '_listing_use_sort': True,
 'name': 'DragonFireDon'}

In [16]:
# link_karma is not among the instance attributes listed by vars()/__dict__ on a
# freshly built Redditor, yet it resolves here -- presumably PRAW fetches the
# profile on first attribute access; confirm against the PRAW docs.
reddit.redditor("AutoModerator").link_karma

13230

In [18]:
# Exploration: attributes stored on a newly constructed Redditor object.
reddit.redditor("AutoModerator").__dict__

{'_reddit': <praw.reddit.Reddit at 0x1177ea240>,
 '_fetched': False,
 '_listing_use_sort': True,
 'name': 'AutoModerator'}

In [13]:
# Build a Redditor object for the account named in the user_agent string.
reddit.redditor("emma_cc_research")

Redditor(name='emma_cc_research')

## Get posts from all subreddits

In [84]:
# range(len - 1, len) yields only the last index, so as written this cell
# fetches posts for just the final entry of SUBREDDITS_LIST.
# NOTE(review): presumably the start index was moved forward to resume an
# interrupted crawl; use range(len(SUBREDDITS_LIST)) to cover every subreddit
# -- confirm before a full re-run.
for i in range(len(SUBREDDITS['subreddit'])-1,len(SUBREDDITS['subreddit'])):
    SUBREDDIT = SUBREDDITS_LIST[i]
    get_submissions(reddit,SUBREDDIT)

Getting submissions and comments from: AskTrumpSupporters


## Inspect output: TSV of subreddits and their metadata

In [86]:
# subreddits.tsv is written without a header row, so columns are numbered:
# 0 = subreddit name, 1 = title, 2 = description. get_submissions appends a row
# on every call, so repeated crawls leave duplicates; keep the first per name.
pd.read_csv('subreddits.tsv',sep='\t',header=None).drop_duplicates(0,keep='first')

Unnamed: 0,0,1,2
0,climateskeptics,Climate Skeptics: Trying to see through the al...,"Seeing past hyperbole, alarmism and environmen..."
1,skeptic,skeptic,## [Click this link to Read the Rules](http://...
2,climatechange,A place for a rational discussion on a divisiv...,This is a place for the rational discussion of...
3,climate,Information about the world's climate,Real and accurate data about the Earth's clima...
4,science,Reddit Science,# [Submission Rules](https://www.reddit.com/r/...
...,...,...,...
63,republicans,Republicans - RNC - GOP: Grand Old Party,"Republican, RNC and GOP news, issues, gossip, ..."
64,askaconservative,Ask A Conservative: Ask Conservatives And Repu...,#[Ask a Conservative](/r/askaconservative)\n\n...
65,Conservative,Conservative,#####\n**[Join us on discord.](https://discord...
66,conservatives,conservatives,"Conservatism (from, conservare, ""to preserve"")..."


## Inspect output: tsv of one subreddit's posts

In [88]:
# Load the posts crawled for one subreddit. get_submissions writes these files
# with quotechar='|', so the reader must use the same quote character;
# otherwise '|'-quoted fields keep their quotes and any text containing '"' is
# mis-parsed.
df = pd.read_csv('praw_output/350.tsv',sep='\t',header=0,quotechar='|')

In [89]:
# Column names should match the header row written by get_submissions.
df.columns

Index(['title', 'author', 'date', 'is_video', 'id', 'num_downs', 'num_ups',
       'upvote_ratio', 'num_comments', 'score', 'text', 'subreddit'],
      dtype='object')

In [90]:
# Distribution of the is_video flag across the loaded posts.
df.is_video.value_counts()

False    127
Name: is_video, dtype: int64

In [91]:
# Every row should carry the same subreddit name as the file it was read from.
df.subreddit.value_counts()

350    127
Name: subreddit, dtype: int64

In [160]:
#df.head(15)

## Get comments for all posts with a non-zero number of comments

In [156]:
# For every per-subreddit posts file, fetch the comments of each post that
# reports at least one comment. A subreddit is skipped entirely when its
# comments file already exists.
for subreddit_tsv in glob.glob('praw_output/posts/*.tsv'):
    # Derive the file stem with os.path so this also works when glob returns
    # paths with a different separator (e.g. backslashes on Windows).
    file_stem = os.path.splitext(os.path.basename(subreddit_tsv))[0]
    comments_tsv = os.path.join('praw_output','post_comments','{}_COMMENTS.tsv'.format(file_stem))
    if os.path.exists(comments_tsv):
        print('Already got comments for subreddit {}'.format(subreddit_tsv))
        continue

    # The posts files are written with quotechar='|'; read them the same way.
    subreddit_posts = pd.read_csv(subreddit_tsv,sep='\t',header=0,quotechar='|')
    if len(subreddit_posts) == 0:
        print('Subreddit {} has no posts'.format(subreddit_tsv))
        continue

    # str() because an all-digit subreddit name (e.g. "350") is parsed as a number.
    subreddit = str(subreddit_posts.iloc[0]['subreddit'])
    posts_with_comments = subreddit_posts.loc[subreddit_posts.num_comments > 0]
    if len(posts_with_comments) == 0:
        print('0 comments among all posts in subreddit: {}'.format(subreddit))
        continue

    print('Getting comments from posts in subreddit: {}'.format(subreddit))
    for post_id in posts_with_comments['id']:
        get_submission_comments(reddit,subreddit,post_id)

Already got comments for subreddit praw_output/posts/350.tsv
Already got comments for subreddit praw_output/posts/350ppm.tsv
Already got comments for subreddit praw_output/posts/askaconservative.tsv
Already got comments for subreddit praw_output/posts/AskTrumpSupporters.tsv
Already got comments for subreddit praw_output/posts/carboncapture.tsv
Already got comments for subreddit praw_output/posts/carbontax.tsv
Already got comments for subreddit praw_output/posts/ccfunding.tsv
Already got comments for subreddit praw_output/posts/climate.tsv
Already got comments for subreddit praw_output/posts/climate_activism.tsv
Already got comments for subreddit praw_output/posts/climate_discussion.tsv
Already got comments for subreddit praw_output/posts/climate_science.tsv
Already got comments for subreddit praw_output/posts/ClimateActionPlan.tsv
Already got comments for subreddit praw_output/posts/climatechange.tsv
Already got comments for subreddit praw_output/posts/ClimateChangeCancer.tsv
Already g

# Use Pushshift API

In [5]:
# Two-day test window. The dates are parsed as naive datetimes, so the epoch
# seconds below are computed in the machine's local timezone.
after_date = datetime.datetime.strptime("07-20-2020", "%m-%d-%Y")
before_date = datetime.datetime.strptime("07-22-2020", "%m-%d-%Y")
after_timestamp = int(after_date.timestamp())
before_timestamp = int(before_date.timestamp())

In [6]:
# Single-page smoke test of the Pushshift helper.
# NOTE: getPushshiftData is defined in a cell further down this notebook; that
# cell must be run first (or moved above this one) for Run All to succeed.
test_data = getPushshiftData('climate change',after_timestamp,before_timestamp,"submission")

In [8]:
# Number of records in one page. The request asks for size=1000, so a much
# smaller round number here suggests a server-side page cap -- TODO confirm.
len(test_data)

100

In [9]:
# Inspect the raw fields of one returned record.
test_data[0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'DragonFireDon',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_mvdti',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1595219379,
 'domain': 'weforum.org',
 'full_link': 'https://www.reddit.com/r/climate/comments/huewrd/the_relationship_between_climate_change_and/',
 'gildings': {},
 'id': 'huewrd',
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': False,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whiteli

In [4]:
def getPushshiftData(query, after, before, datatype):
    """Fetch one page of Pushshift search results.

    Args:
        query: search text. Matched against the ``title`` field when
            ``datatype`` is ``'submission'``, otherwise sent as ``q``.
        after: lower bound passed as the ``after`` parameter (the callers in
            this notebook pass epoch seconds).
        before: upper bound passed as the ``before`` parameter.
        datatype: endpoint name placed in the URL path, e.g. ``'submission'``
            or ``'comment'``.

    Returns:
        The value stored under ``'data'`` in the decoded JSON response.

    Raises:
        json.JSONDecodeError: if the response body is not valid JSON.
        KeyError: if the decoded response has no ``'data'`` key.
    """
    query_prefix = 'title' if datatype == 'submission' else 'q'
    url = 'https://api.pushshift.io/reddit/search/{}/'.format(datatype)
    # Let requests build and encode the query string; concatenating the raw
    # query into the URL breaks as soon as it contains '&', '#' or '='.
    params = {
        query_prefix: str(query),
        'size': 1000,
        'after': str(after),
        'before': str(before),
    }
    # NOTE(review): no timeout is set, so a stalled connection blocks
    # indefinitely; adding one would introduce a new exception type for callers.
    r = requests.get(url, params=params)
    data = json.loads(r.text)
    return data['data']


def collectSubData(subm,subs_dict):
    """Store a flattened record of one submission in ``subs_dict``.

    Args:
        subm: mapping describing a submission. Only ``'id'`` is required; every
            other field is optional.
        subs_dict: dict updated in place; the record is stored under the
            submission id, replacing any earlier record with the same id.

    Missing optional fields default to ``None``, except ``flair`` (the string
    ``"NaN"``) and ``text`` (the empty string).

    Raises:
        KeyError: if ``subm`` has no ``'id'``.
    """
    sub_id = subm['id']

    # created_utc is converted with fromtimestamp(), i.e. to a naive datetime
    # in the machine's local timezone.
    if 'created_utc' in subm:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    else:
        created = None

    # Tabs and newlines are removed so the text stays on one line when exported.
    if 'selftext' in subm:
        text = subm['selftext'].strip().replace('\t','').replace('\n','')
    else:
        text = ""

    subs_dict[sub_id] = {
        'id': sub_id,
        'title': subm.get('title'),
        'url': subm.get('url'),
        'author': subm.get('author'),
        'score': subm.get('score'),
        'date': created,
        'num_comments': subm.get('num_comments'),
        'permalink': subm.get('permalink'),
        'flair': subm.get('link_flair_text', "NaN"),
        'is_video': subm.get('is_video'),
        'upvote_ratio': subm.get('upvote_ratio'),
        'text': text,
        'subreddit': subm.get('subreddit'),
    }
    
    
def collectCommData(subm,subs_dict):
    """Store a flattened record of one comment in ``subs_dict``.

    Args:
        subm: mapping describing a comment. Only ``'id'`` is required; every
            other field is optional.
        subs_dict: dict updated in place; the record is stored under the
            comment id, replacing any earlier record with the same id.

    Missing optional fields default to ``None``, except ``text`` (the empty
    string).

    Raises:
        KeyError: if ``subm`` has no ``'id'``.
    """
    comment_id = subm['id']

    # created_utc is converted with fromtimestamp(), i.e. to a naive datetime
    # in the machine's local timezone.
    if 'created_utc' in subm:
        created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    else:
        created = None

    # Tabs and newlines are removed so the text stays on one line when exported.
    if 'body' in subm:
        text = subm['body'].strip().replace('\t','').replace('\n','')
    else:
        text = ""

    subs_dict[comment_id] = {
        'id': comment_id,
        'author': subm.get('author'),
        'score': subm.get('score'),
        'date': created,
        'permalink': subm.get('permalink'),
        'text': text,
        'subreddit': subm.get('subreddit'),
    }

In [239]:
def pushshift_wrapper(query, after_str, before_str, datatype, max_retries=5):
    """Page through Pushshift results for one query and pickle them.

    Results are paged forward by restarting each request from the
    ``created_utc`` of the last record of the previous page, until an empty
    page is returned.

    Output (relative to the current working directory):
      * ``pushshift_output/<posts|post_comments>/<after>_to_<before>/<query>.pkl``
        -- DataFrame with one row per collected record (deduplicated by id);
      * ``pushshift_output/failed_requests/<after>_<before>/<query>.pkl``
        -- pickled list of ``(query, after_ts, before_ts, datatype)`` tuples,
        one per request whose response could not be decoded.

    Args:
        query: search text, also used as the output file name.
        after_str: start date formatted as ``%m-%d-%Y``.
        before_str: end date formatted as ``%m-%d-%Y``.
        datatype: ``'submission'`` for posts; any other value is treated as
            comments.
        max_retries: how many times in a row a page request may fail to decode
            before paging stops and the partial result is saved. Previously a
            failing page was retried without bound, re-processing the same
            page on every iteration.

    If the very first request fails, only the failed-requests pickle is written.
    """
    failed_requests = []

    subCount = 0
    subStats = {}
    # Naive datetimes: the epoch seconds are computed in the local timezone.
    before_date = datetime.datetime.strptime(before_str, "%m-%d-%Y")
    after_date = datetime.datetime.strptime(after_str, "%m-%d-%Y")
    before_timestamp = int(datetime.datetime.timestamp(before_date))
    after_timestamp = int(datetime.datetime.timestamp(after_date))
    print("Getting all submissions with query '{}' from {} to {}".format(query,
                                                                         after_str,before_str))

    def save_failed_requests():
        # Persist the failure log so failed windows can be re-requested later.
        failed_reqs_out_dir = os.path.join('pushshift_output','failed_requests',
                                           '{}_{}'.format(after_str,before_str))
        os.makedirs(failed_reqs_out_dir, exist_ok=True)
        with open(os.path.join(failed_reqs_out_dir,'{}.pkl'.format(query)),'wb') as f:
            pickle.dump(failed_requests, f)

    try:
        data = getPushshiftData(query, after_timestamp, before_timestamp, datatype)
    except JSONDecodeError:
        failed_requests.append((query,after_timestamp,before_timestamp,datatype))
        print("First request failed")
        save_failed_requests()
        return

    collect = collectSubData if datatype == "submission" else collectCommData

    # Runs until all records from the 'after' date up to the 'before' date
    # have been gathered (or paging is abandoned after repeated failures).
    while len(data) > 0:
        for submission in data:
            collect(submission,subStats)
            subCount+=1
        # Next page starts at the created date of the last record seen.
        after_timestamp = data[-1]['created_utc']

        # Reset first so the finished page is never processed a second time.
        data = []
        for _attempt in range(max_retries):
            try:
                data = getPushshiftData(query, after_timestamp, before_timestamp, datatype)
                break
            except JSONDecodeError:
                failed_requests.append((query,after_timestamp,before_timestamp,datatype))
        else:
            print("Giving up on query '{}' after {} consecutive failed requests".format(query,max_retries))

    print('Num submissions:',subCount,len(subStats))

    interim_df = pd.DataFrame(list(subStats.values()))

    datatype_prefix = 'posts' if datatype == 'submission' else 'post_comments'
    out_dir = os.path.join('pushshift_output',datatype_prefix,'{}_to_{}'.format(after_str,before_str))
    # makedirs also creates the missing parent directories on a first run.
    os.makedirs(out_dir, exist_ok=True)
    interim_df.to_pickle(os.path.join(out_dir,'{}.pkl'.format(query)))
    print('Saved query submissions to {}!'.format(os.path.join(out_dir,'{}.pkl'.format(query))))

    save_failed_requests()

In [268]:
# Backfill check: for each year 2010-2014 and each keyword, look for the
# expected yearly pickle and report the ones that are missing.
# Posts: a missing pickle is fetched immediately.
for start_year in range(2010,2015,1):
    year_dir = '1-1-{}_to_12-31-{}'.format(start_year,start_year)
    for keyword in my_keywords:
        posts_pkl = os.path.join('pushshift_output','posts',year_dir,'{}.pkl'.format(keyword))
        if not os.path.exists(posts_pkl):
            print("Missing {}, {}".format(keyword,start_year))
            pushshift_wrapper(keyword,'1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'submission')
print('************')
# Comments: missing pickles are only reported; the fetch is disabled.
for start_year in range(2010,2015,1):
    year_dir = '1-1-{}_to_12-31-{}'.format(start_year,start_year)
    for keyword in my_keywords:
        comments_pkl = os.path.join('pushshift_output','post_comments',year_dir,'{}.pkl'.format(keyword))
        if not os.path.exists(comments_pkl):
            print("Missing {}, {}".format(keyword,start_year))
            #pushshift_wrapper(keyword,'1-1-{}'.format(start_year),'12-31-{}'.format(start_year),'comment')

************


In [216]:
# Inspect one saved result: 2019 posts matching 'methane'.
# Only unpickle files produced by this notebook -- loading a pickle from an
# untrusted source can execute arbitrary code.
pd.read_pickle('pushshift_output/posts/1-1-2019_to_12-31-2019/methane.pkl')

Unnamed: 0,id,title,url,author,score,date,num_comments,permalink,flair,is_video,upvote_ratio,text,subreddit
0,ablzoh,Mars’ disappearing methane proves a puzzle for...,https://icdn2.digitaltrends.com/image/mars-sur...,Raisaood,1,2019-01-01 16:05:55,1,/r/science/comments/ablzoh/mars_disappearing_m...,Physics,False,,,science
1,abnjro,Can we use methane as long term energy storage?,https://www.reddit.com/r/askscience/comments/a...,Epinephrine666,1,2019-01-01 18:55:09,1,/r/askscience/comments/abnjro/can_we_use_metha...,,False,,[removed],askscience
2,absvn9,Has Mars' methane gone missing?,https://earthsky.org/space/esa-exomars-trace-g...,EcoInternetNewsfeed,1,2019-01-02 07:13:59,0,/r/EcoInternet/comments/absvn9/has_mars_methan...,,False,,,EcoInternet
3,abszhb,TIL about the Aliso Canyon gas leak - a massiv...,https://en.wikipedia.org/wiki/Aliso_Canyon_gas...,JustWentFullBlown,1,2019-01-02 07:30:40,3,/r/todayilearned/comments/abszhb/til_about_the...,,False,,,todayilearned
4,abto43,[Research] - Modeling the Distribution and Typ...,https://ntrs.nasa.gov/search.jsp?R=20180008814,AutoNewsAdmin,1,2019-01-02 09:04:29,0,/r/NASAauto/comments/abto43/research_modeling_...,,False,,,NASAauto
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5894,ehb2r3,[World] - Slippery salvation: Could seaweed as...,https://www.japantimes.co.jp/news/2019/12/30/w...,AutoNewspaperAdmin,1,2019-12-29 15:33:32,0,/r/AutoNewspaper/comments/ehb2r3/world_slipper...,,False,,,AutoNewspaper
5895,ehclof,This phenomenon happens when the ice freezes o...,https://i.redd.it/x6ugu6o7fh741.jpg,the_karma_llama,1,2019-12-29 17:26:17,0,/r/DidntKnowThatExisted/comments/ehclof/this_p...,,False,,,DidntKnowThatExisted
5896,ehgbdr,An alternative to steam methane reforming - Kv...,https://en.m.wikipedia.org/wiki/Kv%C3%A6rner_p...,cheaptrainride,1,2019-12-29 22:21:45,9,/r/energy/comments/ehgbdr/an_alternative_to_st...,,False,,,energy
5897,ehp29o,Replacing one gas with another helps efficient...,https://phys.org/news/2019-12-gas-efficiently-...,I_did_dit,1,2019-12-30 12:32:06,0,/r/TopScience/comments/ehp29o/replacing_one_ga...,,False,,,TopScience


In [104]:
# NOTE(review): in this notebook subStats is only assigned inside
# pushshift_wrapper, so this cell presumably relies on a global left over from
# an earlier interactive session and will fail on a fresh kernel -- confirm.
subStats['hrgntm'] # keys are post ids

{'id': 'hrgntm',
 'title': 'Burger King addresses climate change by changing cows’ diets, reducing cow farts',
 'url': 'https://www.kcbd.com/2020/07/14/burger-king-addresses-climate-change-by-changing-cows-diets/',
 'author': 'SexandTrees',
 'score': 1,
 'date': datetime.datetime(2020, 7, 15, 0, 0, 1),
 'num_comments': 0,
 'permalink': '/r/TheFightThatMatters/comments/hrgntm/burger_king_addresses_climate_change_by_changing/',
 'flair': 'NaN',
 'is_video': False,
 'upvote_ratio': 1.0,
 'text': '',
 'subreddit': 'TheFightThatMatters'}