# Reddit API Query

In [66]:
import requests
import pandas as pd
import datetime as dt
import time
import pickle
import os
import json
import sys
pd.set_option('display.max_columns', None)

In [67]:
# load project config

terminal_call = ! git rev-parse --show-toplevel
repo_path=terminal_call[0]
project_config_path = os.path.join(repo_path,'project_config.json')

with open(project_config_path,'r') as fp: 
    project_config = json.load(fp)


project_config

{'project_module_relative_path': 'src'}

In [68]:
# Put the project's module directory (taken from project_config.json) on
# sys.path so the local `scraper` package can be imported by this notebook.

module_path = os.path.join(
    repo_path,
    project_config['project_module_relative_path'],
)
sys.path.append(module_path)

# Not used in the cells shown here; presumably kept for re-importing `scraper`
# after editing it during development.
from importlib import reload

import scraper
from scraper import tweet_scraper

__Pulling data from these Reddit threads:__

- [ukpolitics](https://www.reddit.com/r/ukpolitics/comments/6k6ykd/jeremycorbyn_monday_the_conservatives_spent_1/): Political news and debate concerning the United Kingdom.

- [brexit](https://www.reddit.com/r/brexit/comments/hk4yj9/morals_of_brexit/): A place to debate and discuss the UK's exit from the European Union, including the negotiations for the future UK-EU relationship. Please debate general UK topics (non-brexit related) elsewhere, such as r/ukpolitics.

### 1. Query Reddit

In [53]:
# Custom function to query Reddit repeatedly until enough posts are collected.
# This code is adapted from code that Hovanes shared in class.

def reddit_query(subreddit, n_samples, after, batch_size=1000, pause_seconds=3):
    """Collect submissions from a subreddit through the Pushshift search API.

    The API returns a limited number of posts per request, so the function
    pages through the results: every request asks for posts created after a
    cursor, and the cursor is moved to the newest ``created_utc`` seen in the
    batch that was just received.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (without the ``r/`` prefix).
    n_samples : int
        Stop requesting once at least this many posts have been collected.
        The result is not truncated, so it may hold up to one batch more.
    after : str or int
        Lower time bound forwarded as the API's ``after`` parameter for the
        first request.
    batch_size : int, optional
        Value sent as the API's ``size`` parameter (posts per request).
    pause_seconds : float, optional
        Seconds to wait between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        One row per collected post; empty if the API returned nothing.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    base_url = 'https://api.pushshift.io/reddit/submission/search' # base url for api
    list_posts = []
    cursor = after  # lower time bound for the next request

    while len(list_posts) < n_samples:
        params = {
            "subreddit": subreddit,
            "size": batch_size,
            "after": cursor,
            # Ask for oldest-first results so that moving `after` forward walks
            # through the subreddit without skipping posts.
            # NOTE(review): parameter names follow the Pushshift README —
            # confirm against the API version in use.
            "sort": "asc",
            "sort_type": "created_utc",
        }

        res = requests.get(base_url, params=params)
        res.raise_for_status()  # fail loudly instead of parsing an error page

        posts = res.json()['data']

        if len(posts) == 0:
            # Nothing newer than the cursor is available: stop instead of
            # looping forever on empty responses.
            break

        newest_seen = max(post["created_utc"] for post in posts)
        if newest_seen == cursor:
            # The cursor cannot move forward, so another request would only
            # return the same posts again.
            break

        list_posts.extend(posts)
        cursor = newest_seen

        if len(list_posts) < n_samples:
            time.sleep(pause_seconds)  # be polite to the API between requests

    return pd.DataFrame(list_posts)

### Pull data for reddit threads:

In [93]:
df = reddit_query(subreddit="ukpolitics", n_samples=10_000, after="2016-01-01")

In [87]:
# Interpret the created_utc values as seconds since the epoch and store them as timestamps.
df['created_utc'] = df['created_utc'].pipe(pd.to_datetime, unit='s')

## Pickle for later use 

In [88]:
# Save the DataFrame to disk. The `with` block makes sure the file handle is
# flushed and closed, which the bare open(...) call did not guarantee.
with open("../data/ukpolitics.pkl", "wb") as pickle_file:
    pickle.dump(df, pickle_file)

## Clean text

In [89]:
# Clean the 'selftext' column (presumably the body text of each submission,
# not comments) with the project's tweet_scraper helper; the returned value
# replaces `df`.
# NOTE(review): the cleaning steps live in scraper.tweet_scraper.clean_tweets
# and are not visible from this notebook — confirm what it changes and returns.
df = tweet_scraper.clean_tweets(df, 'selftext')

In [90]:
# Export the DataFrame as CSV, leaving the row index out of the file.
df.to_csv("../data/ukpolitics.csv", index=False)