In [None]:
# default_exp collect_data
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Collecting the data

> This notebook covers collecting the Reddit data.

## Variables

In [None]:
# Target subreddit for data collection.
SUBREDDIT = 'politics'

## Imports

In [None]:
# export
import datetime as dt
import json
import os
from itertools import islice

import pandas as pd
from psaw import PushshiftAPI
from tqdm import tqdm

## Run query

### Lexeme-based

In [None]:
# export
def query_lexeme(lex, year):
    """Search Pushshift for comments matching `lex` that were posted during `year`.

    Args:
        lex: Search term, passed to the API as the `q` parameter.
        year: Calendar year to search; anything `int()` accepts (2006 or '2006').

    Returns:
        The value returned by `PushshiftAPI.search_comments`, which callers
        consume as an iterator.
    """
    year = int(year)
    # Naive datetimes: `.timestamp()` interprets them in the local time zone.
    window_start = int(dt.datetime(year, 1, 1).timestamp())
    # End the window at the first instant of the following year; stopping at
    # Dec 31 00:00 would leave out everything posted on the last day.
    window_end = int(dt.datetime(year + 1, 1, 1).timestamp())
    api = PushshiftAPI()
    return api.search_comments(q=lex, after=window_start, before=window_end)

In [None]:
# Smoke test: run a real query through psaw and fetch the first matching comment.
gen = query_lexeme('Anglo-Saxon', 2006)
next(gen)

### Subreddit-based

In [None]:
# export
def query_subr(subreddit):
    """Start a comment search restricted to `subreddit`, with no date bounds.

    Args:
        subreddit: Subreddit name passed to the API as the `subreddit` parameter.

    Returns:
        The value returned by `PushshiftAPI.search_comments`.
    """
    return PushshiftAPI().search_comments(subreddit=subreddit)

In [None]:
# export
def query_subr_year(subreddit, year):
    """Search Pushshift for comments in `subreddit` that were posted during `year`.

    Args:
        subreddit: Subreddit name passed to the API as the `subreddit` parameter.
        year: Calendar year to search; anything `int()` accepts (2007 or '2007').

    Returns:
        The value returned by `PushshiftAPI.search_comments`.
    """
    year = int(year)
    # Naive datetimes: `.timestamp()` interprets them in the local time zone.
    window_start = int(dt.datetime(year, 1, 1).timestamp())
    # End the window at the first instant of the following year; stopping at
    # Dec 31 00:00 would leave out everything posted on the last day.
    window_end = int(dt.datetime(year + 1, 1, 1).timestamp())
    api = PushshiftAPI()
    return api.search_comments(
        subreddit=subreddit,
        after=window_start,
        before=window_end,
    )

## Collect results

In [None]:
# export
def get_results(gen, limit):
    """Collect up to `limit` items from `gen` into a list, showing a progress bar.

    Args:
        gen: Iterable of results, e.g. what the query functions return.
        limit: Maximum number of items to collect. `None` collects until `gen`
            is exhausted; zero or a negative value collects nothing.

    Returns:
        A list with at most `limit` items, in iteration order.
    """
    if limit is not None and limit <= 0:
        # Nothing requested: do not pull anything from `gen`.
        return []
    # islice stops right after the `limit`-th item, so no extra result is fetched.
    return list(tqdm(islice(gen, limit), total=limit))

In [None]:
# Build the query in this cell so it also runs on a fresh kernel
# (`query_gen` was not defined anywhere at notebook scope).
query_gen = query_subr_year(SUBREDDIT, 2007)
results = get_results(query_gen, 100)

In [None]:
# The collection step should have gathered exactly the 100 items requested.
assert len(results) == 100

## Convert to DF

In [None]:
# export
def conv_results_to_df(results):
    """Build a DataFrame with one row per result, taken from each result's `d_`.

    Args:
        results: Iterable of objects exposing a `d_` attribute.

    Returns:
        A `pandas.DataFrame` constructed from the list of `d_` values.
    """
    records = []
    for item in results:
        records.append(item.d_)
    return pd.DataFrame(records)

In [None]:
# One row per collected result.
comments = conv_results_to_df(results)

In [None]:
# 100 rows, one per collected comment.
# NOTE(review): the 23-column count presumably reflects the fields the API
# returned when this was written — confirm it is stable across years.
assert comments.shape == (100, 23)

In [None]:
# Display the `body` column of the collected comments.
comments['body']

## Write out comments

In [None]:
# export
def comm_subr_to_csv(comments, subreddit='NaN', limit='NaN', year='NaN'):
    """Write `comments` to `data/subreddit/<subreddit>/<limit>_<year>.csv`.

    The path is relative to the current working directory. The output
    directory is created when it is missing, and the index is not written.

    Args:
        comments: DataFrame (anything with a compatible `to_csv`) to write.
        subreddit: Used as the name of the output sub-directory.
        limit: First part of the file name.
        year: Second part of the file name.
    """
    dir_out = os.path.join('data', 'subreddit', str(subreddit))
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(dir_out, exist_ok=True)
    comments.to_csv(os.path.join(dir_out, f'{limit}_{year}.csv'), index=False)

In [None]:
# Write the example frame to data/subreddit/<SUBREDDIT>/100_2007.csv.
comm_subr_to_csv(comments, subreddit=SUBREDDIT, limit=100, year='2007')

## Pipeline function

In [None]:
# export
def get_subr_year(subreddit, year, limit):
    """Run the whole pipeline: query, collect, convert and write to CSV.

    Args:
        subreddit: Subreddit to query; also names the output directory.
        year: Year to restrict the query to; also part of the output file name.
        limit: Maximum number of comments to collect; also part of the file name.
    """
    # query_subr takes only a subreddit; the year-bounded variant is needed here.
    query_gen = query_subr_year(subreddit, year)
    results = get_results(query_gen, limit)
    comments = conv_results_to_df(results)
    comm_subr_to_csv(comments, subreddit, limit, year)

## Export notebooks

In [None]:
# hide
# nbdev export step. Presumably writes the cells marked `# export` to the
# module named by `# default_exp` — confirm against the installed nbdev version.
from nbdev.export import notebook2script
notebook2script()