In [None]:
# default_exp collect_data
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Collecting the data

> This notebook covers collecting Reddit comments from the Pushshift API (via `psaw`), either by lexeme or by subreddit, and writing them to CSV.

## Imports

In [None]:
# export
import datetime as dt
import json
import os
from itertools import islice

import pandas as pd
from psaw import PushshiftAPI
from tqdm import tqdm

## Run query

### Lexeme-based

In [None]:
# export
def query_lexeme(lex, year):
    """Start a Pushshift comment search for a lexeme within one calendar year.

    Parameters
    ----------
    lex : str
        Search term, passed to Pushshift as the ``q`` parameter.
    year : int or str
        Calendar year to search; coerced with ``int()`` so that both ``2006``
        and ``'2006'`` are accepted (same convention as ``query_subr``).

    Returns
    -------
    Whatever ``PushshiftAPI.search_comments`` returns; the notebook consumes
    it lazily with ``next()`` / iteration.
    """
    year = int(year)
    # Bounds are expressed as UTC epoch seconds so they line up with the
    # comments' ``created_utc`` field and do not depend on the timezone of the
    # machine running the query.
    after = int(dt.datetime(year, 1, 1, tzinfo=dt.timezone.utc).timestamp())
    # Upper bound is the first instant of the *next* year; using Dec 31 00:00
    # would silently drop every comment posted on the last day of the year.
    before = int(dt.datetime(year + 1, 1, 1, tzinfo=dt.timezone.utc).timestamp())
    api = PushshiftAPI()
    return api.search_comments(q=lex, after=after, before=before)

In [None]:
# Smoke test: run a lexeme query for 2006 and display its first comment.
# `next()` consumes one item from `gen`.
gen = query_lexeme('Anglo-Saxon', 2006)
next(gen)

comment(author='Jean-Naimard', author_created_utc=1163570322, author_flair_css_class=None, author_flair_text=None, author_fullname='t2_qx69', body='Western canadians have always been, are and will always be utterly clueless as to the nature of Québec.\r\n\r\nHaving toiled for centuries to suppress anything that’s french, the western rednecks have always shown their utter dedication to the most basest, vile, stupid, primitive human instincts possible.\r\n\r\nWhat the west really want is to join their moronic brethen south-of-the-border and become the 51st State, so they can have "less" taxes, the "right" to carry guns, and generally be clueless ignorant rednecks, as they always have been.\r\n\r\nThere is no "western" nation. It’s the same anglo-saxon soup that you will find in Ontario, the Maritimes or in the US. It’s just the seasoning that changes; some cod sprinkled here, some Screech there, some crumpets here, and some angus beef there. They’re just pissed-off that they have so meag

### Subreddit-based

In [None]:
# export
def query_subr(subreddit, year):
    """Start a Pushshift comment search for a subreddit within one calendar year.

    Parameters
    ----------
    subreddit : str
        Subreddit name, passed to Pushshift as the ``subreddit`` parameter.
    year : int or str
        Calendar year to search; coerced with ``int()``.

    Returns
    -------
    Whatever ``PushshiftAPI.search_comments`` returns; the notebook consumes
    it lazily with ``next()`` / iteration.
    """
    year = int(year)
    # Bounds are expressed as UTC epoch seconds so they line up with the
    # comments' ``created_utc`` field and do not depend on the timezone of the
    # machine running the query.
    after = int(dt.datetime(year, 1, 1, tzinfo=dt.timezone.utc).timestamp())
    # Upper bound is the first instant of the *next* year; using Dec 31 00:00
    # would silently drop every comment posted on the last day of the year.
    before = int(dt.datetime(year + 1, 1, 1, tzinfo=dt.timezone.utc).timestamp())
    api = PushshiftAPI()
    return api.search_comments(subreddit=subreddit, after=after, before=before)

In [None]:
query_gen = query_subr('politics', '2007')
# Peek at the first comment of *this* query. Reading from `query_gen` (not the
# `gen` left over from the lexeme cell) keeps the cell independent of earlier
# kernel state. Note that `next()` consumes one item from `query_gen`.
next(query_gen)

comment(author='ooouuuurrrriiiii', author_created_utc=1164155492, author_flair_css_class=None, author_flair_text=None, author_fullname='t2_rxtz', body="It's also in Britain, so it's an Anglo-Saxon thing it seems.", controversiality=0, created_utc=1164383206, distinguished=None, gilded=0, id='csacq', link_id='t3_s818', nest_level=5, parent_id='t1_cs9id', reply_delay=20955, retrieved_on=1473806750, score=3, stickied=False, subreddit='reddit.com', subreddit_id='t5_6', created=1164376006.0, d_={'author': 'ooouuuurrrriiiii', 'author_created_utc': 1164155492, 'author_flair_css_class': None, 'author_flair_text': None, 'author_fullname': 't2_rxtz', 'body': "It's also in Britain, so it's an Anglo-Saxon thing it seems.", 'controversiality': 0, 'created_utc': 1164383206, 'distinguished': None, 'gilded': 0, 'id': 'csacq', 'link_id': 't3_s818', 'nest_level': 5, 'parent_id': 't1_cs9id', 'reply_delay': 20955, 'retrieved_on': 1473806750, 'score': 3, 'stickied': False, 'subreddit': 'reddit.com', 'subre

## Collect results

In [None]:
# export
def get_results(gen, limit):
    """Collect at most ``limit`` items from ``gen`` into a list, with a progress bar.

    Parameters
    ----------
    gen : iterable
        Source of results (e.g. the object returned by ``query_subr``).
        Exactly the returned items are consumed from it, no more.
    limit : int or None
        Maximum number of items to collect. ``limit <= 0`` yields an empty
        list without touching ``gen``; ``None`` collects everything.

    Returns
    -------
    list
        The collected items, in iteration order. Shorter than ``limit`` when
        ``gen`` is exhausted first.
    """
    if limit is not None and limit <= 0:
        # A manual "append, then check the length" loop would still return one item here.
        return []
    # Capping with islice (instead of breaking out of the tqdm loop) lets the
    # progress bar see the final item, so it finishes at limit/limit rather
    # than stopping one short.
    return list(tqdm(islice(gen, limit), total=limit))

In [None]:
# Pull the first 100 comments from the subreddit query started above.
results = get_results(gen=query_gen, limit=100)

 99%|█████████▉| 99/100 [00:02<00:00, 48.33it/s]


In [None]:
# The query is expected to yield at least as many comments as the requested limit.
assert len(results) == 100

## Convert to DataFrame

In [None]:
# export
def conv_results_to_df(results):
    """Build a DataFrame with one row per result.

    Each element of ``results`` must expose a ``d_`` attribute; those values
    are collected and handed to ``pd.DataFrame`` in one go.
    """
    records = []
    for item in results:
        records.append(item.d_)
    return pd.DataFrame(records)

In [None]:
# One row per collected comment.
comments = conv_results_to_df(results=results)

In [None]:
# Every collected result must become exactly one row. The column count is
# deliberately not pinned: it depends on which fields the API returns for
# the sampled comments.
assert len(comments) == len(results)
# The comment text is the column the rest of the notebook relies on.
assert 'body' in comments.columns

In [None]:
# Eyeball the comment texts (pandas truncates the display of long Series).
comments['body']

0                                               SUP BRO
1     You should have given the blog dude a piece of...
2     I DONT THINK THERE IS A STONG CLAN PRESENCE IN...
3     The problem is that the absurd is not preventi...
4                                             [deleted]
                            ...                        
95    The same person who published this story is ke...
96    I don't think that's really true, personally. ...
97    You also wouldn't be concerned if they were dr...
98    There are other social news websites besides r...
99    &gt; I wouldn't be surprised if Ron Paul takes...
Name: body, Length: 100, dtype: object

## Write out comments

In [None]:
# export
def comm_subr_to_csv(comments, subreddit='NaN', limit='NaN', year='NaN'):
    """Write a comments DataFrame to ``data/subreddit/{subreddit}/{limit}_{year}.csv``.

    Parameters
    ----------
    comments : pandas.DataFrame
        Frame to serialise; written without its index.
    subreddit, limit, year
        Only used to build the output path (directory and file name).
        Each defaults to the literal string ``'NaN'``.

    The output directory is created if it does not exist; an existing file
    with the same name is overwritten.
    """
    dir_out = f'data/subreddit/{subreddit}'
    # exist_ok=True replaces the check-then-create sequence, which could fail
    # if the directory appeared between the check and the makedirs call.
    os.makedirs(dir_out, exist_ok=True)
    comments.to_csv(
        f'{dir_out}/{limit}_{year}.csv',
        index=False
    )

In [None]:
# Persist the sample to data/subreddit/politics/100_2007.csv.
comm_subr_to_csv(comments, subreddit='politics', limit=100, year='2007')

### Disk usage

| n_comments | disk_usage_mb |
|-----------:|--------------:|
| 10000      | 3.5           |

## Pipeline function

In [None]:
# export
def get_subr_year(subreddit, year, limit):
    """Run the full collection pipeline for one subreddit and year.

    Queries the subreddit's comments for ``year``, keeps up to ``limit`` of
    them, converts them to a DataFrame and writes that frame out through
    ``comm_subr_to_csv``. Returns ``None``.
    """
    fetched = get_results(query_subr(subreddit, year), limit)
    frame = conv_results_to_df(fetched)
    comm_subr_to_csv(frame, subreddit, limit, year)

## Export notebooks

In [None]:
# hide
# nbdev: convert the project's notebooks into Python modules (see the
# "Converted ..." lines in the output). Imported here rather than in the
# imports cell, which is marked `# export`, whereas this cell is `# hide`.
from nbdev.export import notebook2script
notebook2script()

Converted 00_processing.ipynb.
Converted 01_installation.ipynb.
Converted 02_collect-data.ipynb.
Converted 03_read_data.ipynb.
Converted 04_clean_data.ipynb.
Converted 05_usage_freq.ipynb.
Converted 06_token_emb.ipynb.
Converted 07_type_emb.ipynb.
Converted index.ipynb.
