In [None]:
# default_exp processing
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Processing

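> This notebook contains the processing pipeline: collecting comments, reading them from disk, training type embeddings, cleaning the data, and computing usage frequencies.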

## Imports

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
from socemb.collect_data import *
from socemb.read_data import *
from socemb.type_emb import *
from socemb.clean_data import *
from socemb.usage_freq import *

In [None]:
import pandas as pd
import altair as alt
import random
import os

## Variables

In [None]:
# Run configuration shared by the cells below.
# NOTE(review): METHOD is not referenced by any cell shown in this notebook — confirm it is still needed.
METHOD = 'subreddit'
SUBREDDIT = 'askreddit'          # subreddit argument passed to get_fpath_subr_yr in "Read data"
YEARS = list(range(2006, 2021))  # 2006 through 2020 inclusive
LIMIT = 100_000                  # sample-size argument passed to get_fpath_subr_yr

In [None]:
# Year of the sample loaded in "Read data". Kept as an int so it has the same type as
# the later `YEAR = 2020` assignment and the `range(2006, 2021)` values that are passed
# to the same path helper in the training loop.
YEAR = 2012

In [None]:
# Root data folder, and the sub-folder that the `wv.save(...)` calls write vectors into.
CORPUS_DIR = 'data/'
VECS_DIR = f'{CORPUS_DIR}vecs/'  # evaluates to 'data/vecs/'

## Collect data

In [None]:
# Build the query for the 'politics' subreddit; it is passed to get_results in the next cell.
# NOTE(review): 'politics' is hard-coded here instead of using SUBREDDIT — confirm this is intended.
query = query_subr('politics')

In [None]:
%%time
# Run the query with 1_000_000 as the second argument (presumably the maximum number of
# results — confirm against get_results).
# NOTE(review): the stored output ends in KeyboardInterrupt after about 8 hours, so
# `results` may never have been assigned by this cell in the saved run.
results = get_results(query, 1_000_000)

 24%|██▍       | 244700/1000000 [8:15:19<25:28:53,  8.23it/s]


KeyboardInterrupt: 

In [None]:
# Convert the raw query results into `comments` (presumably a pandas DataFrame, going by
# the helper's name — confirm).
comments = conv_results_to_df(results)

In [None]:
# Write the collected comments to CSV for the 'politics' subreddit.
# NOTE(review): 100 and '2007' look like the limit and year used in the file name
# (compare paths such as data/subreddit/politics/100_2020.csv in the output further down),
# but the query above asked for 1_000_000 results — confirm these labels match the data.
comm_subr_to_csv(comments, 'politics', 100, '2007')

## Read data

In [None]:
# Resolve the CSV path for the configured subreddit, sample size and year.
fpath = get_fpath_subr_yr(SUBREDDIT, LIMIT, YEAR)

In [None]:
# Load that CSV; this rebinds `comments`, which the "Collect data" section also assigns.
comments = read_comm_csv(fpath)

In [None]:
# Summarise the loaded comments: columns, dtypes, non-null counts and memory usage.
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99999 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   body         99999 non-null  string        
 1   created_utc  99999 non-null  datetime64[ns]
 2   id           99999 non-null  string        
 3   subreddit    99999 non-null  string        
dtypes: datetime64[ns](1), string(3)
memory usage: 3.8 MB


## Train type embeddings

### per subreddit

In [None]:
# Parameters for the per-subreddit training loop in the next cell.
# NOTE(review): `subreddit` is rebound by that loop's own `for subreddit in ...`, so the
# value 'politics' is never used by it; `limit` and `years` carry the same values as the
# LIMIT and YEARS constants from the "Variables" section.
subreddit = 'politics'
limit = 100_000
years = range(2006, 2021)

In [None]:
# Train one embedding model per (subreddit, year) sample and store its word vectors.
# Create the output folder up front so a missing directory is not mistaken for a
# missing or empty sample.
os.makedirs(VECS_DIR, exist_ok=True)

for subreddit in ['askaconservative', 'asklibertarians']:
    for year in years:
        try:
            # `limit` is used for both the input path and the output name so the two
            # cannot drift apart.
            fpath = get_fpath_subr_yr(subreddit, limit, year)
            comments = read_comm_csv(fpath)
            docs_clean = clean_docs(comments['body'])
            corpus = Corpus(docs_clean)
            model = train_emb(corpus)
            wv = model.wv
            wv.save(f'{VECS_DIR}{subreddit}_{year}_{limit}.wv')
            print(f'{year}:\t{len(docs_clean)} docs')
        except Exception as err:
            # Best-effort: skip samples that cannot be processed, but report why.
            # Catching Exception (not a bare `except:`) lets KeyboardInterrupt and
            # SystemExit stop the run.
            print(f'{subreddit} {year} skipped: {type(err).__name__}: {err}')

data/subreddit/askaconservative/100000_2006.csv not found on disk
data/subreddit/askaconservative/100000_2007.csv is empty
data/subreddit/askaconservative/100000_2008.csv is empty
data/subreddit/askaconservative/100000_2009.csv is empty
data/subreddit/askaconservative/100000_2010.csv is empty
data/subreddit/askaconservative/100000_2011.csv is empty
2012:	1881 docs
2013:	4715 docs
2014:	13066 docs
2015:	11928 docs
2016:	11080 docs
2017:	27587 docs
2018:	37826 docs
2019:	70474 docs
2020:	67636 docs
data/subreddit/asklibertarians/100000_2006.csv not found on disk
data/subreddit/asklibertarians/100000_2007.csv is empty
data/subreddit/asklibertarians/100000_2008.csv is empty
data/subreddit/asklibertarians/100000_2009.csv is empty
data/subreddit/asklibertarians/100000_2010.csv is empty
data/subreddit/asklibertarians/100000_2011.csv is empty
data/subreddit/asklibertarians/100000_2012.csv is empty
2013:	482 docs
2014:	1257 docs
2015:	1851 docs
2016:	2741 docs
2017:	8392 docs
2018:	24504 docs
2

### per year

In [None]:
# Settings for the per-year run. YEAR is rebound here, replacing the value set in the
# "Variables" section; LIMIT is re-assigned to the value it already has.
YEAR = 2020
LIMIT = 100_000

In [None]:
# Collect this year's sample files, keeping only those drawn with the configured LIMIT.
# get_fpaths_yr(YEAR) on its own also returns the smaller politics samples
# (100_, 1000_, 10000_), which would be read on top of the 100000_ sample.
fpaths = [fp for fp in get_fpaths_yr(YEAR) if fp.name == f'{LIMIT}_{YEAR}.csv']

In [None]:
# Show which sample files will be read for this year.
fpaths

[Path('data/subreddit/asklibertarians/100000_2020.csv'),
 Path('data/subreddit/askaconservative/100000_2020.csv'),
 Path('data/subreddit/politics/100_2020.csv'),
 Path('data/subreddit/politics/1000_2020.csv'),
 Path('data/subreddit/politics/10000_2020.csv'),
 Path('data/subreddit/politics/100000_2020.csv'),
 Path('data/subreddit/askreddit/100000_2020.csv')]

In [None]:
# Read all listed sample files into one `comments` object (rebinds `comments`).
comments = read_comm_csvs(fpaths)

In [None]:
# Clean the comment texts in the 'body' column; the result displayed in the next cell
# is a Series of token lists.
docs_clean = clean_docs(comments['body'])

In [None]:
# Display the cleaned documents (the stored output shows a truncated repr).
docs_clean

0         [you, re, asking, how, they, re, going, to, be...
1         [gt, i, don, t, think, there, are, any, varyin...
2         [its, split, on, copyright, it, leans, anti, n...
3         [that, would, be, up, to, the, land, owners, a...
4         [i, have, him, here, gun, to, his, head, round...
                                ...                        
380954    [the, nature, police, are, supposed, to, be, p...
380957    [i, got, reddit, last, year, and, only, starte...
380958    [i, m, just, curious, how, this, is, clear, to...
380962    [staying, at, said, job, while, being, underpa...
380963    [i, m, not, sure, that, this, has, ever, been,...
Name: body, Length: 264796, dtype: object

In [None]:
%%time
corpus = Corpus(docs_clean)
model = train_emb(corpus)
wv = model.wv
wv.save(f'{VECS_DIR}year/{YEAR}.wv')

CPU times: user 2min 33s, sys: 841 ms, total: 2min 34s
Wall time: 41.6 s


## Clean data

### Remove bot subreddits

In [None]:
# Drop comments from bot subreddits (per the section heading); rebinds `df` to the result.
# NOTE(review): `df` is not assigned in any cell shown in this notebook, so this cell
# relies on prior kernel state (or on a name exported by the star imports) — confirm the
# intended input, possibly `comments`.
df = rm_bots_subreddits(df)

### Remove duplicate comments

In [None]:
# Remove duplicate comments; rebinds `df` to the helper's result.
df = rm_dupl_comments(df)

### Remove comments without target tokens

In [None]:
# Keep only comments that contain target tokens (per the section heading).
# NOTE(review): `LEX` is not defined in any cell shown in this notebook — presumably the
# target token(s); confirm where it comes from.
df = rm_comm_no_toks(df, LEX)

## Usage frequency

### Aggregate usage frequency in monthly bins

In [None]:
# Aggregate usage frequency into monthly bins (per the section heading) and display the result.
df_m = get_monthly_freq(df)
df_m

### Plot usage frequency

In [None]:
# Plot usage frequency from the monthly table.
# NOTE(review): `LEX` is not defined in any cell shown in this notebook — confirm its source.
plot_use_freq(df_m, LEX)

## Export notebooks

In [None]:
# hide
# Run nbdev's notebook-to-script export; this notebook names its target module with the
# `# default_exp processing` flag in its first cell.
from nbdev.export import notebook2script
notebook2script()