In [None]:
# default_exp processing
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Processing

> This notebook contains the processing pipeline.

## Imports

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
from socemb.collect_data import *
from socemb.read_data import *
from socemb.type_emb import *
from socemb.clean_data import *
from socemb.usage_freq import *

In [None]:
import random
import os

import pandas as pd
import altair as alt

from scipy.spatial.distance import cosine

## Variables

In [None]:
CORPUS_DIR = 'data/'
VECS_DIR = 'data/vecs/'

In [None]:
METHOD = 'subreddit'
LIMIT = 100_000

In [None]:
YEARS = range(2006, 2021)
SUBREDDITS = ['askaconservative', 'asklibertarians', 'askreddit', 'conservative', 'politics']

## Collect data

In [None]:
query = query_subr(SUBREDDIT)

In [None]:
%%time
results = get_results(query, LIMIT)

100%|█████████▉| 499999/500000 [3:27:09<00:00, 40.23it/s]   

CPU times: user 7min 29s, sys: 31 s, total: 8min
Wall time: 3h 27min 9s





In [None]:
comments = conv_results_to_df(results)

In [None]:
comm_subr_to_csv(comments, SUBREDDIT, LIMIT, YEAR)

## Train type embeddings

### per subreddit

In [None]:
for subreddit in SUBREDDITS[1:]:
    for year in years:
        try:
            fpath = get_fpath_subr_yr(subreddit, year, LIMIT)
            comments = read_comm_csv(fpath)
            docs_clean = clean_docs(comments['body'])
            corpus = Corpus(docs_clean)
            model = train_emb(corpus)
            model.wv.save(f'{VECS_DIR}{subreddit}_{year}_{limit}.kv')
            print(f'{subreddit} / {year}:\t{len(docs_clean)} docs')
        except:
            continue

data/subreddit/asklibertarians/100000_2006.csv not found on disk
data/subreddit/asklibertarians/100000_2007.csv is empty
data/subreddit/asklibertarians/100000_2008.csv is empty
data/subreddit/asklibertarians/100000_2009.csv is empty
data/subreddit/asklibertarians/100000_2010.csv is empty
data/subreddit/asklibertarians/100000_2011.csv is empty
data/subreddit/asklibertarians/100000_2012.csv is empty
asklibertarians / 2013:	482 docs
asklibertarians / 2014:	1257 docs
asklibertarians / 2015:	1851 docs
asklibertarians / 2016:	2741 docs
asklibertarians / 2017:	8392 docs
asklibertarians / 2018:	24504 docs
asklibertarians / 2019:	34298 docs
asklibertarians / 2020:	58181 docs
data/subreddit/askreddit/100000_2006.csv not found on disk
data/subreddit/askreddit/100000_2007.csv is empty
askreddit / 2008:	60967 docs
askreddit / 2009:	63777 docs
askreddit / 2010:	64626 docs
askreddit / 2011:	60026 docs
askreddit / 2012:	58865 docs
askreddit / 2013:	58129 docs
askreddit / 2014:	57484 docs
askreddit / 2

### per year

In [None]:
YEAR = 2020
LIMIT = 100_000

In [None]:
fpaths = get_fpaths_yr(YEAR)

In [None]:
fpaths

[Path('data/subreddit/asklibertarians/100000_2020.csv'),
 Path('data/subreddit/askaconservative/100000_2020.csv'),
 Path('data/subreddit/politics/100_2020.csv'),
 Path('data/subreddit/politics/1000_2020.csv'),
 Path('data/subreddit/politics/10000_2020.csv'),
 Path('data/subreddit/politics/100000_2020.csv'),
 Path('data/subreddit/askreddit/100000_2020.csv')]

In [None]:
comments = read_comm_csvs(fpaths)

In [None]:
docs_clean = clean_docs(comments['body'])

In [None]:
docs_clean

0         [you, re, asking, how, they, re, going, to, be...
1         [gt, i, don, t, think, there, are, any, varyin...
2         [its, split, on, copyright, it, leans, anti, n...
3         [that, would, be, up, to, the, land, owners, a...
4         [i, have, him, here, gun, to, his, head, round...
                                ...                        
380954    [the, nature, police, are, supposed, to, be, p...
380957    [i, got, reddit, last, year, and, only, starte...
380958    [i, m, just, curious, how, this, is, clear, to...
380962    [staying, at, said, job, while, being, underpa...
380963    [i, m, not, sure, that, this, has, ever, been,...
Name: body, Length: 264796, dtype: object

In [None]:
%%time
corpus = Corpus(docs_clean)
model = train_emb(corpus)
wv = model.wv
wv.save(f'{VECS_DIR}year/{YEAR}.wv')

CPU times: user 2min 33s, sys: 841 ms, total: 2min 34s
Wall time: 41.6 s


## Measure semantic distances

In [None]:
LEX = 'the'

### retrieve vectors

In [None]:
vecs = []
for subr in SUBREDDITS:
    for year in YEARS:
        model_path = f'{VECS_DIR}{subr}_{year}_{LIMIT}.kv'
        if os.path.exists(model_path):
            model = load_model(subr, year)
            vec = {}
            vec['subr'] = subr
            vec['year'] = year
            vec['vec'] = get_vec_from_model(LEX, model)
            vecs.append(vec)
        else:
            continue

In [None]:
df = pd.DataFrame(vecs)

In [None]:
df

Unnamed: 0,subr,year,vec
0,askaconservative,2012,"[0.33568993, 0.28289905, 0.18170774, 0.3234423..."
1,askaconservative,2013,"[0.1681983, 0.34340978, 0.48565668, 0.5050152,..."
2,askaconservative,2014,"[-0.11564513, -0.1509151, 0.06975199, -0.20559..."
3,askaconservative,2015,"[-0.30687356, 0.50545174, -0.12360158, -0.0459..."
4,askaconservative,2016,"[0.8604058, 0.7368072, 0.19080263, 1.4651458, ..."
5,askaconservative,2017,"[0.40982404, 0.8223834, 0.41120344, 0.31097403..."
6,askaconservative,2018,"[-0.09827916, 0.096937105, -0.5799866, -0.1271..."
7,askaconservative,2019,"[-0.018214036, 0.27892032, -0.64972323, -0.167..."
8,askaconservative,2020,"[-0.22599941, 0.49779037, -0.2889724, -0.36246..."
9,asklibertarians,2013,"[0.2851873, 0.39539212, 0.14333488, 0.32296813..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   subr    44 non-null     object
 1   year    44 non-null     int64 
 2   vec     44 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.2+ KB


### diachronic distances

In [None]:
df_grouped = df.groupby('year')

In [None]:
dists = {}
for name, group in df_grouped:
    dists[name] = cosine(group.query('subr == "politics"').iloc[0]['vec'], group.query('subr == "askaconservative"').iloc[0]['vec'])

In [None]:
dists_df = pd.DataFrame(
    data={
        'dist': dists.values(),
        'year': dists.keys()
    }
)

In [None]:
dists_df['year'] = dists_df['year'].astype(str)

In [None]:
dists_df

Unnamed: 0,dist,year
0,0.568387,2013
1,0.409027,2014
2,0.75708,2015
3,0.70754,2016
4,0.633533,2017
5,0.509409,2018
6,0.544035,2019
7,0.602764,2020


In [None]:
alt.Chart(dists_df)\
    .mark_line()\
    .encode(
        x='year:O',
        y=alt.Y('dist', scale=alt.Scale(zero=False))
    )

#### using `shift`

In [None]:
df['vec_lag'] = df.groupby(['subr'])['vec'].shift(1)

In [None]:
df = df.dropna()

In [None]:
df

Unnamed: 0,subr,year,vec,vec_lag
1,askaconservative,2014,"[0.05898989, -0.020806514, -0.09522354, -0.025...","[0.10089946, -0.15217064, -0.093968906, -0.009..."
2,askaconservative,2015,"[-0.33191583, -0.13233104, 0.2913851, 0.344635...","[0.05898989, -0.020806514, -0.09522354, -0.025..."
3,askaconservative,2016,"[-0.25068983, -0.3269286, 0.38366556, -0.05369...","[-0.33191583, -0.13233104, 0.2913851, 0.344635..."
4,askaconservative,2017,"[-1.3258034, 0.091056794, 0.3448868, -0.657256...","[-0.25068983, -0.3269286, 0.38366556, -0.05369..."
5,askaconservative,2018,"[-0.5096724, 0.036298662, -0.27478805, -0.1627...","[-1.3258034, 0.091056794, 0.3448868, -0.657256..."
6,askaconservative,2019,"[-0.002224556, 0.09256109, 0.14517716, -0.4745...","[-0.5096724, 0.036298662, -0.27478805, -0.1627..."
7,askaconservative,2020,"[-0.85950464, -0.15552747, 0.68602735, -0.8375...","[-0.002224556, 0.09256109, 0.14517716, -0.4745..."
9,politics,2014,"[0.04908611, 0.049922604, 0.073071755, -0.0735...","[0.1833485, 0.107011795, 0.04851157, -0.051158..."
10,politics,2015,"[-0.47832316, 0.35027474, -0.57572514, 0.82645...","[0.04908611, 0.049922604, 0.073071755, -0.0735..."
11,politics,2016,"[-0.62413377, -0.297199, -1.072379, 0.5791618,...","[-0.47832316, 0.35027474, -0.57572514, 0.82645..."


In [None]:
df.apply(lambda x: cosine(x["vec"], x["vec_lag"]), axis=1)

1     0.373125
2     0.840510
3     0.330363
4     0.365177
5     0.391861
6     0.358622
7     0.416066
9     0.480897
10    1.029389
11    0.484906
12    0.525790
13    0.385125
14    0.391713
15    0.538054
dtype: float64

## Dimensionality reduction

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np

In [None]:
Y_tsne = TSNE(
    perplexity=20,
    method='exact',
    init='pca',
    verbose=True
    )\
    .fit_transform(list(df['vec']))

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 16 / 16
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.384134
[t-SNE] KL divergence after 1000 iterations: 0.355095


In [None]:
df['x_tsne'] = Y_tsne[:, [0]]
df['y_tsne'] = Y_tsne[:, [1]]

In [None]:
pca = PCA(n_components=2).fit(list(df['vec']))
Y_pca = pca.transform(list(df['vec']))
df['x_pca'] = Y_pca[:, [0]]
df['y_pca'] = Y_pca[:, [1]]
print(f"explained var (sum): {np.sum(pca.explained_variance_ratio_)}", '\n')

explained var (sum): 0.4043634744038942 



In [None]:
df

Unnamed: 0,subr,year,vec,x_tsne,y_tsne,x_pca,y_pca
0,askaconservative,2013,"[0.10089946, -0.15217064, -0.093968906, -0.009...",42.720264,-52.183632,5.351763,-1.840265
1,askaconservative,2014,"[0.05898989, -0.020806514, -0.09522354, -0.025...",-28.857372,-62.780521,5.337487,-1.886082
2,askaconservative,2015,"[-0.33191583, -0.13233104, 0.2913851, 0.344635...",90.509644,-100.966339,3.995889,0.81494
3,askaconservative,2016,"[-0.25068983, -0.3269286, 0.38366556, -0.05369...",111.582184,-27.120678,2.90721,2.517781
4,askaconservative,2017,"[-1.3258034, 0.091056794, 0.3448868, -0.657256...",52.191757,92.402519,1.251124,4.340449
5,askaconservative,2018,"[-0.5096724, 0.036298662, -0.27478805, -0.1627...",16.258982,-120.523094,-0.835,4.984162
6,askaconservative,2019,"[-0.002224556, 0.09256109, 0.14517716, -0.4745...",-128.386917,13.372989,-3.486289,4.359694
7,askaconservative,2020,"[-0.85950464, -0.15552747, 0.68602735, -0.8375...",-9.373852,52.650635,-3.3244,3.816815
8,politics,2013,"[0.1833485, 0.107011795, 0.04851157, -0.051158...",-61.047222,2.011389,5.58056,-1.700699
9,politics,2014,"[0.04908611, 0.049922604, 0.073071755, -0.0735...",-19.384014,120.202591,5.340346,-1.680099


## Plot 

In [None]:
import altair as alt

In [None]:
alt.Chart(df)\
    .mark_point()\
    .encode(
        x = 'x_pca',
        y = 'y_pca',
        color = 'subr',
        tooltip = 'year'
    ).interactive()


## Clean data

### Remove bot subreddits

In [None]:
df = rm_bots_subreddits(df)

### Remove duplicate comments

In [None]:
df = rm_dupl_comments(df)

### Remove comments without target tokens

In [None]:
df = rm_comm_no_toks(df, LEX)

## Usage frequency

### Aggregate usage frequency in monthly bins

In [None]:
df_m = get_monthly_freq(df)
df_m

### Plot usage frequency

In [None]:
plot_use_freq(df_m, LEX)

## Export notebooks

In [None]:
# hide
from nbdev.export import notebook2script
notebook2script()