In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project package imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *
from neocov.type_emb import *
from neocov.communities import *

In [None]:
from gensim.models import Word2Vec
import pandas as pd
from pathlib import Path

In [None]:
# Project-relative locations. The trailing slashes matter: the paths used
# further down are built by plain string interpolation, not by path joining.
DATA_DIR = '../data/'
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'  # input for get_comments_paths_year
OUT_DIR = '../out/'  # pickles, models, CSV exports and the plot are written below this

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Semantic change

In [None]:
# Year processed by the read / clean / train cells of this section; it is also
# interpolated into the pickle and model file names.
YEAR = '2020'

### Read data

#### Get file paths

In [None]:
# Look up the comment files for YEAR below COMMENTS_DIAC_DIR (helper comes from
# the project star-imports; presumably returns a list of file paths — confirm).
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

#### Read comments

In [None]:
%%time
# Read all comment files found above into `comments`; %%time reports how long
# the load takes. (The magic has to stay on the first line of the cell.)
comments = read_comm_csvs(comment_paths_year)

In [None]:
# Bare last expression: display the loaded comments for a quick visual check.
comments

### Pre-process comments

In [None]:
%%time
# Clean the raw comments and bind the result to a new name. Whether `comments`
# itself is modified depends on clean_comments, which is not visible here.
comments_clean = clean_comments(comments)

In [None]:
# Only the comment text ('body' column) is carried forward, as a plain list.
docs = comments_clean['body'].to_list()

In [None]:
import pickle

In [None]:
# Cache the cleaned documents on disk; the next cell reloads them from this file.
docs_pickle_path = f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle'
with open(docs_pickle_path, 'wb') as pickle_file:
    pickle.dump(docs, pickle_file)

In [None]:
# Reload the cached documents for YEAR.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# were produced by this notebook.
docs_pickle_path = f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle'
with open(docs_pickle_path, 'rb') as pickle_file:
    docs = pickle.load(pickle_file)

### Train models

#### Create corpus

In [None]:
# Wrap the document list in the project's Corpus object, which is what
# train_model receives in the next step.
corpus = Corpus(docs)

#### Train model

In [None]:
%%time
# Train with EPOCHS=20. The same number is repeated by hand in the file name
# used when saving the model ("ep-20"), so keep the two in sync.
model = train_model(corpus, EPOCHS=20)

In [None]:
# Number of entries in the trained model's vocabulary.
len(model.wv.key_to_index)

#### Save model

In [None]:
# Persist the model for YEAR; the "ep-20" suffix mirrors the EPOCHS=20 used for
# training and is what the "Load models" cells expect.
model.save(f'{OUT_DIR}models/{YEAR}_ep-20.model')

### Load models

In [None]:
# Load the 2019 model from disk (presumably written by running the training
# cells above with YEAR = '2019' — that run is not part of this notebook state).
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019_ep-20.model')

In [None]:
# Load the 2020 model from disk (the file name matches what the save cell
# above writes for YEAR = '2020').
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020_ep-20.model')

### Align models

In [None]:
# Record the vocabulary sizes BEFORE running the alignment cell: the vocabulary
# table further down reports a smaller, shared size for the aligned models, so
# re-running this cell after alignment would store the wrong numbers.
model_2019_vocab = len(model_2019.wv.key_to_index)
model_2020_vocab = len(model_2020.wv.key_to_index)

In [None]:
# Align the two embedding spaces. The returned model is not bound to a name, so
# the cells below rely on model_2019 / model_2020 having been changed in place
# (the vocabulary table below shows both reduced to a shared vocabulary).
smart_procrustes_align_gensim(model_2019, model_2020)

190756 190756
190756 190756


<gensim.models.word2vec.Word2Vec at 0x110df3160>

In [None]:
# Sanity check after alignment: the 2019 vocabulary and the 2020 vector matrix
# must have the same number of entries.
assert len(model_2019.wv.key_to_index) == len(model_2020.wv.vectors)

In [None]:
# Vocabulary overview: the per-model sizes captured before alignment, plus the
# size of the vocabulary that remains after alignment ("intersection").
models_vocab = pd.DataFrame({
    'Model': ['2019', '2020', 'intersection'],
    'Words': [
        model_2019_vocab,
        model_2020_vocab,
        len(model_2019.wv.key_to_index),
    ],
})

models_vocab

Unnamed: 0,Model,Words
0,2019,252564
1,2020,277707
2,intersection,190756


In [None]:
# Export the vocabulary overview without the row index.
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

### Measure distances

In [None]:
# Compare the two aligned models. The result is used below as a DataFrame with
# the columns lex, dist_sem, freq_1 and freq_2.
distances = measure_distances(model_2019, model_2020)

TODO: filter by true type frequency. The type frequencies reported by `Gensim` seem incorrect; they probably reflect frequency ranks rather than total counts.

In [None]:
# Lexemes marked for exclusion (Excl == True) in the blacklist file; used to
# filter the semantic-change candidates. The path is built from DATA_DIR, like
# the other input paths, instead of repeating the literal '../data/' prefix.
blacklist_lex = (
    pd.read_csv(f'{DATA_DIR}blacklist_lex.csv')
    .query('Excl == True')
    .loc[:, 'Lex']
)

In [None]:
# k: number of candidates to keep; freq_min: frequency a lexeme must exceed in
# BOTH models to be considered.
k = 20
freq_min = 100

# Keep lexemes that are frequent enough, purely alphabetic, longer than three
# characters and not blacklisted; then take the k largest semantic distances.
sem_change_cands = (
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .query('lex.str.isalpha() == True')
    .query('lex.str.len() > 3')
    .query('lex not in @blacklist_lex')
    .nlargest(k, 'dist_sem')
    .reset_index(drop=True)
)

sem_change_cands

Unnamed: 0,lex,dist_sem,freq_1,freq_2
0,lockdowns,1.016951,940,991
1,maskless,0.996101,118,127
2,sunsetting,0.996084,111,120
3,childe,0.980564,209,222
4,megalodon,0.975273,752,792
5,newf,0.962381,107,115
6,corona,0.926739,3553,3684
7,filtrate,0.918609,102,110
8,chaz,0.899856,190,202
9,klee,0.888728,161,173


In [None]:
# Shape the candidates for export: add a 1-based rank column, render distances
# as two-decimal strings, and rename columns (the rank column gets an empty
# header).
sem_change_cands_out = (
    sem_change_cands
    .nlargest(100, 'dist_sem')
    .assign(
        index_1=lambda df: df.index + 1,
        dist_sem=lambda df: df['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

In [None]:
# Write only the rank, lexeme and distance columns; the frequency columns are
# left out of the export.
sem_change_cands_out.to_csv(
    f'{OUT_DIR}sem_change_cands.csv',
    columns=['', 'Lexeme', 'SemDist'],
    index=False,
)

### Inspect nearest neighbours of lexemes

In [None]:
# Lexeme whose nearest neighbours are compared between the 2019 and 2020 models.
LEX_NBS = 'ahahahah'

In [None]:
# Ten nearest neighbours of LEX_NBS in each aligned model. freq_min=1 is the
# lowest frequency threshold used anywhere in this notebook.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=1,
    model_1=model_2019,
    model_2=model_2020,
    k=10,
)

# Show both neighbour tables, 2019 first.
display(nbs_model_1, nbs_model_2)

Unnamed: 0,model,lex,similarity,freq
0,1,hahahha,0.455687,76
1,1,ahaha,0.45532,668
2,1,hahaha,0.441289,6690
3,1,ahha,0.436381,43
4,1,myyyy,0.434361,14
5,1,funni,0.433964,49
6,1,hahahah,0.432263,496
7,1,yeaaaa,0.429689,33
8,1,yess,0.429398,320
9,1,woooow,0.42713,46


Unnamed: 0,model,lex,similarity,freq
100000,2,wiki_rule_2,0.389549,127
100001,2,jk,0.381757,2248
100002,2,dqw4w9wgxcq,0.348253,763
100003,2,wiki_rule_b,0.346046,12
100004,2,20enabled,0.32649,476
100005,2,20questions,0.314265,513
100006,2,flowerboy,0.309573,11
100007,2,_love_,0.307687,21
100008,2,subed,0.306114,21
100009,2,420th,0.303107,22


Not related to Covid (notation: earlier meaning > meaning in 2020):

- sunsetting: > gaming-related meaning in 2020
- childe: > gaming-related proper name in 2020
- megalodon: > gaming-related proper name in 2020
- newf: (derogatory) slang term for people from Newfoundland (Canada)
- chaz: > Capitol Hill Autonomous Zone (CHAZ)
- klee: > computer game character, proper name
- rittenhouse: whiskey brand > proper name, involved in shooting related to BLM protests

Related to Covid:

- cerb: > Canada Emergency Response Benefit for Covid
- vacuo: > medical term, 'vacuum'
- moderna: > vaccine

## Social semantic variation

### Inspect subreddits

#### Read comments

In [None]:
# Directory holding the per-lexeme comment files. Built from DATA_DIR, like the
# other input paths, instead of repeating the literal '../data/' prefix.
comments_dir_path = Path(f'{DATA_DIR}comments/lexeme/')

In [None]:
# All CSV files whose names start with "Covid". The pattern is a plain string
# (it has no placeholders, so no f-prefix), and the result is sorted because
# glob yields paths in a filesystem-dependent order — sorting makes the read
# order, and therefore the row order of `comments`, reproducible.
comments_paths = sorted(comments_dir_path.glob('Covid*.csv'))

In [None]:
%%time
# NOTE: this rebinds `comments`, which held the by-date comments in the
# "Semantic change" section, to the per-lexeme comments read here.
comments = read_comm_csvs(comments_paths)
comments

TODO: filter comments

- [ ] remove duplicates
- [ ] remove bots

#### Get subreddit counts

In [None]:
# Per-subreddit counts derived from the comments read above (helper comes from
# the project star-imports).
subr_counts = get_subr_counts(comments)

In [None]:
# Build the plot (k=20 presumably limits it to the 20 largest subreddits —
# confirm in plot_subr_counts) and display it as the cell output.
subr_counts_plt = plot_subr_counts(subr_counts, k=20)
subr_counts_plt

In [None]:
# Export the chart as PNG; scale_factor=2.0 presumably doubles the rendered
# resolution — confirm against the plotting library used by plot_subr_counts.
subr_counts_plt.save(f'{OUT_DIR}subr_counts.png', scale_factor=2.0)

### Train models

In [None]:
# Directory holding the per-subreddit comment files. Built from DATA_DIR, like
# COMMENTS_DIAC_DIR, instead of repeating the literal '../data/' prefix.
COMMENTS_DIR_SUBR = f'{DATA_DIR}comments/subr/'

In [None]:
# Subreddit processed by the read / clean / train cells below. The "Load
# models" cells expect one saved model per entry of SUBRS, so this part has to
# be run once for each of those subreddits.
SUBR = 'conspiracy'

In [None]:
# Look up the comment files for SUBR below COMMENTS_DIR_SUBR (presumably a list
# of file paths — confirm in the project helper).
fpaths = get_comments_paths_subr(COMMENTS_DIR_SUBR, SUBR)

In [None]:
%%time
# NOTE: rebinds `comments` again, now to the comments of the selected subreddit.
comments = read_comm_csvs(fpaths)

In [None]:
%%time
# NOTE: rebinds `comments_clean` (previously the cleaned by-date comments) to
# the cleaned subreddit comments.
comments_clean = clean_comments(comments)

In [None]:
# Text column ('body') of the cleaned subreddit comments; the next cell turns
# the documents into a plain list.
docs = comments_clean['body']

In [None]:
# Build the list straight from `comments_clean` rather than from `docs` itself,
# so the cell can be re-run safely: once `docs` is a list, calling .to_list()
# on it again would raise AttributeError.
docs = comments_clean['body'].to_list()

In [None]:
import pickle

In [None]:
# Cache the cleaned subreddit documents on disk under a per-subreddit file name.
docs_pickle_path = f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle'
with open(docs_pickle_path, 'wb') as pickle_file:
    pickle.dump(docs, pickle_file)

In [None]:
# Reload the cached documents for SUBR. The path must be an f-string: without
# the prefix the literal text '{OUT_DIR}docs_clean/subr_{SUBR}.pickle' is
# opened and the cell fails with FileNotFoundError.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# were produced by this notebook.
with open(f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle', 'rb') as fp:
    docs = pickle.load(fp)

Corpus information

| Subreddit          | Comments  | DateFirst  | DateLast   |
|:-------------------|---------: |:-----------|:-----------|
| LockdownSkepticism |   520,392 | 2020-03-26 | 2020-12-27 |  
| Coronavirus        | 4,121,144 | 2020-01-21 | 2020-12-27 |
| conspiracy         | 3,973,514 | 2020-01-01 | 2020-12-27 |

In [None]:
# Wrap the subreddit documents in the project's Corpus object for training.
corpus = Corpus(docs)

In [None]:
%%time
# No EPOCHS argument here, unlike the by-year training above, so train_model's
# own default applies.
model = train_model(corpus)

In [None]:
# Number of entries in the subreddit model's vocabulary.
len(model.wv.key_to_index)

In [None]:
# Persist the model under the subreddit's name; the "Load models" cells read
# the files back by the names listed in SUBRS.
model.save(f'{OUT_DIR}models/{SUBR}.model')

### Load models

In [None]:
# The two communities to compare; the order decides which one becomes model_1
# and which one model_2 below.
SUBRS = ['Coronavirus', 'conspiracy']

In [None]:
# Model of the first community in SUBRS.
model_1 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[0]}.model')

In [None]:
# Model of the second community in SUBRS.
model_2 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[1]}.model')

### Align models

In [None]:
# Record the vocabulary sizes BEFORE running the alignment cell; the table
# below reports the post-alignment size separately as "intersection".
model_1_vocab = len(model_1.wv.key_to_index)
model_2_vocab = len(model_2.wv.key_to_index)

In [None]:
# Same alignment call as for the by-year models. The return value is not bound
# to a name; the cells below keep using model_1 / model_2 directly.
smart_procrustes_align_gensim(model_1, model_2)

In [None]:
# Sanity check after alignment: model_1's vocabulary and model_2's vector
# matrix must have the same number of entries.
assert len(model_1.wv.key_to_index) == len(model_2.wv.vectors)

In [None]:
# Vocabulary overview for the two communities: sizes captured before
# alignment, plus the size of model_1's vocabulary after alignment.
models_vocab = pd.DataFrame({
    'Model': [SUBRS[0], SUBRS[1], 'intersection'],
    'Words': [
        model_1_vocab,
        model_2_vocab,
        len(model_1.wv.key_to_index),
    ],
})

models_vocab

In [None]:
# Export the community vocabulary overview without the row index.
models_vocab.to_csv(f'{OUT_DIR}models_subrs_vocab.csv', index=False)

### Measure distances

In [None]:
# NOTE: rebinds `distances` (previously 2019 vs. 2020) to the comparison of the
# two community models.
distances = measure_distances(model_1, model_2)

#### Words that differ the most between both communities

In [None]:
# Frequency a lexeme must exceed in both community models to be considered.
freq_min = 100

# The 20 lexemes with the largest semantic distance between the two models.
(
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .sort_values('dist_sem', ascending=False)
    .head(20)
)

#### Nearest neighbours for target lexemes in both communities

In [None]:
# Target lexeme for the neighbour comparisons between the two communities.
LEX = 'vaccine'

In [None]:
# Ten nearest neighbours of LEX in each community model, called with
# freq_min=100.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX,
    freq_min=100,
    model_1=model_1,
    model_2=model_2,
    k=10,
)

# Show both neighbour tables, first community first.
display(
    nbs_model_1,
    nbs_model_2,
)

#### Biggest discrepancies in nearest neighbours for target lexemes

In [None]:
# Same lookup with a very large k (100,000) and freq_min=150, so the neighbour
# tables are long enough to be joined lexeme by lexeme in the next step.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX,
    freq_min=150,
    model_1=model_1,
    model_2=model_2,
    k=100_000,
)

In [None]:
# Join the two neighbour tables on the lexeme (inner join by default), so each
# row carries the columns of both communities, suffixed _1 and _2.
nbs_diffs = nbs_model_1.merge(
    nbs_model_2,
    on='lex',
    suffixes=('_1', '_2'),
)

In [None]:
# Add the absolute similarity gap between the communities, order rows by it
# (largest first), renumber, and drop lexemes shorter than four characters.
nbs_diffs = (
    nbs_diffs
    .assign(sim_diff=lambda df: (df['similarity_1'] - df['similarity_2']).abs())
    .sort_values('sim_diff', ascending=False)
    .reset_index(drop=True)
    .query('lex.str.len() >= 4')
)

In [None]:
# Number of lexemes to show per community.
topn = 10

# Lexemes that are closer to LEX in the first community than in the second,
# ranked by the size of the gap.
subr_1_nbs = (
    nbs_diffs
    .query('similarity_1 > similarity_2')
    .nlargest(topn, 'sim_diff')
)

# The reverse: lexemes that are closer to LEX in the second community.
subr_2_nbs = (
    nbs_diffs
    .query('similarity_2 > similarity_1')
    .nlargest(topn, 'sim_diff')
)

display(subr_1_nbs, subr_2_nbs)