In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *
from neocov.type_emb import *
from neocov.communities import *

In [None]:
from gensim.models import Word2Vec
import pandas as pd
pd.set_option('display.max_rows', 100)
from pathlib import Path

In [None]:
DATA_DIR = '../data/'
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'
OUT_DIR = '../out/'

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Semantic change

In [None]:
YEAR = '2020'

### Read data

#### Get file paths

In [None]:
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

#### Read comments

In [None]:
%%time
comments = read_comm_csvs(comment_paths_year)

In [None]:
comments

### Pre-process comments

In [None]:
%%time
comments_clean = clean_comments(comments)

In [None]:
docs = comments_clean['body'].to_list()

In [None]:
import pickle

In [None]:
with open(f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle', 'wb') as fp:
    pickle.dump(docs, fp)

In [None]:
# Read the cleaned document list back from its pickle file.
# NOTE(review): pickle.load executes arbitrary code; only load files produced by this notebook.
docs_pickle_path = f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle'
with open(docs_pickle_path, 'rb') as pickle_file:
    docs = pickle.load(pickle_file)

### Train models

#### Create corpus

In [None]:
corpus = Corpus(docs)

#### Train model

In [None]:
%%time
model = train_model(corpus, EPOCHS=20)

In [None]:
len(model.wv.key_to_index)

#### Save model

In [None]:
model.save(f'{OUT_DIR}models/{YEAR}_ep-20.model')

### Load models

In [None]:
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019_ep-20.model')

In [None]:
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020_ep-20.model')

### Align models

In [None]:
model_2019_vocab = len(model_2019.wv.key_to_index)
model_2020_vocab = len(model_2020.wv.key_to_index)

In [None]:
smart_procrustes_align_gensim(model_2019, model_2020)

In [None]:
# Sanity check: the 2019 vocabulary size must equal the number of vectors in the 2020 model.
assert len(model_2019.wv.key_to_index) == len(model_2020.wv.vectors)

In [None]:
# One row per model with its recorded vocabulary size, plus the current
# size of the 2019 model's vocabulary labelled as the intersection.
vocab_rows = [
    ['2019', model_2019_vocab],
    ['2020', model_2020_vocab],
    ['intersection', len(model_2019.wv.key_to_index)],
]
models_vocab = pd.DataFrame(vocab_rows, columns=['Model', 'Words'])

models_vocab

In [None]:
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

### Measure distances

In [None]:
distances = measure_distances(model_2019, model_2020)

TODO: filter by true type frequency. The type frequencies reported by `Gensim` seem incorrect; they probably reflect frequency ranks rather than total counts.

In [None]:
blacklist_lex = load_blacklist_lex()

k = 500
freq_min = 100

# Keep lexemes that exceed the frequency threshold in both models, are purely
# alphabetic, are longer than three characters, and are not blacklisted.
cands_filtered = (
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .query('lex.str.isalpha() == True')
    .query('lex.str.len() > 3')
    .query('lex not in @blacklist_lex')
)

# Rank the survivors by semantic distance and keep the top k.
sem_change_cands = cands_filtered.nlargest(k, 'dist_sem').reset_index(drop=True)

sem_change_cands

In [None]:
# Export table: top 100 candidates with a 1-based rank column and the
# distance rendered as a two-decimal string.
top_cands = sem_change_cands.nlargest(100, 'dist_sem')
sem_change_cands_out = (
    top_cands
    .assign(
        index_1=top_cands.index + 1,
        dist_sem=top_cands['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

In [None]:
sem_change_cands_out.to_csv(
        f'{OUT_DIR}sem_change_cands.csv',
        columns=['', 'Lexeme', 'SemDist'],
        index=False
    )

### Inspect nearest neighbours of lexemes

In [None]:
LEX_NBS = 'ahahahah'

In [None]:
# Look up neighbours of LEX_NBS in the 2019 and 2020 models (k=10, freq_min=1)
# and show both result tables.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=1,
    model_1=model_2019,
    model_2=model_2020,
    k=10,
)

display(nbs_model_1, nbs_model_2)

Not related to Covid:

- sunsetting: > gaming-related meaning in 2020
- childe: > gaming-related proper name in 2020
- megalodon: > gaming-related proper name in 2020
- newf: (derogatory) slang term for people from Newfoundland (Canada)
- chaz: > Capitol Hill Autonomous Zone (CHAZ)
- klee: > computer game character, proper name
- rittenhouse: whiskey brand > proper name, involved in shooting related to BLM protests

Related to Covid:

- cerb: > Canada Emergency Response Benefit for Covid
- vacuo: > medical term, 'vacuum'
- moderna: > vaccine

## Social semantic variation

### Inspect subreddits

#### read comments

In [None]:
comments_dir_path = Path('../data/comments/lexeme/')

In [None]:
# Collect every CSV in the lexeme comment directory whose name starts with "Covid".
# The pattern has no placeholders, so it is a plain string rather than an f-string.
comments_paths = list(comments_dir_path.glob('Covid*.csv'))

In [None]:
%%time
comments = read_comm_csvs(comments_paths)
comments

TODO: filter comments

- [ ] remove duplicates
- [ ] remove bots

#### get subreddit counts

In [None]:
subr_counts = get_subr_counts(comments)

In [None]:
subr_counts_plt = plot_subr_counts(subr_counts, k=20)
subr_counts_plt

In [None]:
subr_counts_plt.save(f'{OUT_DIR}subr_counts.png', scale_factor=2.0)

### Train models

In [None]:
COMMENTS_DIR_SUBR = '../data/comments/subr/'

In [None]:
SUBR = 'conspiracy'

In [None]:
fpaths = get_comments_paths_subr(COMMENTS_DIR_SUBR, SUBR)

In [None]:
%%time
comments = read_comm_csvs(fpaths)

In [None]:
%%time
comments_clean = clean_comments(comments)

In [None]:
docs = comments_clean['body']

In [None]:
docs = docs.to_list()

In [None]:
import pickle

In [None]:
with open(f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle', 'wb') as fp:
    pickle.dump(docs, fp)

In [None]:
# Read the cleaned documents for SUBR back from their pickle file.
# The path must be an f-string: without the prefix, the literal text
# '{OUT_DIR}docs_clean/subr_{SUBR}.pickle' is opened instead of the real file.
with open(f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle', 'rb') as fp:
    docs = pickle.load(fp)

Corpus information

| Subreddit          | Comments  | DateFirst  | DateLast   |
|:-------------------|---------: |:-----------|:-----------|
| LockdownSkepticism |   520,392 | 2020-03-26 | 2020-12-27 |  
| Coronavirus        | 4,121,144 | 2020-01-21 | 2020-12-27 |
| conspiracy         | 3,973,514 | 2020-01-01 | 2020-12-27 |

In [None]:
corpus = Corpus(docs)

In [None]:
%%time
model = train_model(corpus)

In [None]:
len(model.wv.key_to_index)

In [None]:
model.save(f'{OUT_DIR}models/{SUBR}.model')

### Load models

In [None]:
SUBRS = ['Coronavirus', 'LockdownSkepticism']

In [None]:
model_1 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[0]}.model')

In [None]:
model_2 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[1]}.model')

### Align models

In [None]:
model_1_vocab = len(model_1.wv.key_to_index)
model_2_vocab = len(model_2.wv.key_to_index)

In [None]:
smart_procrustes_align_gensim(model_1, model_2)

37317 37317
37317 37317


<gensim.models.word2vec.Word2Vec at 0x187dbcfa0>

In [None]:
# Sanity check: model_1's vocabulary size must equal the number of vectors in model_2.
assert len(model_1.wv.key_to_index) == len(model_2.wv.vectors)

In [None]:
# One row per subreddit model with its recorded vocabulary size, plus the
# current size of model_1's vocabulary labelled as the intersection.
vocab_rows = [
    [SUBRS[0], model_1_vocab],
    [SUBRS[1], model_2_vocab],
    ['intersection', len(model_1.wv.key_to_index)],
]
models_vocab = pd.DataFrame(vocab_rows, columns=['Model', 'Words'])

models_vocab

Unnamed: 0,Model,Words
0,Coronavirus,94816
1,LockdownSkepticism,38926
2,intersection,37317


In [None]:
models_vocab.to_csv(f'{OUT_DIR}models_subrs_vocab.csv', index=False)

### Measure distances

In [None]:
distances = measure_distances(model_1, model_2)

#### words that differ the most between both communities

In [None]:
blacklist_lex = load_blacklist_lex()

k = 20
freq_min = 100

# Keep lexemes that exceed the frequency threshold in both models, are purely
# alphabetic, are longer than three characters, and are not blacklisted.
cands_filtered = (
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .query('lex.str.isalpha() == True')
    .query('lex.str.len() > 3')
    .query('lex not in @blacklist_lex')
)

# Rank the survivors by semantic distance and keep the top k.
sem_change_cands = cands_filtered.nlargest(k, 'dist_sem').reset_index(drop=True)

sem_change_cands

Unnamed: 0,lex,dist_sem,freq_1,freq_2
0,plandemic,0.892523,789,138
1,scams,0.889811,964,167
2,vigorous,0.866856,647,114
3,likewise,0.843846,1444,251
4,borderline,0.829629,34936,6561
5,examining,0.827804,1337,234
6,review,0.824861,20052,3647
7,improved,0.822236,17517,3032
8,examination,0.813457,1314,229
9,blurred,0.807373,634,112


In [None]:
# Export table: up to 100 candidates with a 1-based rank column and the
# distance rendered as a two-decimal string.
top_cands = sem_change_cands.nlargest(100, 'dist_sem')
sem_change_cands_out = (
    top_cands
    .assign(
        index_1=top_cands.index + 1,
        dist_sem=top_cands['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

# Write only the rank, lexeme and distance columns.
sem_change_cands_out.to_csv(
    f'{OUT_DIR}sem_var_soc_cands.csv',
    columns=['', 'Lexeme', 'SemDist'],
    index=False,
)

#### nearest neighbours for target lexemes in both communities

In [None]:
LEX_NBS = 'lockdown'

In [None]:
# Look up neighbours of LEX_NBS in both subreddit models (k=10, freq_min=50)
# and show both result tables.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=50,
    model_1=model_1,
    model_2=model_2,
    k=10,
)

display(nbs_model_1, nbs_model_2)

Unnamed: 0,model,lex,similarity,freq
0,1,shutdown,0.844873,8598
1,1,lockdowns,0.768522,50035
2,1,shutdowns,0.661069,3037
3,1,curfew,0.613175,1652
4,1,quarantine,0.600933,40419
5,1,restrictions,0.581443,31971
6,1,quarantines,0.568791,2767
7,1,quarentine,0.540457,454
8,1,curfews,0.53641,751
9,1,containment,0.51541,4363


Unnamed: 0,model,lex,similarity,freq
29861,2,lockdowns,0.737042,9460
29862,2,shutdown,0.727802,1489
29863,2,lockdowners,0.599373,156
29864,2,shutdowns,0.584469,545
29865,2,maskers,0.572266,660
29866,2,vaxxers,0.557066,465
29867,2,masker,0.541411,177
29868,2,vax,0.522942,504
29869,2,vaxx,0.508261,132
29870,2,lock,0.490704,9527


#### embeddings projection

In [None]:
from scipy import spatial

In [None]:
import altair as alt

In [None]:
# Pair each subreddit name with its loaded model.
models = [
    {'subreddit': SUBRS[0], 'model': model_1},
    {'subreddit': SUBRS[1], 'model': model_2},
]

In [None]:
def make_sem_axis(model, pole_word_1: str, pole_word_2: str):
    """Build a semantic axis as the difference between two pole-word vectors.

    Args:
        model: Object exposing `wv.get_vector(word)`, e.g. a loaded Word2Vec model.
        pole_word_1: Word whose vector forms the positive end of the axis.
        pole_word_2: Word whose vector forms the negative end of the axis.

    Returns:
        The vector of `pole_word_1` minus the vector of `pole_word_2`,
        both looked up in `model`.
    """
    # Read from the arguments rather than notebook globals, so the axis is
    # built from the model and pole words that the caller actually passed in.
    pole_1_vec = model.wv.get_vector(pole_word_1)
    pole_2_vec = model.wv.get_vector(pole_word_2)
    return pole_1_vec - pole_2_vec


In [None]:
def get_axis_sim(lex: str, pole_word_1: str, pole_word_2: str, model):
    """Cosine similarity between a lexeme's vector and a two-pole semantic axis.

    Args:
        lex: Lexeme to look up in `model`.
        pole_word_1: First pole word, forwarded to `make_sem_axis`.
        pole_word_2: Second pole word, forwarded to `make_sem_axis`.
        model: Object exposing `wv.get_vector(word)`.

    Returns:
        One minus the cosine distance between the lexeme vector and the axis.
    """
    axis_vec = make_sem_axis(model, pole_word_1, pole_word_2)
    word_vec = model.wv.get_vector(lex)
    cos_dist = spatial.distance.cosine(word_vec, axis_vec)
    return 1 - cos_dist

In [None]:
lex = 'lockdown'
pole_1 = 'good'
pole_2 = 'bad'

In [None]:
# Print the axis similarity of `lex` for each subreddit model.
# The loop variable is not called `model`, so the notebook-level `model`
# (the trained Word2Vec object) is not overwritten by a dict.
for subr_model in models:
    axis_sim = get_axis_sim(lex, pole_1, pole_2, subr_model['model'])
    print(f'{subr_model["subreddit"]}: {axis_sim}')

Coronavirus: -0.08022712916135788
LockdownSkepticism: -0.12990982830524445


In [None]:
# Lexemes whose similarity to the pole_1/pole_2 axis is computed for every model.
# NOTE(review): the last row ('thing', 'tree', ...) looks like neutral control words — confirm.
lexs = [
	'lockdown', 'lockdowns', 
	'shutdown', 'shutdowns', 
	'vaccine', 'vaccines', 
	'mask', 'masks',
	'order', 'police',
	'thing', 'tree', 'yellow', 'give'
	]

In [None]:
# One record per (lexeme, subreddit) pair, lexemes in the outer loop.
# A comprehension keeps the loop names local, so the notebook-level `lex`
# and `model` variables are not overwritten as a side effect of this cell.
sims = [
    {
        'subreddit': subr_model['subreddit'],
        'lex': target_lex,
        'sim': get_axis_sim(target_lex, pole_1, pole_2, subr_model['model']),
    }
    for target_lex in lexs
    for subr_model in models
]

In [None]:
sims_df = pd.DataFrame(sims)

# One line per subreddit: similarity on x, lexemes on y in their original order.
sims_chart = (
    alt.Chart(sims_df)
    .mark_line(point=True)
    .encode(
        x='sim',
        y=alt.Y('lex', sort=None),
        color='subreddit',
    )
)
sims_chart


#### biggest discrepancies in nearest neighbours for target lexemes

In [None]:
# Fetch a very large neighbour list (k=100_000) for the target lexeme from
# both models so the two similarity tables can be compared afterwards.
# `LEX` is never defined in this notebook; the target-lexeme constant is `LEX_NBS`.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=150,
    model_1=model_1,
    model_2=model_2,
    k=100_000
)

In [None]:
# Inner-join both neighbour tables on the lexeme; shared column names get
# the suffixes _1 / _2 for the first and second model respectively.
nbs_diffs = nbs_model_1.merge(
    nbs_model_2,
    on='lex',
    suffixes=('_1', '_2'),
)

In [None]:
# Add the absolute similarity gap, sort by it (largest first), renumber the
# rows, then drop lexemes shorter than four characters.
nbs_diffs = (
    nbs_diffs
    .assign(sim_diff=lambda df: (df['similarity_1'] - df['similarity_2']).abs())
    .sort_values('sim_diff', ascending=False)
    .reset_index(drop=True)
    .query('lex.str.len() >= 4')
)

In [None]:
topn = 10

# Split the rows by which model shows the higher similarity to the target lexeme.
higher_in_1 = nbs_diffs.query('similarity_1 > similarity_2')
higher_in_2 = nbs_diffs.query('similarity_2 > similarity_1')

# Within each side, keep the rows with the largest similarity gaps.
subr_1_nbs = higher_in_1.nlargest(topn, 'sim_diff')
subr_2_nbs = higher_in_2.nlargest(topn, 'sim_diff')

display(subr_1_nbs, subr_2_nbs)