In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *
from neocov.type_emb import *
from neocov.communities import *

In [None]:
from pathlib import Path
import pandas as pd
pd.set_option('display.max_rows', 100)
import altair as alt
from altair_saver import save
from gensim.models import Word2Vec

In [None]:
# Root of the input data, relative to the notebook's working directory.
DATA_DIR = '../data/'
# Directory handed to `get_comments_paths_year` to locate the comment files
# for the per-year (diachronic) models.
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'
# Root for everything this notebook writes: pickled docs, models, CSV tables.
OUT_DIR = '../out/'

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Semantic change

In [None]:
YEAR = '2020'

### Read data

#### Get file paths

In [None]:
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

#### Read comments

In [None]:
%%time
comments = read_comm_csvs(comment_paths_year)

In [None]:
comments

### Pre-process comments

In [None]:
%%time
comments_clean = clean_comments(comments)

In [None]:
docs = comments_clean['body'].to_list()

In [None]:
import pickle

In [None]:
# Persist the cleaned documents for this year; the following cell reads the
# same file back.
with open(f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle', 'wb') as fp:
    pickle.dump(docs, fp)

In [None]:
# Reload the cleaned documents for this year from disk.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# written by this notebook.
with open(f'{OUT_DIR}docs_clean/diac_{YEAR}.pickle', 'rb') as fp:
    docs = pickle.load(fp)

### Train models

#### Create corpus

In [None]:
corpus = Corpus(docs)

#### Train model

In [None]:
%%time
model = train_model(corpus, EPOCHS=20)

In [None]:
len(model.wv.key_to_index)

#### Save model

In [None]:
model.save(f'{OUT_DIR}models/{YEAR}_ep-20.model')

### Load models

In [None]:
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019_ep-20.model')

In [None]:
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020_ep-20.model')

### Align models

In [None]:
# Record each model's vocabulary size *before* alignment; the size measured
# after alignment is reported as the 'intersection' row of `models_vocab`.
model_2019_vocab = len(model_2019.wv.key_to_index)
model_2020_vocab = len(model_2020.wv.key_to_index)

In [None]:
smart_procrustes_align_gensim(model_2019, model_2020)

In [None]:
# After alignment both models are expected to hold the same vocabulary, so
# compare vocabulary with vocabulary (the same check is used for the
# community models further down) ...
assert len(model_2019.wv.key_to_index) == len(model_2020.wv.key_to_index)
# ... and make sure the vector matrix still has one row per vocabulary entry.
assert len(model_2020.wv.key_to_index) == len(model_2020.wv.vectors)

In [None]:
# Vocabulary sizes per model: the two pre-alignment sizes recorded earlier and
# the size of the 2019 vocabulary as it stands now, labelled 'intersection'.
models_vocab = pd.DataFrame({
    'Model': ['2019', '2020', 'intersection'],
    'Words': [
        model_2019_vocab,
        model_2020_vocab,
        len(model_2019.wv.key_to_index),
    ],
})

models_vocab

In [None]:
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

### Measure distances

In [None]:
distances = measure_distances(model_2019, model_2020)

TODO: filter by true type frequency; `Gensim`'s type frequency seems incorrect; it probably reflects frequency ranks instead of total counts.

In [None]:
# Filter settings: lexemes to exclude, number of candidates to keep, and the
# frequency both `freq_1` and `freq_2` must exceed.
blacklist_lex = load_blacklist_lex()
k, freq_min = 20, 100

# Keep purely alphabetic lexemes longer than three characters that pass the
# frequency threshold and are not blacklisted, then take the k lexemes with
# the largest semantic distance.
sem_change_cands = (
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .query('lex.str.isalpha() == True')
    .query('lex.str.len() > 3')
    .query('lex not in @blacklist_lex')
    .nlargest(k, 'dist_sem')
    .reset_index(drop=True)
)

sem_change_cands

In [None]:
# Export view of the candidates: a 1-based rank column (given an empty header
# by the rename) and the distance rendered as a two-decimal string.
sem_change_cands_out = (
    sem_change_cands
    .nlargest(100, 'dist_sem')
    .assign(
        index_1=lambda cands: cands.index + 1,
        dist_sem=lambda cands: cands['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

In [None]:
# Write only the rank column (empty header), the lexeme, and the formatted
# distance; the DataFrame index itself is not exported.
sem_change_cands_out.to_csv(
        f'{OUT_DIR}sem_change_cands.csv',
        columns=['', 'Lexeme', 'SemDist'],
        index=False
    )

### Inspect nearest neighbours of lexemes

In [None]:
LEX_NBS = 'lockdowns'

In [None]:
# Nearest neighbours of the target lexeme in the 2019 and the 2020 model,
# displayed one after the other for comparison.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=1,
    model_1=model_2019,
    model_2=model_2020,
    k=10,
)

display(nbs_model_1, nbs_model_2)

Not related to Covid:

- sunsetting: > gaming-related meaning in 2020
- childe: > gaming-related proper name in 2020
- megalodon: > gaming-related proper name in 2020
- newf: (derogatory) slang term for people from Newfoundland (Canada)
- chaz: > Capitol Hill Autonomous Zone (CHAZ)
- klee: > computer game character, proper name
- rittenhouse: whiskey brand > proper name, involved in shooting related to BLM protests

Related to Covid:

- cerb: > Canada Emergency Response Benefit for Covid
- vacuo: > medical term, 'vacuum'
- moderna: > vaccine

## Social semantic variation

### Inspect subreddits

#### read comments

In [None]:
comments_dir_path = Path('../data/comments/lexeme/')

In [None]:
# `Path.glob` yields matches in arbitrary, filesystem-dependent order; sort
# them so the files are always read (and concatenated) in the same order.
comments_paths = sorted(comments_dir_path.glob('Covid*.csv'))

In [None]:
%%time
comments = read_comm_csvs(comments_paths)
comments

#### get subreddit counts

In [None]:
subr_counts = get_subr_counts(comments)

In [None]:
subr_counts_plt = plot_subr_counts(subr_counts, k=15)
subr_counts_plt

In [None]:
subr_counts_plt.save(f'{OUT_DIR}subr_counts.png', scale_factor=2.0)

### Train models

In [None]:
COMMENTS_DIR_SUBR = '../data/comments/subr/'

In [None]:
SUBR = 'conspiracy'

In [None]:
fpaths = get_comments_paths_subr(COMMENTS_DIR_SUBR, SUBR)

In [None]:
%%time
comments = read_comm_csvs(fpaths)

In [None]:
%%time
comments_clean = clean_comments(comments)

In [None]:
docs = comments_clean['body']

In [None]:
docs = docs.to_list()

In [None]:
import pickle

In [None]:
# Persist the cleaned documents of the current subreddit; the following cell
# reads them back from the same location.
with open(f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle', 'wb') as fp:
    pickle.dump(docs, fp)

In [None]:
# Reload the cleaned documents of the current subreddit. The path must be an
# f-string so that OUT_DIR and SUBR are interpolated; it has to match the path
# the documents were dumped to.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# written by this notebook.
with open(f'{OUT_DIR}docs_clean/subr_{SUBR}.pickle', 'rb') as fp:
    docs = pickle.load(fp)

Corpus information

| Subreddit          | Comments  | DateFirst  | DateLast   |
|:-------------------|---------: |:-----------|:-----------|
| LockdownSkepticism |   520,392 | 2020-03-26 | 2020-12-27 |  
| Coronavirus        | 4,121,144 | 2020-01-21 | 2020-12-27 |
| conspiracy         | 3,973,514 | 2020-01-01 | 2020-12-27 |

In [None]:
corpus = Corpus(docs)

In [None]:
%%time
model = train_model(corpus)

In [None]:
len(model.wv.key_to_index)

In [None]:
model.save(f'{OUT_DIR}models/{SUBR}.model')

### Load models

In [None]:
model_names = ['Coronavirus', 'conspiracy']
# model_names = ['Coronavirus', 'LockdownSkepticism']

In [None]:
# Load one trained model per community. Each entry bundles the community
# name, the model's file path, and the loaded Word2Vec model.
# The path is built from OUT_DIR (like the path the models are saved to), and
# the loop variables are named so that they do not overwrite the `model`
# object trained earlier in the notebook.
models = []
for model_name in model_names:
    model_path = f'{OUT_DIR}models/{model_name}.model'
    models.append({
        'name': model_name,
        'path': model_path,
        'model': Word2Vec.load(model_path),
    })

models

[{'name': 'Coronavirus',
  'path': '../out/models/Coronavirus.model',
  'model': <gensim.models.word2vec.Word2Vec at 0x16d3aa1d0>},
 {'name': 'conspiracy',
  'path': '../out/models/conspiracy.model',
  'model': <gensim.models.word2vec.Word2Vec at 0x16df29f90>}]

### Align models

In [None]:
# Store each model's vocabulary size on its dict before the alignment step
# below; `models_vocab` later reports these values as 'Words'.
for model in models:
	model['vocab'] = len(model['model'].wv.key_to_index)

In [None]:
smart_procrustes_align_gensim(models[0]['model'], models[1]['model'])

In [None]:
assert len(models[0]['model'].wv.key_to_index) == len(models[1]['model'].wv.key_to_index)

In [None]:
# Two-column overview (model name, vocabulary size) built from the model dicts.
models_vocab = (
    pd.DataFrame(models)
    .filter(items=['name', 'vocab'])
    .rename(columns={'name': 'Model', 'vocab': 'Words'})
)

models_vocab

In [None]:
# Write the vocabulary overview below OUT_DIR (instead of a hard-coded
# '../out/'), so all outputs follow the configured output directory.
models_vocab.to_csv(
    f'{OUT_DIR}vocabs/vocab_{models[0]["name"]}--{models[1]["name"]}.csv',
    index=False,
)

### Measure distances

In [None]:
distances = measure_distances(models[0]['model'], models[1]['model'])

#### words that differ the most between both communities

In [None]:
# Filter settings: lexemes to exclude, number of candidates to keep, and the
# frequency both `freq_1` and `freq_2` must exceed.
blacklist_lex = load_blacklist_lex()
k, freq_min = 20, 100

# Keep purely alphabetic lexemes longer than three characters that pass the
# frequency threshold and are not blacklisted, then take the k lexemes with
# the largest semantic distance between the two community models.
sem_change_cands = (
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .query('lex.str.isalpha() == True')
    .query('lex.str.len() > 3')
    .query('lex not in @blacklist_lex')
    .nlargest(k, 'dist_sem')
    .reset_index(drop=True)
)

sem_change_cands

In [None]:
# Export view of the candidates: a 1-based rank column (given an empty header
# by the rename) and the distance rendered as a two-decimal string.
sem_change_cands_out = (
    sem_change_cands
    .nlargest(100, 'dist_sem')
    .assign(
        index_1=lambda cands: cands.index + 1,
        dist_sem=lambda cands: cands['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

# Only rank, lexeme, and formatted distance are written; the index is omitted.
sem_change_cands_out.to_csv(
    f'{OUT_DIR}sem_var_soc_cands.csv',
    columns=['', 'Lexeme', 'SemDist'],
    index=False,
)

#### nearest neighbours for target lexemes in both communities

In [None]:
LEX_NBS = 'plandemic'

In [None]:
# Nearest neighbours of the target lexeme in the first and the second
# community model, displayed one after the other for comparison.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS,
    freq_min=10,
    model_1=models[0]['model'],
    model_2=models[1]['model'],
    k=10,
)

display(nbs_model_1, nbs_model_2)

#### biggest discrepancies in nearest neighbours for target lexemes

In [None]:
# Neighbour lists for 'vaccine' in both community models.
# NOTE(review): k=100_000 is presumably meant to return (nearly) the whole
# vocabulary so the two lists can be compared word by word -- confirm.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex='vaccine',
    freq_min=150,
    model_1=models[0]['model'],
    model_2=models[1]['model'],
    k=100_000,
)

In [None]:
# Pair up the neighbours that occur in both lists (inner join on the lexeme);
# columns present in both inputs get the suffix '_1' or '_2'.
nbs_diffs = pd.merge(
    left=nbs_model_1,
    right=nbs_model_2,
    how='inner',
    on='lex',
    suffixes=('_1', '_2'),
)

In [None]:
# Absolute difference between the two similarity values per neighbour, largest
# first; neighbours shorter than four characters are dropped at the end.
nbs_diffs = (
    nbs_diffs
    .assign(sim_diff=lambda df: (df['similarity_1'] - df['similarity_2']).abs())
    .sort_values('sim_diff', ascending=False)
    .reset_index(drop=True)
    .query('lex.str.len() >= 4')
)

In [None]:
topn = 10

# Neighbours that are more similar to the target in the first model ...
subr_1_nbs = (
    nbs_diffs
    .query('similarity_1 > similarity_2')
    .nlargest(topn, 'sim_diff')
)

# ... and those that are more similar to it in the second model.
subr_2_nbs = (
    nbs_diffs
    .query('similarity_2 > similarity_1')
    .nlargest(topn, 'sim_diff')
)

display(subr_1_nbs, subr_2_nbs)

### Project embeddings into subspaces

In [None]:
lexs = [
	'regulations', 'politics',
	'government', 'mandate', 
	'science', 'research',
	'shutdown', 'shutdowns', 
	'lockdown', 'lockdowns', 
	'vaccine', 'vaccines', 
	'mask', 'masks',
	]

#### _good_ vs _bad_

In [None]:
pole_words_pos = ['good', 'bad']

In [None]:
proj_sims_pos = get_axis_sims(lexs, models, pole_words_pos, k=10)

In [None]:
# Line chart with point markers: 'sim' on the x-axis, one row per lexeme
# (kept in data order via sort=None), one line per subreddit.
proj_sims_pos_chart = (
    alt.Chart(proj_sims_pos)
    .mark_line(point=True)
    .encode(
        x='sim',
        y=alt.Y('lex', sort=None),
        color='subreddit',
    )
    .properties(title=f'{pole_words_pos[0]} vs {pole_words_pos[1]}')
)

proj_sims_pos_chart

In [None]:
# Save below OUT_DIR (instead of a hard-coded '../out/'), so all outputs
# follow the configured output directory.
proj_sims_pos_chart.save(
    f'{OUT_DIR}proj-emb_pos_{models[0]["name"]}--{models[1]["name"]}.pdf'
)

#### _objective_ vs _subjective_

In [None]:
pole_words_subj = ['objective', 'subjective']

In [None]:
proj_sims_subj = get_axis_sims(lexs, models, pole_words_subj, k=10)

In [None]:
# Line chart with point markers: 'sim' on the x-axis, one row per lexeme
# (kept in data order via sort=None), one line per subreddit.
proj_sims_subj_chart = (
    alt.Chart(proj_sims_subj)
    .mark_line(point=True)
    .encode(
        x='sim',
        y=alt.Y('lex', sort=None),
        color='subreddit',
    )
    .properties(title=f'{pole_words_subj[0]} vs {pole_words_subj[1]}')
)

proj_sims_subj_chart

In [None]:
# Save below OUT_DIR (instead of a hard-coded '../out/'), so all outputs
# follow the configured output directory.
proj_sims_subj_chart.save(
    f'{OUT_DIR}proj-emb_subj_{models[0]["name"]}--{models[1]["name"]}.pdf'
)

### Plot embedding space

In [None]:
# One record per (lexeme, community) pair holding the lexeme's vector in that
# community's model. The community label is read from the 'name' key, which
# is the key the model dicts are created with (they have no 'subreddit' key).
# NOTE(review): `lex_vecs` is rebuilt from scratch in the next cell, so this
# result is currently not used any further.
lex_vecs = [
    {
        'lex': lex,
        'subreddit': model['name'],
        'vec': model['model'].wv.get_vector(lex),
    }
    for lex in lexs
    for model in models
]

In [None]:
# Target word whose neighbourhood is plotted; `lex` is also used in the title
# of the chart built further down.
lex = 'lockdown'
lex_vecs = []

# Per community: one 'center' record for the target word followed by one 'nb'
# record for each of its 50 most similar words, each with its vector.
# The community label is read from the 'name' key, which is the key the model
# dicts are created with (they have no 'subreddit' key).
for model in models:
    wv = model['model'].wv
    subreddit = model['name']
    lex_vecs.append({
        'lex': lex,
        'type': 'center',
        'subreddit': subreddit,
        'vec': wv.get_vector(lex),
    })
    for nb, _sim in wv.most_similar(lex, topn=50):
        lex_vecs.append({
            'lex': nb,
            'type': 'nb',
            'subreddit': subreddit,
            'vec': wv.get_vector(nb),
        })

In [None]:
vecs_df = pd.DataFrame(lex_vecs)
vecs_df

In [None]:
from sklearn.manifold import TSNE

In [None]:

# Reduce the embedding vectors with t-SNE and keep the first two output
# coordinates for plotting.
Y_tsne = TSNE(
    perplexity=70,
    method='exact',
    init='pca',
    random_state=42,  # fixed seed so the layout is reproducible across runs
    verbose=True,
).fit_transform(list(vecs_df['vec']))

# Plain 1-D column slices: one coordinate value per row of `vecs_df`.
vecs_df['x_tsne'] = Y_tsne[:, 0]
vecs_df['y_tsne'] = Y_tsne[:, 1]


In [None]:
# Interval selection driven by mouse events with the alt key held down; it
# controls the colouring of the text marks below.
brush = alt.selection(
    type="interval",
    on="[mousedown[event.altKey], mouseup] > mousemove",
    translate="[mousedown[event.altKey], mouseup] > mousemove!",
    zoom="wheel![event.altKey]",
)

# Interval selection bound to the scales (pan / zoom), driven by mouse events
# without the alt key.
interaction = alt.selection(
    type="interval",
    bind="scales",
    on="[mousedown[!event.altKey], mouseup] > mousemove",
    translate="[mousedown[!event.altKey], mouseup] > mousemove!",
    zoom="wheel![!event.altKey]",
)

# Text marks at the t-SNE coordinates, one column of charts per subreddit.
# 'center' rows are drawn larger than the others; marks outside the brush
# selection are drawn in light gray.
chart = (
    alt.Chart(vecs_df)
    .mark_text(point=True)
    .encode(
        x='x_tsne',
        y='y_tsne',
        text='lex',
        size=alt.condition("datum.type == 'center'", alt.value(25), alt.value(10)),
        color=alt.condition(brush, 'subreddit', alt.value('lightgray')),
        column='subreddit',
    )
    .properties(title=f"Social semantic variation for the word '{lex}'.")
    .add_selection(brush, interaction)
)

chart

Link to interactive chart: <https://wuqui.github.io/neocov/#Plot-embedding-space>

Press and hold the <kbd>alt</kbd> key to select regions of the semantic space.