In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp type_emb

In [None]:
#export
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from scipy import spatial
import altair as alt
from sklearn.manifold import TSNE

# Type embeddings

## train model

### create corpus

In [None]:
docs = [
    ['A', 'test', 'sentence'],
    ['Another', 'test', 'sentence']
]

In [None]:
#export
class Corpus:
    """Iterable wrapper around a collection of tokenised documents.

    Iterating over an instance yields the stored documents one at a time,
    each document being a sentence given as a list of str tokens.
    """

    def __init__(self, docs):
        # The documents are stored as given, without copying or cleaning.
        self.docs_clean = docs

    def __iter__(self):
        # A new generator is created on every call to iter().
        yield from self.docs_clean

In [None]:
corpus = Corpus(docs)

### train model

In [None]:
#export
def train_model(corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=5, EPOCHS=5):
    """Train a Word2Vec model on `corpus` and return it.

    Each upper-case argument is forwarded to the `Word2Vec` constructor:
    MIN_COUNT -> min_count, SIZE -> vector_size, WORKERS -> workers,
    WINDOW -> window, EPOCHS -> epochs.
    """
    hyperparams = {
        'min_count': MIN_COUNT,
        'vector_size': SIZE,
        'workers': WORKERS,
        'window': WINDOW,
        'epochs': EPOCHS,
    }
    return Word2Vec(corpus, **hyperparams)


In [None]:
model = train_model(corpus, MIN_COUNT=1)

## load models

In [None]:
model_names = ['Coronavirus', 'conspiracy']

In [None]:
#export
def make_model_dict(model_name: str, models_dir_path: str='../out/models/'):
    """Describe a stored model by its name and file path.

    Args:
        model_name: name of the model; also the stem of the model file.
        models_dir_path: directory holding the model files. A trailing
            path separator is optional.

    Returns:
        dict with the keys 'name' (the given name) and 'path'
        (`<models_dir_path>/<model_name>.model`).
    """
    import os  # local import so the exported block needs no extra file-level import

    # os.path.join only inserts a separator when the directory lacks one, so
    # the default directory produces exactly the same path as plain
    # concatenation while '../out/models' (no slash) no longer breaks.
    return {
        'name': model_name,
        'path': os.path.join(models_dir_path, f'{model_name}.model'),
    }

In [None]:
#data
# Build one record per model name: metadata from make_model_dict plus the
# Word2Vec model loaded from the record's 'path'.
models = []
for name in model_names:
	# dict with the keys 'name' and 'path'
	model = make_model_dict(name)
	# NOTE(review): `model` rebinds the name used for the Word2Vec instance
	# trained in the "train model" section above.
	model['model'] = Word2Vec.load(model['path'])
	models.append(model)

## align models

In [None]:
model_1 = train_model(corpus=[['The', 'bank', 'of', 'the', 'river']], MIN_COUNT=1)

In [None]:
model_2 = train_model(corpus=[['The', 'bank', 'of', 'England']], MIN_COUNT=1)

In [None]:
assert len(model_1.wv.key_to_index) != len(model_2.wv.vectors)


In [None]:
#export
def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2 (gensim 4 `KeyedVectors` attributes).
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Rows are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2),
    so that row i of m1.wv.vectors and row i of m2.wv.vectors belong to the same word.

    Both models are modified in place; for each `model.wv` this updates
        -- .vectors (rows for the shared vocabulary only),
        -- .index_to_key and .key_to_index,
        -- every per-key attribute array in .expandos (e.g. "count"), so that
           get_vecattr(word, "count") keeps returning the count of `word`,
        -- .norms, which is reset to None so stale cached norms are never reused.
    Nothing outside `model.wv` is touched.

    Returns the tuple (m1, m2).
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # No alignment is necessary only if both models hold exactly the common
    # vocabulary AND list it in the same row order; equal vocab sets alone do
    # not guarantee that row i means the same word in both models.
    same_vocab = not (vocab_m1 - common_vocab) and not (vocab_m2 - common_vocab)
    if same_vocab and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
        return (m1, m2)

    # Otherwise sort by frequency (summed for both). The counts are read here,
    # before either model is modified.
    common_vocab = sorted(
        common_vocab,
        key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"),
        reverse=True,
    )

    # Then for each model...
    for m in (m1, m2):
        kv = m.wv
        # old row index of every kept word, in the new order
        row_idx = np.asarray([kv.key_to_index[w] for w in common_vocab], dtype=int)

        # Replace the vector matrix (fancy indexing returns a new array)
        kv.vectors = kv.vectors[row_idx]

        # Per-key attributes are stored as arrays addressed by row index, so
        # they have to follow the same permutation as the vectors.
        expandos = getattr(kv, "expandos", None)
        if expandos:
            for attr, values in list(expandos.items()):
                expandos[attr] = np.asarray(values)[row_idx]

        # Replace the key <-> index mappings
        kv.index_to_key = list(common_vocab)
        kv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}

        # Cached norms describe the old matrix; drop them so they are recomputed on demand
        if getattr(kv, "norms", None) is not None:
            kv.norms = None

    return (m1, m2)

In [None]:
#export
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    First, intersect the vocabularies (see `intersection_align_gensim` documentation);
    this step modifies both models in place.
    Then compute the orthogonal matrix that best maps the normalized vectors of
    other_embed onto those of base_embed and replace `other_embed.wv.vectors`
    with the rotated vectors.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # The vector matrices may just have been replaced, so norms cached before
    # the intersection would be wrong (or have the wrong length): recompute them.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # cross-covariance between the two spaces
    m = other_vecs.T.dot(base_vecs)
    # np.linalg.svd returns the right factor already transposed
    u, _, vt = np.linalg.svd(m)
    # orthogonal map from the other space onto the base space
    ortho = u.dot(vt)
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed

In [None]:
smart_procrustes_align_gensim(model_1, model_2)

NameError: name 'model_1' is not defined

In [None]:
assert len(model_1.wv.key_to_index) == len(model_2.wv.vectors)


## measure distances between types

In [None]:
#export
def measure_distances(model_1, model_2):
    """Cosine distance between the two models' vectors for each word.

    Iterates over the vocabulary of `model_1` and looks every word up in both
    models, so each of those words must be present in `model_2` as well.

    Returns a DataFrame with one row per word and the columns
    'lex' (the word), 'dist_sem' (cosine distance between its two vectors),
    'freq_1' and 'freq_2' (its "count" attribute in model_1 / model_2).
    """
    kv_1, kv_2 = model_1.wv, model_2.wv
    rows = []
    for word in kv_1.index_to_key:
        rows.append([
            word,
            spatial.distance.cosine(kv_1[word], kv_2[word]),
            kv_1.get_vecattr(word, "count"),
            kv_2.get_vecattr(word, "count"),
        ])
    return pd.DataFrame(rows, columns=('lex', 'dist_sem', "freq_1", "freq_2"))


In [None]:
distances = measure_distances(model_1, model_2)

In [None]:
distances\
    .sort_values('dist_sem', ascending=False)


## get nearest neighbours of lexemes for 2 models

In [None]:
#export
def get_nearest_neighbours_models(lex, freq_min, model_1, model_2, topn=100_000, k=10):
    """Nearest neighbours of `lex` in each of two models.

    Args:
        lex: the word whose neighbours are looked up.
        freq_min: neighbours are kept only if their "count" attribute is
            strictly greater than this value.
        model_1, model_2: models whose `.wv` provides `most_similar`,
            `get_vecattr` and `get_vector`.
        topn: number of candidates requested from `most_similar` per model
            before the frequency filter is applied.
        k: maximum number of neighbours returned per model.

    Returns:
        (nbs_model_1, nbs_model_2): one DataFrame per model with the columns
        'Model' (1 or 2), 'Word', 'SemDist' (1 - similarity, rounded to two
        decimals), 'Freq' and 'vec' (the neighbour's vector), holding the `k`
        rows with the smallest 'SemDist'. A frame is empty, but keeps these
        columns, when no neighbour passes the frequency filter.
    """
    columns = ['Model', 'Word', 'SemDist', 'Freq', 'vec']
    nbs = []
    for model_no, model in enumerate((model_1, model_2), start=1):
        kv = model.wv
        for nb, sim in kv.most_similar(lex, topn=topn):
            freq = kv.get_vecattr(nb, 'count')
            if freq > freq_min:
                nbs.append({
                    'Model': model_no,
                    'Word': nb,
                    'SemDist': round(1 - sim, 2),
                    'Freq': freq,
                    # vector of the neighbour itself, not of `lex`
                    'vec': kv.get_vector(nb),
                })
    # Passing the columns explicitly keeps the schema when `nbs` is empty.
    nbs_df = pd.DataFrame(nbs, columns=columns)

    def _closest(model_no):
        """Up to k rows of one model, closest neighbours first."""
        subset = nbs_df[nbs_df['Model'] == model_no]
        # nsmallest is skipped for an empty subset, which is returned as is
        return subset.nsmallest(k, 'SemDist') if len(subset) else subset

    return _closest(1), _closest(2)

## project embeddings

In [None]:
#data
smart_procrustes_align_gensim(models[0]['model'], models[1]['model'])

In [None]:
#export
def get_pole_avg(model, lex: str, k=10):
    """Average the vector of `lex` with the vectors of its `k` most similar words.

    Returns the element-wise mean over the vector of `lex` itself and the
    vectors of the words returned by `model.wv.most_similar(positive=lex, topn=k)`.
    """
    kv = model.wv
    pole_vecs = [kv[lex]]
    pole_vecs.extend(kv[word] for word, _ in kv.most_similar(positive=lex, topn=k))
    return np.mean(pole_vecs, axis=0)

In [None]:
#export
def make_sem_axis_avg(model, pole_word_1: str, pole_word_2: str, k=10):
    """Build a semantic axis as the difference between two pole averages.

    Each pole is computed with `get_pole_avg(model, pole_word, k)`; the result
    is the first pole average minus the second.
    """
    pole_1, pole_2 = (
        get_pole_avg(model, pole_word, k)
        for pole_word in (pole_word_1, pole_word_2)
    )
    return pole_1 - pole_2

In [None]:
#export
def get_axis_sim(lex: str, pole_word_1: str, pole_word_2: str, model, k=10):
    """Cosine similarity between the vector of `lex` and a semantic axis.

    The axis comes from `make_sem_axis_avg(model, pole_word_1, pole_word_2, k)`;
    the similarity is one minus the cosine distance between the vector of
    `lex` and that axis.
    """
    axis = make_sem_axis_avg(model, pole_word_1, pole_word_2, k)
    cos_dist = spatial.distance.cosine(model.wv.get_vector(lex), axis)
    return 1 - cos_dist

In [None]:
#export
def get_axis_sims(lexs: list, models, pole_words: list, k=10):
    """Axis similarity of every word in `lexs` for every model in `models`.

    `models` is a sequence of dicts with the keys 'name' and 'model';
    `pole_words[0]` and `pole_words[1]` are the two pole words handed to
    `get_axis_sim`.

    Returns a DataFrame with one row per (word, model) pair, words varying
    slowest, and the columns 'model', 'lex' and 'sim'.
    """
    records = [
        {
            'model': model['name'],
            'lex': lex,
            'sim': get_axis_sim(lex, pole_words[0], pole_words[1], model['model'], k),
        }
        for lex in lexs
        for model in models
    ]
    return pd.DataFrame(records)

In [None]:
#export
def plot_emb_proj(proj_sims, pole_words):
    """Line chart (with point markers) of axis similarities per word and model.

    `proj_sims` supplies the fields 'sim' (x axis), 'lex' (y axis, in the
    order given) and 'model' (colour). The chart title is
    "<pole_words[0]> vs <pole_words[1]>".
    """
    heading = f'{pole_words[0]} vs {pole_words[1]}'
    encodings = dict(
        x=alt.X('sim', title='SemSim'),
        y=alt.Y('lex', title='', sort=None),
        color=alt.Color('model', title='Model'),
    )
    return (
        alt.Chart(proj_sims)
        .mark_line(point=True)
        .encode(**encodings)
        .properties(title=heading)
    )

In [None]:
pole_words = ['good', 'bad']
# pole_words = ['objective', 'subjective']

In [None]:
lexs = [
	'regulations', 'politics',
	'government', 'mandate', 
	'science', 'research',
	'shutdown', 'shutdowns', 
	'lockdown', 'lockdowns', 
	'vaccine', 'vaccines', 
	'mask', 'masks',
	# 'tree', 'food', 'drink', 'air', 'sun'
	# 'yellow', 'purple', 'orange' 
	# 'give', 'take',
	# 'you', 'i', 'the', 'of',
	# 'good', 'bad'
	]

In [None]:
#data
proj_sims = get_axis_sims(lexs, models, pole_words, k=10)

# get_axis_sims returns the columns 'model', 'lex' and 'sim'; there is no
# 'subreddit' column to colour by. The exported helper already encodes
# colour by 'model', so it is used instead of repeating the chart spec here.
proj_sims_chart = plot_emb_proj(proj_sims, pole_words)

proj_sims_chart


## map social embedding spaces

In [None]:
#data
smart_procrustes_align_gensim(models[0]['model'], models[1]['model'])

67181 67181
67181 67181


<gensim.models.word2vec.Word2Vec at 0x170a5b7f0>

In [None]:
lex = 'vaccines'

In [None]:
#export
def get_nbs_vecs(lex, model, k=50):
    """Collect `lex` and its `k` nearest neighbours with their vectors.

    `model` is a dict with the keys 'name' and 'model'.

    Returns a DataFrame whose first row describes `lex` itself
    (type 'center', no 'sim' or 'freq' value) followed by one row per
    neighbour (type 'nb') with its similarity to `lex`, its vector and its
    "count" attribute. 'subreddit' holds the model's name on every row.
    """
    source = model['name']
    kv = model['model'].wv

    rows = [{
        'lex': lex,
        'type': 'center',
        'subreddit': source,
        'vec': kv.get_vector(lex),
    }]
    rows.extend(
        {
            'lex': nb,
            'type': 'nb',
            'sim': sim,
            'subreddit': source,
            'vec': kv.get_vector(nb),
            'freq': kv.get_vecattr(nb, "count"),
        }
        for nb, sim in kv.most_similar(lex, topn=k)
    )
    return pd.DataFrame(rows)

In [None]:
#data
nbs_vecs = pd.concat([get_nbs_vecs(lex, model, k=750) for model in models])

In [None]:
#export
def dim_red_nbs_vecs(nbs_vecs, perplexity=50, random_state=None):
    """Add t-SNE coordinates for the vectors stored in `nbs_vecs['vec']`.

    Args:
        nbs_vecs: DataFrame with a 'vec' column holding one vector per row.
        perplexity: forwarded to `TSNE`.
        random_state: forwarded to `TSNE`; pass an int to make the layout
            reproducible. The default (None) leaves the run unseeded.

    Returns:
        The same DataFrame object, modified in place: the columns 'x_tsne'
        and 'y_tsne' are added (or overwritten) with the first and second
        t-SNE coordinate of every row.
    """
    vec_matrix = np.array(list(nbs_vecs['vec']))
    tsne = TSNE(
        perplexity=perplexity,
        method='exact',
        init='pca',
        verbose=False,
        learning_rate='auto',
        random_state=random_state,
    )
    coords = tsne.fit_transform(vec_matrix)

    # plain 1-D columns rather than (n, 1) slices
    nbs_vecs['x_tsne'] = coords[:, 0]
    nbs_vecs['y_tsne'] = coords[:, 1]

    return nbs_vecs


### common neighbours

In [None]:
#data
# t-SNE needs a strictly positive perplexity, so perplexity=0 cannot produce
# a usable layout; the helper's default is used instead.
nbs_vecs = dim_red_nbs_vecs(nbs_vecs)



In [None]:
#data
nbs_sim = (nbs_vecs
	.groupby('subreddit')
	.apply(lambda df: df.nlargest(10, 'sim'))
	.reset_index(drop=True)
)

In [None]:
#data
chart_sims = (alt.Chart(nbs_sim).mark_text().encode(
		x='x_tsne:Q',
		y='y_tsne:Q',
		text='lex',
		color='subreddit:N'
	))

chart_sims

In [None]:
#data
chart_sims.save(f'../out/map-sem-space_{lex}_sims.pdf')
chart_sims.save(f'../out/map-sem-space_{lex}_sims.html')

### differences in neighbours

In [None]:
#data
nbs_vecs = dim_red_nbs_vecs(nbs_vecs, perplexity=70)



In [None]:
#data
# final version

nbs_diff = nbs_vecs.drop_duplicates(subset='lex', keep=False)
nbs_diff = (nbs_diff
	.groupby('subreddit')
	.apply(lambda df: df.nlargest(20, 'sim'))
	.reset_index(drop=True)
)

In [None]:
#data
# final version

chart_diffs = (alt.Chart(nbs_diff).mark_text().encode(
		x='x_tsne:Q',
		y='y_tsne:Q',
		text='lex:N',
		color='subreddit:N',
		# column='subr_nb:N',
	)).interactive()


chart_diffs

In [None]:
#data
chart_diffs.save(f'../out/map-sem-space_{lex}_diffs.pdf')
chart_diffs.save(f'../out/map-sem-space_{lex}_diffs.html')

### pool bak

In [None]:
#data
# Keep only lexemes that occur in more than one row of nbs_vecs.
nbs_vecs_grp = pd.concat(g for _, g in nbs_vecs.groupby("lex") if len(g) > 1)

# Per lexeme, SimDiff is the difference in 'sim' between consecutive rows
# after sorting by 'subreddit'; the first row of every group gets NaN and is
# dropped further down.
# NOTE(review): the relabelling of 'subreddit' hard-codes the two model
# names, so it presumably assumes exactly these two models — confirm.
df = nbs_vecs_grp
df = (df
    .sort_values(['lex', 'subreddit']) 
    .groupby(['lex'])
    .apply(lambda gdf: gdf.assign(SimDiff = lambda gdf: gdf['sim'].diff()))
    .reset_index(drop=True)
    .assign(subreddit = lambda df: np.where(df['SimDiff'] < 0, 'Coronavirus', 'conspiracy'))
    .dropna(subset=['SimDiff'])
    .query('freq > 100')
    )

# The 20 most negative and the 20 most positive differences.
nbs_diff = pd.concat([
    df.nsmallest(20, 'SimDiff'),
	df.nlargest(20, 'SimDiff')]
    )


df.nlargest(20, 'SimDiff')

Unnamed: 0,lex,type,subreddit,vec,sim,freq,x_tsne,y_tsne,SimDiff
329,masks,nb,conspiracy,"[-2.2547746, -0.4834074, -1.4954194, 2.8459356...",0.530679,118227.0,10.175615,-40.975052,0.261322
3,additives,nb,conspiracy,"[1.459601, 0.82089573, 0.18472618, -1.2612789,...",0.502207,145.0,-27.079416,1.559038,0.18749
475,preservatives,nb,conspiracy,"[0.39230525, 1.0395596, 0.26660278, -2.267249,...",0.494384,126.0,-26.630289,1.359555,0.182868
583,thimerosal,nb,conspiracy,"[-0.24996677, 1.3901852, 0.24130212, -0.655101...",0.471022,181.0,-20.110189,-2.633359,0.180409
321,lockdowns,nb,conspiracy,"[-1.1396934, -0.4576898, -0.86728644, -0.96539...",0.44809,15881.0,15.561445,-24.185442,0.175846
633,vax,nb,conspiracy,"[-1.4118998, 0.7732877, 1.4240036, -0.02489084...",0.581255,3208.0,12.931235,-29.958838,0.169978
275,inoculated,nb,conspiracy,"[-0.32055232, 0.5532584, -0.19591443, 0.22722,...",0.441003,279.0,17.602747,37.039684,0.164212
629,varicella,nb,conspiracy,"[0.20554425, 0.4610833, 0.9360308, -0.00546899...",0.441637,109.0,-4.945457,-23.341425,0.155615
463,polio,nb,conspiracy,"[-1.8392371, -0.80445474, 2.7950146, -1.970325...",0.418946,4101.0,-8.307308,-28.911093,0.150968
435,pertussis,nb,conspiracy,"[0.20283101, 0.4985287, 2.7471106, 0.4256801, ...",0.431533,210.0,-3.269825,-25.152451,0.141406


In [None]:
#data
nbs_vecs = pd.concat([get_nbs_vecs(lex, model, k=100) for model in models])
# Rebuilding the frame drops the earlier x_tsne / y_tsne columns, which the
# chart cells below read, so the coordinates are recomputed here, before the
# frame is cut down to its top rows.
nbs_vecs = dim_red_nbs_vecs(nbs_vecs)
nbs_vecs = (pd.concat(g for _, g in nbs_vecs.groupby("lex") if len(g) > 1)
    .sort_values('sim', ascending=False)
    .iloc[:20]
)



In [None]:
#data
chart = (alt.Chart(nbs_vecs).mark_text(point=True).encode(
	x = 'x_tsne:Q',
	y = 'y_tsne:Q',
	color='subreddit:N',
	size = 'sim:Q',
	text = 'lex:O',
	).interactive()
)
chart

### old version

In [None]:
#export
def plot_nbs_vecs(lex, nbs_vecs, perplexity=50):
    """Text chart of the t-SNE coordinates, one column per 'subreddit'.

    Every row of `nbs_vecs` is drawn as its 'lex' label at
    ('x_tsne', 'y_tsne'); rows of type 'center' use a larger font size.
    Two interval selections are attached: one that only reacts while the alt
    key is held and drives the colouring (rows outside it turn light gray),
    and one bound to the scales that reacts when the alt key is not held.

    `lex` is only used in the chart title. `perplexity` is accepted but not
    used by this function.
    """
    def _interval_selection(key_filter, **extra):
        """Interval selection whose mouse events are filtered by `key_filter`."""
        drag = f"[mousedown[{key_filter}], mouseup] > mousemove"
        return alt.selection(
            type="interval",
            on=drag,
            translate=f"{drag}!",
            zoom=f"wheel![{key_filter}]",
            **extra,
        )

    brush = _interval_selection("event.altKey")
    interaction = _interval_selection("!event.altKey", bind="scales")

    label_size = alt.condition("datum.type == 'center'", alt.value(25), alt.value(10))
    label_color = alt.condition(brush, 'subreddit', alt.value('lightgray'))

    base = alt.Chart(nbs_vecs).mark_text(point=True)
    encoded = base.encode(
        x='x_tsne:Q',
        y='y_tsne:Q',
        text='lex:O',
        size=label_size,
        color=label_color,
        column='subreddit',
    )
    titled = encoded.properties(title=f"Social semantic variation for the word '{lex}'.")
    return titled.add_selection(brush, interaction)

In [None]:
#data
nbs_vecs_chart = plot_nbs_vecs(lex, nbs_vecs)
nbs_vecs_chart

In [None]:
#data
nbs_vecs_chart.save(f'../out/map-sem-space_{lex}_{models[0]["name"]}--{models[1]["name"]}.pdf')
nbs_vecs_chart.save(f'../out/map-sem-space_{lex}_{models[0]["name"]}--{models[1]["name"]}.html')

Link to interactive chart: https://wuqui.github.io/neocov/#Plot-embedding-space.

Press and hold the <kbd>alt</kbd> key to select regions of the semantic space.