In [None]:
# default_exp type_emb
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Type embeddings

## Imports

In [None]:
from socemb.read_data import *

In [None]:
# export
from gensim.models import Word2Vec
import pandas as pd
import re
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine

## Variables

In [None]:
VECS_DIR = 'data/vecs/'

In [None]:
LIMIT = 100_000

In [None]:
SUBREDDIT = 'askreddit'
T_1 = 2010
T_2 = 2020

In [None]:
COMM_1 = 'askaconservative'
COMM_2 = 'politics'

In [None]:
YEAR = 2020

## Train model

### Read comments

In [None]:
f_paths = get_fpaths_yr(YEAR)

In [None]:
comments = read_comm_csvs(f_paths)

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380964 entries, 0 to 380963
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   body         380964 non-null  string        
 1   created_utc  380964 non-null  datetime64[ns]
 2   id           380964 non-null  string        
 3   subreddit    380964 non-null  string        
dtypes: datetime64[ns](1), string(3)
memory usage: 11.6 MB


### Split data (inactive)

### Split in temporal bins

In [None]:
comments = comments.assign(date = pd.to_datetime(
    comments['created_utc'],
    errors='coerce'
))

In [None]:
comments['date'].dt.year.value_counts()

In [None]:
# Keep only comments posted in the configured year. The original cell compared
# against `TIME`, a name that is never defined in this notebook; `YEAR` is the
# constant used to select the input files.
comments = comments[comments.date.dt.year == YEAR]

### Split by communities

In [None]:
comments\
    .value_counts('subreddit')\
    .head(20)

In [None]:
comments = comments.query('subreddit == "politics"')

In [None]:
comments = comments.query('subreddit == "AskHistorians"')

### Pre-processing

In [None]:
# export
def conv_to_lowerc(doc):
    """Return a lower-cased copy of the text `doc`."""
    lowered = doc.lower()
    return lowered

In [None]:
assert conv_to_lowerc('Test') == 'test'

In [None]:
# export
def rm_punct(doc):
    """Replace each run of punctuation in `doc` with one space.

    Anything that is neither a word character (letter, digit, underscore)
    nor whitespace counts as punctuation; consecutive punctuation characters
    collapse into a single space.
    """
    punct_run = re.compile(r'[^\w\s]+')
    return punct_run.sub(' ', doc)

In [None]:
assert rm_punct('No-punctuation!') == 'No punctuation '

In [None]:
# export
def tokenize(doc):
    """Split `doc` on whitespace and return the resulting list of tokens."""
    tokens = doc.split()
    return tokens

In [None]:
assert len(tokenize('These are three-tokens')) == 3

In [None]:
# export
def detect_short_doc(doc, limit=10):
    """Return True when `doc` holds fewer than `limit` items, otherwise False.

    The length is taken with `len(doc)`, so for a token list it counts tokens.
    """
    reaches_limit = len(doc) >= limit
    return not reaches_limit

In [None]:
assert detect_short_doc(['oans', 'zwoa', 'drei'], 10) == True

In [None]:
assert detect_short_doc(['oans', 'zwoa', 'drei', 'viere', 'fuenfe', 'sechse', 'simme', 'achte', 'neine', 'zehne'], 10) == False

In [None]:
# export
def clean_docs(docs):
    """Lower-case, de-punctuate and tokenize every document, dropping short ones.

    Each element of `docs` is passed through `conv_to_lowerc`, `rm_punct` and
    `tokenize`, in that order. Documents flagged by `detect_short_doc` (with
    its default limit) are then masked out and dropped, so the result holds
    only the token lists of the remaining documents, under their original
    index labels.
    """
    lowered = docs.apply(conv_to_lowerc)
    unpunctuated = lowered.apply(rm_punct)
    tokenized = unpunctuated.apply(tokenize)

    def is_long_enough(series):
        # Element-wise mask: True where the document is NOT short.
        return series.apply(detect_short_doc) == False

    kept = tokenized.where(is_long_enough)
    return kept.dropna()

In [None]:
docs_clean = clean_docs(comments['body'])

In [None]:
docs_clean

0         [you, re, asking, how, they, re, going, to, be...
1         [gt, i, don, t, think, there, are, any, varyin...
2         [its, split, on, copyright, it, leans, anti, n...
3         [that, would, be, up, to, the, land, owners, a...
4         [i, have, him, here, gun, to, his, head, round...
                                ...                        
380954    [the, nature, police, are, supposed, to, be, p...
380957    [i, got, reddit, last, year, and, only, starte...
380958    [i, m, just, curious, how, this, is, clear, to...
380962    [staying, at, said, job, while, being, underpa...
380963    [i, m, not, sure, that, this, has, ever, been,...
Name: body, Length: 264796, dtype: object

### Lexeme-based

In [None]:
docs_clean = docs_clean\
    .str.replace('anglo-saxon', 'anglosaxon')\
    .str.replace('anglo saxon', 'anglosaxon')

In [None]:
docs_clean = docs_clean.str.replace('anglosaxon', 'Anglo-Saxon')

### Create corpus

In [None]:
# export
class Corpus:
    """Iterable wrapper that yields one document (a list of str) at a time."""
    def __init__(self, docs_clean):
        # Keep a reference to the documents; nothing is copied.
        self.docs_clean = docs_clean

    def __iter__(self):
        # Every call starts a new pass over the stored documents.
        yield from self.docs_clean

In [None]:
corpus = Corpus(docs_clean)

### Train embeddings

In [None]:
# export
def train_emb(corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=3):
    """Build a gensim `Word2Vec` model from `corpus` and return it.

    `MIN_COUNT`, `SIZE`, `WORKERS` and `WINDOW` are forwarded to `Word2Vec`
    as its `min_count`, `size`, `workers` and `window` keyword arguments.
    """
    w2v_params = dict(
        min_count=MIN_COUNT,
        size=SIZE,
        workers=WORKERS,
        window=WINDOW,
    )
    return Word2Vec(corpus, **w2v_params)

In [None]:
%%time
model = train_emb(corpus)

CPU times: user 2min 35s, sys: 11.1 s, total: 2min 46s
Wall time: 1min 9s


In [None]:
wv = model.wv

In [None]:
wv['you'].shape

(300,)

### Evaluate embeddings

In [None]:
for index, word in enumerate(wv.index2word):
    if index == 10:
        break
    print(f"{index}/{len(wv.index2word)}: {word}")

0/4732: the
1/4732: to
2/4732: a
3/4732: and
4/4732: i
5/4732: of
6/4732: that
7/4732: it
8/4732: is
9/4732: you


In [None]:
wv.most_similar('person', topn=20)

[('guy', 0.6144881248474121),
 ('woman', 0.6049772500991821),
 ('man', 0.5703142881393433),
 ('politician', 0.5600118637084961),
 ('girl', 0.5361282229423523),
 ('candidate', 0.5117915868759155),
 ('cop', 0.5107803344726562),
 ('someone', 0.4916432201862335),
 ('senator', 0.4843234419822693),
 ('kid', 0.4767877161502838),
 ('persons', 0.46128833293914795),
 ('victim', 0.45970290899276733),
 ('somebody', 0.4551505148410797),
 ('lady', 0.4502111077308655),
 ('citizen', 0.44879597425460815),
 ('owner', 0.44654402136802673),
 ('mother', 0.44434842467308044),
 ('parent', 0.44392669200897217),
 ('fetus', 0.4342397451400757),
 ('entity', 0.43350768089294434)]

In [None]:
lex_rel_1 = 'person'
lex_rel_2 = 'man'
lex_unrel = 'time'

In [None]:
sim_rel = wv.similarity(lex_rel_1, lex_rel_2)
sim_rel

0.57419455

In [None]:
sim_unrel_1 = wv.similarity(lex_rel_1, lex_unrel)
sim_unrel_1

0.27401066

In [None]:
sim_unrel_2 = wv.similarity(lex_rel_2, lex_unrel)
sim_unrel_2

0.14052157

In [None]:
assert sim_rel > sim_unrel_1

In [None]:
assert sim_rel > sim_unrel_2

## Save model

In [None]:
wv.save(f'~/promo/socemb/data/vecs/year/{YEAR}.wv')

## Load models

In [None]:
wv = KeyedVectors.load(f'{VECS_DIR}{SUBREDDIT}_{YEAR}_{LIMIT}.wv', mmap='r')

In the next cell, I need to use `init_sims` to normalize the vectors. To do this, I need to save and load the full models, not just the vector tables. Therefore, all models need to be trained again.

In [None]:
# export
def load_model(SUBREDDIT: str, YEAR: int, LIMIT: int = 100_000, VECS_DIR: str = 'data/vecs/'):
    """Load saved word vectors (gensim `KeyedVectors`) from disk.

    The file read is ``{VECS_DIR}{SUBREDDIT}_{YEAR}_{LIMIT}.wv``. `VECS_DIR`
    is concatenated as-is, so it must end with a path separator. Its default
    reproduces the directory that used to be hard-coded here.

    The vectors are loaded with ``mmap='r'``.
    """
    path = f'{VECS_DIR}{SUBREDDIT}_{YEAR}_{LIMIT}.wv'
    return KeyedVectors.load(path, mmap='r')

In [None]:
YEAR = 2020

In [None]:
LEX = 'trump'

In [None]:
model_1 = load_model(COMM_1, YEAR)

In [None]:
model_2 = load_model(COMM_2, YEAR)

## Extract vector for target word

In [None]:
# export
def get_vec_from_model(lex: str, model):
    """Look up `lex` in `model` by subscription and return what is stored there."""
    vec = model[lex]
    return vec

In [None]:
get_vec_from_model(LEX, model_1)

array([-0.85950464, -0.15552747,  0.68602735, -0.83750105, -1.294974  ,
        0.10401776, -0.06652975,  0.23461531,  0.2702048 ,  0.26216885,
        0.8760099 ,  1.2155982 ,  1.0869384 ,  0.00500609, -0.8161697 ,
       -1.5187039 , -0.93977666,  0.06295428,  0.6648812 ,  0.31061354,
       -0.33731386, -1.6166893 ,  0.5991787 ,  0.13134459,  0.46875864,
       -0.4895301 ,  0.39174125, -0.88394624,  0.08596516,  0.7791669 ,
        0.500071  ,  0.05248855, -0.60904044, -0.16645384, -1.7530011 ,
       -0.37262776, -0.04840833,  0.31690505, -0.23659422,  0.42466795,
       -0.99259675, -0.18115623, -1.6040664 ,  0.07952278,  0.7500193 ,
        0.0036784 , -1.2042639 ,  0.37242904,  0.6376366 , -0.3261277 ,
       -0.45251164, -0.02645606, -1.1056576 ,  0.40041718,  1.4039432 ,
       -0.47432435, -0.82201827, -0.12088189,  1.0664238 ,  0.06684092,
       -0.22534947,  0.6871513 , -0.25023615,  0.47216108,  0.6159361 ,
       -0.7014341 ,  0.59670144,  0.8569966 ,  0.9293675 , -0.08

In [None]:
vec_1 = model_1['trump']

In [None]:
vec_2 = model_2['trump']

Soc / 2020 / conservative vs. politics

In [None]:
# `trump_cons_vec` / `trump_pol_vec` are never defined in this notebook; the
# target-word vectors extracted in the cells above are `vec_1` and `vec_2`.
cosine(vec_1, vec_2)

0.6027644872665405

Soc / 2013 / conservative vs. politics

In [None]:
cosine(vec_1, vec_2)

0.6027644872665405

## Measure distances

### For `1` pair of words

In [None]:
vec_t1 = model_1['person']

In [None]:
vec_t2 = model_2['person']

In [None]:
assert vec_t1.shape[0] == 300

In [None]:
assert vec_t2.shape[0] == 300

In [None]:
cosine(vec_t1, vec_t2)

0.6892097592353821

### For full vocabulary

In [None]:
def get_distances(model_1, model_2):
    """Cosine distance between the two models' vectors for every shared word.

    Only words found in both `model_1.vocab` and `model_2.vocab` are compared.
    Returns a DataFrame with columns `word` and `dist`, sorted by `dist` in
    descending order.
    """
    # will need to replace `model_1.vocab` with a general vocab
    shared_words = (word for word in model_1.vocab if word in model_2.vocab)
    records = [
        (word, cosine(model_1[word], model_2[word]))
        for word in shared_words
    ]
    dist_table = pd.DataFrame(data=records, columns=['word', 'dist'])
    return dist_table.sort_values('dist', ascending=False)

In [None]:
dists = get_distances(model_1, model_2)

In [None]:
dists.nlargest(20, 'dist')

Unnamed: 0,word,dist
415,2000,1.09413
8749,kentucky,1.061203
2872,mitch,1.040116
12751,irreparable,1.026567
12012,2k,1.025281
5045,r,1.010832
706,600,1.001934
484,georgia,0.998421
11903,moscow,0.993207
4954,mcconnell,0.990447


### Inspect nearest neighbours

In [None]:
COMM_1

'askaconservative'

In [None]:
for model in [model_1, model_2]:
    for nb, dist in model.most_similar(LEX, topn=20):
        print(f'{nb:<15}{model.vocab[nb].count:>6}')
    print('\n---\n')

biden            4567
obama            1776
hillary           503
joe               765
him              5313
bernie            423
he              20095
kamala            171
clinton           545
campaign          820
gop               625
trumps            377
harris            282
dnc               269
hunter            212
himself           673
presidency        471
reagan            324
putin             143
whoever           234

---

biden            3161
bernie           2063
obama             964
himself           790
he              24090
sanders           979
hillary           275
him              6684
nixon             155
putin             132
pelosi            516
pence             666
trumps            345
clinton           338
kemp              116
aoc               322
bush              308
dnc               246
warren            162
sleepy             17

---



In [None]:
# Print the 10 nearest neighbours of LEX that occur more than 300 times.
# The previous `while i < 10` loop re-queried the same 10 neighbours forever
# whenever fewer than 10 of them were frequent enough (it had to be
# interrupted), and it used `lex`, which is only defined in a later cell.
for model in [model_1, model_2]:
    # Rank the whole vocabulary once, then keep the first 10 frequent words.
    ranked = model.most_similar(LEX, topn=len(model.vocab))
    frequent = [nb for nb, _ in ranked if model.vocab[nb].count > 300]
    for nb in frequent[:10]:
        print(f'{nb:<15}{model.vocab[nb].count:>6}')
    print('\n---\n')

KeyboardInterrupt: 

### Temporal variation

In [None]:
lex = 'trump'

In [None]:
for model in [model_1, model_2]:
    for nb, dist in model.most_similar(lex, topn=10):
        print(nb)
    print('\n---\n')

bankrupt
absorb
distributed
detect
essentials
composite
bypass
ideally
glide
prop

---

biden
bernie
obama
hillary
sanders
he
himself
nixon
putin
pelosi

---



### Social variation

In [None]:
lex = 'quarantine'

In [None]:
print(f'{COMM_1}\n---')
for nb, dist in model_1.most_similar(lex, topn=10):
    print(nb)

askaconservative
---
ppe
rain
bombings
port
aerosols
bullets
communications
missiles
weekly
2006


In [None]:
print(f'{COMM_2}\n---')
for nb, dist in model_2.most_similar(lex, topn=10):
    print(nb)

asklibertarians
---
defense
defence
righteous
sufficiency
determination
isolate
legally
agency
permitted
explanatory


## Align models

In [None]:
import numpy as np
import gensim

In [None]:
smart_procrustes_align_gensim(model_1, model_2)

  old_arr = m.syn0norm


IndexError: index 18164 is out of bounds for axis 0 with size 17915

In [None]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
	"""Rotate `other_embed` onto `base_embed` so the same word can be compared across models.

	Ported from HistWords <https://github.com/williamleif/histwords> by
	William Hamilton <wleif@stanford.edu>, with a patch for newer gensim
	versions by Richard So (https://twitter.com/richardjeanso).

	Procedure:
	1. Call `init_sims()` on both embeddings.
	2. Intersect their vocabularies with `intersection_align_gensim`
	   (additionally restricted to `words` when it is given).
	3. Take the SVD of `other.T . base` (both `syn0norm` matrices) and
	   multiply its two factor matrices into the matrix `rotation`.
	4. Overwrite `other_embed.syn0norm` and `other_embed.syn0` with
	   `other_embed.syn0norm . rotation`.

	Returns `other_embed`.
	"""
	# patch by Richard So: make sure the normalised matrices exist
	for embed in (base_embed, other_embed):
		embed.init_sims()

	# after this call, row i refers to the same word in both models
	aligned_base, aligned_other = intersection_align_gensim(base_embed, other_embed, words=words)

	base_mat = aligned_base.syn0norm
	other_mat = aligned_other.syn0norm

	# SVD of the cross-product matrix; the singular values are not needed
	cross = other_mat.T.dot(base_mat)
	left, _, right = np.linalg.svd(cross)
	rotation = left.dot(right)

	# apply the rotation to the other model's vectors
	other_embed.syn0norm = other_embed.syn0 = (other_embed.syn0norm).dot(rotation)
	return other_embed
	

In [None]:
def intersection_align_gensim(m1,m2, words=None):
	"""
	Intersect two gensim word2vec models, m1 and m2, in place.

	Only the vocabulary shared by both models is kept. If `words` is set
	(as list or set), the vocabulary is intersected with it as well.
	If neither model has a word outside the common vocabulary, nothing is
	changed and `(m1, m2)` is returned immediately.

	Otherwise indices are re-assigned from 0..N in order of descending
	frequency (= sum of the word's counts in m1 and m2), and for each model:
		-- `syn0norm` and `syn0` are replaced by the rows of the old
		   `syn0norm` in that new order, so that row i of m1.syn0 is for the
		   same word as row i of m2.syn0
		-- `index2word` becomes the ordered common vocabulary
		-- `vocab` is rebuilt, preserving each count but updating the index

	Returns the tuple `(m1, m2)`.
	"""

	# Get the vocab for each model
	vocab_m1 = set(m1.vocab.keys())
	vocab_m2 = set(m2.vocab.keys())

	# Find the common vocabulary
	common_vocab = vocab_m1&vocab_m2
	if words: common_vocab&=set(words)

	# If no alignment necessary because vocab is identical...
	if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
		return (m1,m2)

	# Otherwise sort by frequency (summed for both)
	common_vocab = sorted(
		common_vocab,
		key=lambda w: m1.vocab[w].count + m2.vocab[w].count,
		reverse=True,
	)

	# Then for each model...
	for m in [m1,m2]:
		# Replace old syn0norm array with new one (with common vocab).
		# The row indices must be read from the OLD vocab, before it is replaced.
		indices = [m.vocab[w].index for w in common_vocab]
		old_arr = m.syn0norm
		new_arr = np.array([old_arr[index] for index in indices])
		m.syn0norm = m.syn0 = new_arr

		# Replace old vocab dictionary with new one (with common vocab)
		# and old index2word with new one.
		old_vocab = m.vocab
		# Each model gets its own copy of the word list; previously both models
		# shared one list object, so mutating one changed the other.
		m.index2word = list(common_vocab)
		m.vocab = {
			word: gensim.models.word2vec.Vocab(index=new_index, count=old_vocab[word].count)
			for new_index, word in enumerate(common_vocab)
		}

	return (m1,m2)

In [None]:
m1 = model_1
m2 = model_2

In [None]:
vocab_m1 = set(m1.vocab.keys())
vocab_m2 = set(m2.vocab.keys())

In [None]:
common_vocab = vocab_m1 & vocab_m2

In [None]:
common_vocab = list(common_vocab)
common_vocab.sort(key=lambda w: m1.vocab[w].count + m2.vocab[w].count,reverse=True)

Note that in newer Gensim versions, references to `.index2word`, `.vocab`, `.syn0` and `.syn0norm` on a model should be replaced with `.wv.index2word`, `.wv.vocab`, `.wv.syn0` and `.wv.syn0norm` respectively.

In [None]:
for m in [m1,m2]:
    # Replace old syn0norm array with new one (with common vocab)
    indices = [m.vocab[w].index for w in common_vocab]
    old_arr = m.syn0norm
    new_arr = np.array([old_arr[index] for index in indices])
    m.syn0norm = m.syn0 = new_arr

  old_arr = m.syn0norm


IndexError: index 18164 is out of bounds for axis 0 with size 17915

  m.syn0norm = m.syn0 = new_arr
  m.syn0norm = m.syn0 = new_arr


In [None]:
    # Replace old vocab dictionary with new one (with common vocab)
    # and old index2word with new one
    m.index2word = common_vocab
    old_vocab = m.vocab
    new_vocab = {}
    for new_index,word in enumerate(common_vocab):
        old_vocab_obj=old_vocab[word]
        new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
    m.vocab = new_vocab

In [None]:
def align_gensim_models(models, words=None):
    """
    Intersect a list of gensim word2vec models in place and return the list.

    Generalized from the two-way `intersection_align_gensim`.

    Run `model.init_sims()` on every model before passing it in: the function
    reads each model's `vectors_norm`.

    Only the vocabulary shared by all models is kept. If `words` is set (as
    list or set), the vocabulary is intersected with it as well. If no model
    has a word outside the common vocabulary, "All identical!" is printed and
    `models` is returned unchanged.

    Otherwise indices are re-assigned from 0..N in order of descending
    frequency (= sum of the word's counts over all models), and for each model:
        -- `vectors_norm` and `syn0` are replaced by the rows of the old
           `vectors_norm` in that new order, so that row i is for the same
           word in every model
        -- `index2word` becomes the ordered common vocabulary
        -- `vocab` is rebuilt, preserving each count but updating the index
    """

    # Get the vocab for each model
    vocabs = [set(m.vocab.keys()) for m in models]

    # Find the common vocabulary. `set.intersection` always builds a NEW set,
    # so the `&=` below can never modify one of the sets in `vocabs`. The
    # earlier `reduce`-based version returned `vocabs[0]` itself for a single
    # model, which made the "identical" check pass even when `words` removed
    # entries.
    common_vocab = set.intersection(*vocabs)
    if words:
        common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if all(not vocab - common_vocab for vocab in vocabs):
        print("All identical!")
        return models

    # Otherwise sort by frequency (summed over all models)
    common_vocab = sorted(
        common_vocab,
        key=lambda w: sum(m.vocab[w].count for m in models),
        reverse=True,
    )

    # Then for each model...
    for m in models:

        # Replace old vectors_norm array with new one (with common vocab).
        # The row indices must be read from the OLD vocab, before it is replaced.
        indices = [m.vocab[w].index for w in common_vocab]
        old_arr = m.vectors_norm
        new_arr = np.array([old_arr[index] for index in indices])
        m.vectors_norm = m.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one.
        old_vocab = m.vocab
        # Each model gets its own copy of the word list so the models do not
        # share one mutable list object.
        m.index2word = list(common_vocab)
        m.vocab = {
            word: gensim.models.word2vec.Vocab(index=new_index, count=old_vocab[word].count)
            for new_index, word in enumerate(common_vocab)
        }

    return models

In [None]:
from functools import reduce

In [None]:
model_1.init_sims()
model_2.init_sims()

In [None]:
align_gensim_models([model_1, model_2])

IndexError: index 18164 is out of bounds for axis 0 with size 17915

## Obsolete stuff

### Gensim preprocessing