In [None]:
# default_exp type_emb
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Type embeddings

## Imports

In [None]:
from socemb.read_data import *

In [None]:
# export
import re
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine

## Read comments

In [None]:
f_path_1 = get_fpath_subr_yr('askaconservative', 100_000, 2019)

In [None]:
f_path_2 = get_fpath_subr_yr('askaconservative', 100_000, 2020)

In [None]:
comments_1 = read_comm_csv(f_path_1)

In [None]:
comments_2 = read_comm_csv(f_path_2)

In [None]:
comments_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97243 entries, 0 to 97242
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   body         97243 non-null  string        
 1   created_utc  97243 non-null  datetime64[ns]
 2   id           97243 non-null  string        
 3   subreddit    97243 non-null  string        
dtypes: datetime64[ns](1), string(3)
memory usage: 3.7 MB


## Split comments

### Split in temporal bins

In [None]:
comments = comments.assign(date = pd.to_datetime(
    comments['created_utc'],
    errors='coerce'
))

In [None]:
comments['date'].dt.year.value_counts()

In [None]:
comments = comments[comments.date.dt.year == TIME]

### Split by communities

In [None]:
comments\
    .value_counts('subreddit')\
    .head(20)

In [None]:
comments = comments.query('subreddit == "politics"')

In [None]:
comments = comments.query('subreddit == "AskHistorians"')

## Pre-process comments

In [None]:
# export
def conv_to_lowerc(doc):
    """Return `doc` with its `.lower()` method applied (lowercased text)."""
    lowered = doc.lower()
    return lowered

In [None]:
assert conv_to_lowerc('Test') == 'test'

In [None]:
# export
def rm_punct(doc):
    """Replace every run of non-word, non-whitespace characters in `doc` with one space.

    Underscores count as word characters (`\\w`) and are therefore kept.
    """
    punct_run = re.compile(r'[^\w\s]+')
    return punct_run.sub(' ', doc)

In [None]:
assert rm_punct('No-punctuation!') == 'No punctuation '

In [None]:
# export
def tokenize(doc):
    """Split `doc` on runs of whitespace and return the list of tokens."""
    tokens = doc.split()
    return tokens

In [None]:
assert len(tokenize('These are three-tokens')) == 3

In [None]:
# export
def detect_short_doc(doc, limit=10):
    """Return True if `doc` contains fewer than `limit` items, otherwise False."""
    return not (len(doc) >= limit)

In [None]:
assert detect_short_doc(['oans', 'zwoa', 'drei'], 10) == True

In [None]:
assert detect_short_doc(['oans', 'zwoa', 'drei', 'viere', 'fuenfe', 'sechse', 'simme', 'achte', 'neine', 'zehne'], 10) == False

In [None]:
# export
def clean_docs(docs):
    """Lowercase, strip punctuation from, and tokenize each document, then drop short ones.

    `docs` is expected to support `.apply` (e.g. a pandas Series of strings).
    Documents flagged by `detect_short_doc` (with its default limit) are removed;
    the remaining token lists are returned.
    """
    tokenized = docs.apply(conv_to_lowerc).apply(rm_punct).apply(tokenize)
    # Keep only the documents that are not flagged as short
    keep = tokenized.apply(detect_short_doc) == False
    docs_clean = tokenized.loc[keep]
    return docs_clean

In [None]:
docs_clean_1 = clean_docs(comments_1['body'])

In [None]:
docs_clean_2 = clean_docs(comments_2['body'])

## Create corpus

In [None]:
# export
class Corpus:
    """Iterable wrapper around a collection of tokenized documents.

    Iterating over an instance yields the items of `docs_clean` one by one
    (each expected to be a list of str). Each `iter()` call starts a fresh
    pass over `docs_clean`.
    """
    def __init__(self, docs_clean):
        self.docs_clean = docs_clean

    def __iter__(self):
        yield from self.docs_clean

In [None]:
corpus_1 = Corpus(docs_clean_1)

In [None]:
corpus_2 = Corpus(docs_clean_2)

## Train embeddings

In [None]:
# export
def train_emb(corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=3):
    """Train a `Word2Vec` model on `corpus` and return it.

    The keyword arguments are forwarded to `Word2Vec` as `min_count`, `size`,
    `workers` and `window` respectively.

    NOTE(review): `size` is the gensim 3.x keyword name; gensim 4 renamed it
    to `vector_size` -- confirm against the installed gensim version.
    """
    w2v_params = dict(
        min_count=MIN_COUNT,
        size=SIZE,
        workers=WORKERS,
        window=WINDOW,
    )
    return Word2Vec(corpus, **w2v_params)

In [None]:
model_1 = train_emb(corpus_1)

In [None]:
%%time
model_2 = train_emb(corpus_2)

CPU times: user 45.2 s, sys: 253 ms, total: 45.5 s
Wall time: 12.3 s


## Evaluate embeddings

In [None]:
for index, word in enumerate(model_1.wv.index2word):
    if index == 10:
        break
    print(f"{index}/{len(model_1.wv.index2word)}: {word}")

0/20738: the
1/20738: to
2/20738: and
3/20738: a
4/20738: of
5/20738: that
6/20738: is
7/20738: i
8/20738: it
9/20738: in


In [None]:
model_1.wv.most_similar('person', topn=20)

[('woman', 0.7169589400291443),
 ('man', 0.6905370950698853),
 ('guy', 0.6834359169006348),
 ('baby', 0.6324769258499146),
 ('someone', 0.6222299337387085),
 ('doctor', 0.6170642971992493),
 ('kid', 0.6133965253829956),
 ('parent', 0.6090949773788452),
 ('anyone', 0.596699595451355),
 ('politician', 0.5884082317352295),
 ('fetus', 0.5837575793266296),
 ('child', 0.5680943727493286),
 ('somebody', 0.5629948377609253),
 ('mother', 0.5546822547912598),
 ('user', 0.534717857837677),
 ('friend', 0.5345014333724976),
 ('people', 0.5293498039245605),
 ('victim', 0.5277709364891052),
 ('citizen', 0.5250868201255798),
 ('president', 0.520645022392273)]

In [None]:
lex_rel_1 = 'person'
lex_rel_2 = 'man'
lex_unrel = 'time'

In [None]:
sim_rel = model_1.wv.similarity(lex_rel_1, lex_rel_2)
sim_rel

0.69053704

In [None]:
sim_unrel_1 = model_1.wv.similarity(lex_rel_1, lex_unrel)
sim_unrel_1

0.18910645

In [None]:
sim_unrel_2 = model_1.wv.similarity(lex_rel_2, lex_unrel)
sim_unrel_2

0.12229807

In [None]:
assert sim_rel > sim_unrel_1

In [None]:
assert sim_rel > sim_unrel_2

## Save model

In [None]:
model_1.wv.save(f'~/promo/socemb/data/vecs/year/{YEAR}.model_1.wv')

## Load models (deprecated)

In [None]:
wv = KeyedVectors.load(f'{VECS_DIR}{SUBREDDIT}_{YEAR}_{LIMIT}.wv', mmap='r')

In the next cell, I need to use `init_sims` to normalize the vectors. This requires saving and loading the full models, not just the vector tables, so all models need to be trained again.

In [None]:
# export
def load_model(SUBREDDIT: str, YEAR: int, LIMIT: int = 100_000):
    """Load the saved `.wv` file for a subreddit/year/limit combination from disk.

    The file is looked up under the relative directory `data/vecs/` and is
    opened with `KeyedVectors.load(..., mmap='r')`.
    """
    vecs_path = f'data/vecs/{SUBREDDIT}_{YEAR}_{LIMIT}.wv'
    return KeyedVectors.load(vecs_path, mmap='r')

In [None]:
YEAR = 2020

In [None]:
LEX = 'trump'

In [None]:
model_1 = load_model(COMM_1, YEAR)

In [None]:
model_2 = load_model(COMM_2, YEAR)

In [None]:
# export
def get_vec_from_model(lex: str, model):
    """Look up `lex` in `model` via item access (`model[lex]`) and return the result."""
    vec = model[lex]
    return vec

In [None]:
get_vec_from_model(LEX, model_1)

In [None]:
vec_1 = model_1['trump']

In [None]:
vec_2 = model_2['trump']

Soc / 2020 / conservative vs. politics

In [None]:
cosine(trump_cons_vec, trump_pol_vec)

Soc / 2013 / conservative vs. politics

In [None]:
cosine(vec_1, vec_2)

## Align models

In [None]:
from functools import reduce
import gensim

In [None]:
def align_gensim_models(models, words=None):
    """
    Returns the aligned/intersected models from a list of gensim word2vec models.
    Generalized from an original two-way intersection to any number of models.

    Requires `reduce` from functools and objects that expose `.vocab`,
    `.index2word`, `.vectors_norm` and `.syn0` (gensim 3.x style keyed vectors).

    In order to run this, make sure you run 'model.init_sims()' for each model
    before you input them for alignment: the new vector tables are built from
    `vectors_norm`.

    Only the shared vocabulary between the models is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts over all models).
    These indices correspond to the new syn0 and vectors_norm objects in every model:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.

    Parameters
    ----------
    models : list
        Keyed-vector objects to align. They are modified in place.
    words : list or set, optional
        If given (and non-empty), additionally restrict the shared vocabulary
        to these words.

    Returns
    -------
    The same `models` object that was passed in.
    """

    # Nothing to intersect; also avoids `reduce` raising on an empty sequence
    if not models:
        return models

    # Get the vocab for each model
    vocabs = [set(m.vocab.keys()) for m in models]

    # Find the common vocabulary
    common_vocab = reduce((lambda vocab1, vocab2: vocab1 & vocab2), vocabs)
    if words:
        # Build a new set instead of using `&=`: with a single model, `reduce`
        # returns vocabs[0] itself, and shrinking it in place would make the
        # identity check below succeed and silently skip the `words` filter.
        common_vocab = common_vocab & set(words)

    # If no alignment necessary because vocab is identical...
    # NOTE(review): this only compares vocabularies, not row order -- models
    # with identical vocab but different index order are returned unaligned.
    if all(not vocab - common_vocab for vocab in vocabs):
        print("All identical!")
        return models

    # Otherwise sort by frequency (summed over all models)
    common_vocab = sorted(
        common_vocab,
        key=lambda w: sum(m.vocab[w].count for m in models),
        reverse=True
    )

    # Then for each model...
    for m in models:

        # Replace old vectors_norm array with new one (with common vocab)
        indices = [m.vocab[w].index for w in common_vocab]
        old_arr = m.vectors_norm
        new_arr = np.array([old_arr[index] for index in indices])
        m.vectors_norm = m.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.index2word = common_vocab
        old_vocab = m.vocab
        new_vocab = {}
        for new_index, word in enumerate(common_vocab):
            old_vocab_obj = old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.vocab = new_vocab

    return models

In [None]:
model_1.init_sims(replace=True)
model_2.init_sims(replace=True)

In [None]:
models_ali = align_gensim_models([model_1.wv, model_2.wv])

  m.vectors_norm = m.syn0 = new_arr


In [None]:
model_1_ali = models_ali[0]

In [None]:
model_2_ali = models_ali[1]

## Measure distances

### For `1` pair of words

In [None]:
vec_t1 = model_1['person']

In [None]:
vec_t2 = model_2['person']

In [None]:
assert vec_t1.shape[0] == 300

In [None]:
assert vec_t2.shape[0] == 300

In [None]:
cosine(vec_t1, vec_t2)

### For full vocabulary

In [None]:
def get_distances(model_1, model_2):
    """Compute the cosine distance between both models' vectors for every shared word.

    Parameters
    ----------
    model_1, model_2
        Objects exposing a `.wv` attribute that has a `.vocab` mapping and
        supports item lookup by word.

    Returns
    -------
    pandas.DataFrame
        Columns `word` and `dist`, one row per word found in both vocabularies,
        sorted by `dist` in descending order.
    """
    # Use the keyed vectors for both the vocabulary test and the vector lookup;
    # indexing the model object directly (`model[word]`) triggered the warning
    # visible in the notebook output.
    wv_1, wv_2 = model_1.wv, model_2.wv
    # TODO: replace `wv_1.vocab` with a general vocab
    dist_dict = {
        word: cosine(wv_1[word], wv_2[word])
        for word in wv_1.vocab
        if word in wv_2.vocab
    }
    dist_dict_df = pd.DataFrame(
        data=list(dist_dict.items()),
        columns=['word', 'dist']
    ).sort_values('dist', ascending=False)
    return dist_dict_df

In [None]:
dists = get_distances(model_1, model_2)

  dist_dict[word] = cosine(model_1[word], model_2[word])


In [None]:
dists.nlargest(20, 'dist')

Unnamed: 0,word,dist
1000,virus,0.988359
907,masks,0.919138
792,mask,0.890782
2554,anyways,0.839805
13655,treatise,0.818108
15832,pertain,0.813345
4855,cloth,0.812976
2418,flu,0.812539
435,x200b,0.811742
7472,reopen,0.791699


### Inspect nearest neighbours

In [None]:
COMM_1

In [None]:
for model in [model_1, model_2]:
    for nb, dist in model.most_similar(LEX, topn=20):
        print(f'{nb:<15}{model.vocab[nb].count:>6}')
    print('\n---\n')

In [None]:
# Print up to 10 nearest neighbours of LEX that occur more than 300 times.
# The previous `while i < 10` loop re-ran the same query on every pass, so it
# printed duplicates and never terminated when no neighbour was frequent enough.
for model in [model_1, model_2]:
    # Over-fetch neighbours so that enough remain after the frequency filter
    frequent_nbs = [
        nb for nb, dist in model.most_similar(LEX, topn=100)
        if model.vocab[nb].count > 300
    ]
    for nb in frequent_nbs[:10]:
        print(f'{nb:<15}{model.vocab[nb].count:>6}')
    print('\n---\n')

### Temporal variation

In [None]:
lex = 'trump'

In [None]:
for model in [model_1, model_2]:
    for nb, dist in model.most_similar(lex, topn=10):
        print(nb)
    print('\n---\n')

### Social variation

In [None]:
lex = 'quarantine'

In [None]:
print(f'{COMM_1}\n---')
for nb, dist in model_1.most_similar(lex, topn=10):
    print(nb)

In [None]:
print(f'{COMM_2}\n---')
for nb, dist in model_2.most_similar(lex, topn=10):
    print(nb)

## Obsolete stuff

### Extract vector for target word

### Gensim preprocessing