In [None]:
# default_exp type_emb
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Type embeddings

## Imports

In [None]:
from gensim.models import Word2Vec

## Variables

In [None]:
# Target word: passed to get_fpaths() to locate the data and used as the query word on the trained model.
LEX = 'woke'
# Calendar year the comments are filtered to before training.
TIME = 2013

## Read data

In [None]:
from socemb.read_data import *

In [None]:
# Resolve the input file paths for LEX.
# NOTE(review): get_fpaths presumably comes from the `socemb.read_data` star import — confirm.
fpaths = get_fpaths(LEX, source='local')

In [None]:
%%time
# Load the comments from the resolved paths; the result is used as a DataFrame with `created_utc` and `body` columns below.
comments = read_comments(fpaths)

  call = lambda f, *a, **k: f(*a, **k)
  call = lambda f, *a, **k: f(*a, **k)
  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 34.9 s, sys: 3.46 s, total: 38.4 s
Wall time: 41.7 s


## Split data

In [None]:
import pandas as pd

In [None]:
# Add a `date` column parsed from `created_utc`; values that cannot be parsed become NaT.
comments = comments.assign(
    date=lambda frame: pd.to_datetime(frame['created_utc'], errors='coerce')
)

In [None]:
# Keep only the comments whose parsed date falls in the year TIME.
comments = comments.loc[comments['date'].dt.year.eq(TIME)]

In [None]:
# Cap the data at the first 50,000 rows (positional slice, not a random sample).
comments = comments.iloc[:50000]

## Pre-processing

In [None]:
# The comment text lives in the `body` column.
docs = comments.loc[:, 'body']

In [None]:
# Discard comments with a missing body.
docs = docs[docs.notna()]

### Gensim

In [None]:
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

In [None]:
# The gensim.parsing.preprocessing filters each take a single string, so they
# are mapped over the documents one at a time; handing them a whole collection
# raises a TypeError. The chain starts from `docs` so that the cell does not
# depend on a `docs_clean` left over from an earlier run.
# Earlier experiments, kept for reference:
# docs_clean = preprocess_documents(docs)
# docs_clean = stem_text()
# docs_clean = docs.apply(strip_tags)
docs_clean = docs.apply(strip_punctuation)
docs_clean = docs_clean.apply(strip_multiple_whitespaces)
docs_clean = docs_clean.apply(strip_numeric)
docs_clean = docs_clean.apply(remove_stopwords)
docs_clean = docs_clean.apply(strip_short)

TypeError: decoding to str: need a bytes-like object, list found

### Manual

In [None]:
import re

In [None]:
# Start the manual pipeline from the raw documents, lower-cased.
docs_clean = docs.str.lower()

In [None]:
# Replace every run of characters other than ASCII letters and apostrophes with a single space.
docs_clean = docs_clean.str.replace("[^A-Za-z']+", ' ', regex=True)

In [None]:
%%time
# Tokenise: split each document on whitespace into a list of tokens.
docs_clean = docs_clean.str.split()

CPU times: user 314 ms, sys: 9.82 ms, total: 323 ms
Wall time: 344 ms


In [None]:
# Keep only the documents whose entry in `docs` has length >= 10.
# NOTE(review): the mask is computed on the untokenised `docs` (a character count if the
# entries are strings), not on the token lists in `docs_clean` — confirm that a character
# threshold rather than a token threshold is intended.
docs_clean = docs_clean[docs.apply(len) >= 10]

### Create corpus

In [None]:
docs_clean

804845     [woke, up, like, this, flawless, https, media,...
804846     [my, aunt, had, a, c, section, finally, becaus...
804847     [i, had, a, dream, like, that, a, week, before...
804848     [is, the, voice, input, box, ticked, i, use, a...
804849     [the, effects, of, xyrem, vary, heavily, enoug...
                                 ...                        
3262719    [thank, you, all, for, being, so, concerned, a...
3262720    [something, like, that, sort, of, happened, to...
3262721    [guys, this, is, actually, legit, today, i, wo...
3262722    [it, was, more, of, snarky, responses, and, ju...
3262723    [in, the, future, for, commenters, and, other,...
Name: body, Length: 49993, dtype: object

In [None]:
# export
class Corpus:
    """An iterable over tokenised documents (lists of str).

    Every call to ``iter()`` starts a fresh pass over the collection given at
    construction time, so the corpus can be traversed repeatedly as long as
    that collection can itself be iterated more than once (e.g. a list or a
    pandas Series).

    Parameters
    ----------
    docs_clean : iterable of list of str
        The tokenised documents to yield, in order.
    """
    def __init__(self, docs_clean):
        self.docs_clean = docs_clean

    def __iter__(self):
        # Read the instance's own documents. The previous version iterated the
        # global `docs_clean`, silently ignoring the constructor argument.
        yield from self.docs_clean
In [None]:
# Wrap the tokenised documents in the iterable that is passed to Word2Vec below.
corpus = Corpus(docs_clean)

## Train embeddings

In [None]:
%%time
# Train the embeddings on the tokenised corpus.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim >= 4.0 calls it
# `vector_size` — confirm which version this notebook is pinned to.
model = Word2Vec(
    sentences=corpus,
    size=300,
    window=3,
    min_count=5,
    workers=8,
)

CPU times: user 1min, sys: 879 ms, total: 1min
Wall time: 18.3 s


In [None]:
model.wv[LEX].shape

(300,)

## Evaluate embeddings

In [None]:
# Print the first ten vocabulary entries alongside the total vocabulary size.
for index, word in zip(range(10), model.wv.index2word):
    print(f"word #{index}/{len(model.wv.index2word)} is {word}")

word #0/22749 is i
word #1/22749 is the
word #2/22749 is and
word #3/22749 is to
word #4/22749 is a
word #5/22749 is was
word #6/22749 is my
word #7/22749 is of
word #8/22749 is in
word #9/22749 is it


In [None]:
# Ten nearest neighbours of LEX in the freshly trained model.
model.wv.most_similar(LEX, topn=10)

[('waking', 0.7163203358650208),
 ('wake', 0.6867837905883789),
 ('wakes', 0.640035092830658),
 ('woken', 0.601813018321991),
 ('picked', 0.5780839920043945),
 ('fucked', 0.5676211714744568),
 ('messed', 0.541072428226471),
 ('sobered', 0.5383647084236145),
 ('hooked', 0.5369008779525757),
 ('showed', 0.5246539115905762)]

In [None]:
model.wv.get_vector(LEX).shape

(300,)

In [None]:
model.wv.similarity('basketball', 'tennis')

0.61944455

In [None]:
model.wv.similarity('basketball', 'i')

-0.016407378

## Analysis

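### Nearest semantic neighbours

**Note:** `model_2012`, `model_2013` and `model_2020` are not created anywhere in this notebook. Presumably each one is the `model` obtained by re-running the cells above with `TIME` set to that year and keeping the result under a per-year name. That step has to be added (or the models loaded from disk) before the cells below can run on a fresh kernel.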

In [None]:
# Ten nearest neighbours of LEX in `model_2012`.
model_2012.wv.most_similar(LEX, topn=10)

[('wake', 0.7249749302864075),
 ('waking', 0.6852210760116577),
 ('wakes', 0.6623281240463257),
 ('woken', 0.6221311092376709),
 ('messed', 0.5709801912307739),
 ('picked', 0.5658282041549683),
 ('sobered', 0.5360423922538757),
 ('hooked', 0.5333197116851807),
 ('fucked', 0.5170655250549316),
 ('teared', 0.511203944683075)]

In [None]:
# Ten nearest neighbours of LEX in `model_2013`.
model_2013.wv.most_similar(LEX, topn=10)

[('waking', 0.7163203358650208),
 ('wake', 0.6867837905883789),
 ('wakes', 0.640035092830658),
 ('woken', 0.601813018321991),
 ('picked', 0.5780839920043945),
 ('fucked', 0.5676211714744568),
 ('messed', 0.541072428226471),
 ('sobered', 0.5383647084236145),
 ('hooked', 0.5369008779525757),
 ('showed', 0.5246539115905762)]

In [None]:
# Ten nearest neighbours of LEX in `model_2020`.
model_2020.wv.most_similar(LEX, topn=10)

[('liter', 0.5637393593788147),
 ('“woke', 0.5339571833610535),
 ('noic', 0.5338937044143677),
 ('wake', 0.5205501317977905),
 ('jerk', 0.5148869752883911),
 ('lol', 0.5020809769630432),
 ('“woke”', 0.48733627796173096),
 ('riser', 0.48462629318237305),
 ('woken', 0.4722602963447571),
 ('loudest', 0.4680590033531189)]

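### Semantic distances

**Caveat:** the vectors compared below come from separately trained models. Unless those models have been aligned to a common space beforehand (e.g. with orthogonal Procrustes), the cosine similarity between their vectors is not directly interpretable, because each training run produces its own arbitrary orientation of the embedding space.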

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import numpy as np

In [None]:
# Vector of the target word in `model_2012`; uses LEX rather than a repeated literal so the lookup follows the configured word.
woke_2012 = model_2012.wv.get_vector(LEX)

In [None]:
# Vector of the target word in `model_2013`; uses LEX rather than a repeated literal so the lookup follows the configured word.
woke_2013 = model_2013.wv.get_vector(LEX)

In [None]:
# Vector of the target word in `model_2020`; uses LEX rather than a repeated literal so the lookup follows the configured word.
woke_2020 = model_2020.wv.get_vector(LEX)

In [None]:
# Cosine similarity between the 2012 and 2020 vectors; each vector is lifted to a single-row 2-D array first.
cosine_similarity(woke_2012[np.newaxis, :], woke_2020[np.newaxis, :])

array([[0.22351377]], dtype=float32)

In [None]:
# Cosine similarity between the 2012 and 2013 vectors; each vector is lifted to a single-row 2-D array first.
cosine_similarity(woke_2012[np.newaxis, :], woke_2013[np.newaxis, :])

array([[0.56318325]], dtype=float32)