In [None]:
# default_exp type_emb
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

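# Type embeddings

Train Word2Vec embeddings on comments that contain a target lexeme and compare the lexeme's nearest neighbours across years (diachronic) and across subreddits (social).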

## Imports

In [None]:
from gensim.models import Word2Vec

In [None]:
import pandas as pd

## Variables

In [None]:
# Target lexeme: passed to get_fpaths() to locate the data and used as the
# query word when the trained vectors are inspected.
LEX = 'Anglo-Saxon'
# Calendar year that is kept when the comments are filtered by date below.
TIME = 2020

## Read data

In [None]:
from socemb.read_data import *

In [None]:
# get_fpaths is not defined in this notebook; presumably it is provided by the
# star import of socemb.read_data above — TODO confirm.
fpaths = get_fpaths(LEX, source='local')

In [None]:
%%time
# Load the comments behind `fpaths`. Later cells use the result as a pandas
# DataFrame with 'created_utc', 'subreddit' and 'body' columns.
comments = read_comments(fpaths)

CPU times: user 1.29 s, sys: 117 ms, total: 1.4 s
Wall time: 1.42 s


## Split data

### Split into temporal bins

In [None]:
# Parse the raw timestamps into a new 'date' column; values that cannot be
# parsed become NaT instead of raising (errors='coerce').
parsed_dates = pd.to_datetime(comments['created_utc'], errors='coerce')
comments = comments.assign(date=parsed_dates)

In [None]:
# Number of comments per calendar year, most frequent year first.
comments['date'].dt.year.value_counts()

2020.0    37744
2019.0    20995
2018.0    17447
2017.0    15157
2016.0    12187
2015.0    11190
2014.0     9022
2013.0     6667
2012.0     4446
2011.0     2073
2010.0     1122
2009.0      529
2008.0      245
2007.0       82
2006.0       19
Name: date, dtype: int64

In [None]:
# Temporal split: keep only the comments written in the year selected by TIME.
in_target_year = comments['date'].dt.year == TIME
comments = comments[in_target_year]

### Split by communities

In [None]:
# The 20 subreddits that contribute the most comments after the year filter.
(
    comments
    .value_counts('subreddit')
    .head(20)
)

subreddit
AskReddit                1329
AskHistorians             919
HistoryMemes              832
todayilearned             704
CrusaderKings             634
europe                    573
PoliticalCompassMemes     567
unpopularopinion          552
worldnews                 549
AskEurope                 544
history                   502
assassinscreed            485
ukpolitics                479
politics                  413
france                    397
MapPorn                   380
heathenry                 342
tolkienfans               338
AncestryDNA               325
TikTokCringe              303
dtype: int64

In [None]:
# Community whose comments are analysed; set to 'AskHistorians' for the other
# social model. A single parameterised filter replaces two consecutive
# hard-coded ones: applied back to back to the same frame (politics, then
# AskHistorians) they leave no rows at all when the notebook is run top to bottom.
SUBREDDIT = 'politics'
comments = comments.query('subreddit == @SUBREDDIT')

## Pre-processing

In [None]:
# Raw comment texts; every pre-processing step below starts from this Series.
docs = comments['body']

In [None]:
# Discard comments without a body so the string operations below only see text.
docs = docs[docs.notna()]

### Gensim

In [None]:
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

In [None]:
# gensim's preprocessing helpers each take ONE string and return a string, so
# they have to be mapped over the documents; calling them on the whole
# collection raises a TypeError. The chain starts from `docs`, so the cell no
# longer depends on a `docs_clean` left over from an earlier run.
# (strip_tags is imported as well and can be prepended to drop HTML markup.)
gensim_filters = (
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
)
docs_clean = docs
for text_filter in gensim_filters:
    docs_clean = docs_clean.map(text_filter)

TypeError: decoding to str: need a bytes-like object, list found

### Manual

In [None]:
import re

In [None]:
# The manual pipeline starts again from the raw `docs`, replacing whatever
# `docs_clean` held before. Lower-casing comes first so that the replacements
# below only need to match lower-case spellings.
docs_clean = docs.str.lower()

In [None]:
# Collapse both spellings of the target lexeme into one placeholder token so
# that it survives the punctuation stripping in the next step.
for variant in ('anglo-saxon', 'anglo saxon'):
    docs_clean = docs_clean.str.replace(variant, 'anglosaxon')

In [None]:
# Replace every run of characters other than ASCII letters and apostrophes
# with a single space; this removes digits, punctuation (including hyphens)
# and any non-ASCII characters.
docs_clean = docs_clean.str.replace("[^A-Za-z']+", ' ', regex=True)

In [None]:
# Restore the canonical spelling so the token matches LEX. This is a substring
# replacement, so e.g. 'anglosaxons' becomes 'Anglo-Saxons'.
docs_clean = docs_clean.str.replace('anglosaxon', 'Anglo-Saxon')

In [None]:
%%time
# Tokenise on whitespace: each document becomes a list of string tokens.
docs_clean = docs_clean.str.split()

CPU times: user 4.29 ms, sys: 45 µs, total: 4.33 ms
Wall time: 4.34 ms


In [None]:
# Drop very short documents: keep those with at least 10 tokens. The length is
# taken from the token lists in `docs_clean`; measuring the raw strings in
# `docs` would count characters instead, which every comment containing the
# target lexeme passes trivially.
docs_clean = docs_clean[docs_clean.apply(len) >= 10]

### Create corpus

In [None]:
# Preview the tokenised documents (pandas truncates the display).
docs_clean

64        [i, have, an, idea, how, about, the, republica...
84        [iirc, they, identified, as, wasps, white, Ang...
327       [yeah, from, what, i've, read, the, main, prob...
366       [by, the, s, the, american, bar, association, ...
447       [is, rudi, even, white, he, is, italian, the, ...
                                ...                        
138332    [i, don't, know, where, you, got, that, inform...
138443    [consider, that, many, countries, like, china,...
138616    [for, some, people, white, wasp, which, stands...
138639    [how, funny, same, exact, mindset, as, wasps, ...
138893    [what, is, it, s, as, simple, as, that, two, p...
Name: body, Length: 413, dtype: object

In [None]:
# export
class Corpus:
    """Re-iterable collection of tokenised documents.

    Each iteration yields one document as a list of str tokens. Wrapping the
    documents in a class (rather than passing a one-shot generator) lets a
    consumer such as Word2Vec iterate over the data more than once.

    Parameters
    ----------
    docs_clean : iterable of list of str
        The tokenised documents; must itself support repeated iteration.
    """
    def __init__(self, docs_clean):
        self.docs_clean = docs_clean

    def __iter__(self):
        # Read from the instance, not from a notebook-level `docs_clean`:
        # the global lookup ignored the constructor argument and failed
        # outright wherever no such global existed (e.g. in the exported module).
        yield from self.docs_clean

In [None]:
# Wrap the filtered token lists so they can be handed to Word2Vec below.
corpus = Corpus(docs_clean)

## Train embeddings

In [None]:
%%time
# Training hyperparameters, collected in one place so they are easy to tune.
# NOTE(review): `size` looks like the gensim 3.x keyword (gensim 4 uses
# `vector_size`) — confirm against the installed version.
w2v_params = {
    'size': 300,     # dimensionality of the word vectors
    'window': 3,     # maximum distance between target and context word
    'min_count': 5,  # ignore words that occur fewer than 5 times
    'workers': 8,    # number of training threads
}
model = Word2Vec(corpus, **w2v_params)

CPU times: user 557 ms, sys: 7.88 ms, total: 564 ms
Wall time: 337 ms


In [None]:
# Sanity check: the target lexeme made it into the vocabulary and its vector
# has the dimensionality requested at training time.
model.wv[LEX].shape

(300,)

## Evaluate embeddings

In [None]:
# Print the first ten vocabulary entries, each prefixed with its position and
# the total vocabulary size.
for index, word in enumerate(model.wv.index2word[:10]):
    print(f"{index}/{len(model.wv.index2word)}: {word}")

0/1356: the
1/1356: of
2/1356: and
3/1356: to
4/1356: a
5/1356: in
6/1356: that
7/1356: is
8/1356: it
9/1356: you


In [None]:
# The 20 nearest neighbours of the target lexeme in the model trained above.
model.wv.most_similar(LEX, topn=20)

[('white', 0.9999111294746399),
 ('protestants', 0.9998713731765747),
 ('men', 0.9998622536659241),
 ('only', 0.9998579025268555),
 ('were', 0.9998517036437988),
 ('wasp', 0.9998478889465332),
 ('who', 0.9998472929000854),
 ('americans', 0.9998464584350586),
 ('people', 0.9998452663421631),
 ('so', 0.9998417496681213),
 ('from', 0.9998407363891602),
 ('for', 0.9998400211334229),
 ('not', 0.9998400211334229),
 ("that's", 0.9998394250869751),
 ('by', 0.99983811378479),
 ('all', 0.9998378753662109),
 ('power', 0.9998376369476318),
 ('being', 0.9998376369476318),
 ('are', 0.9998363256454468),
 ('man', 0.999836266040802)]

In [None]:
# Out-of-domain sanity check. `similarity` raises KeyError for words that are
# not in the vocabulary, so test membership first; the cell shows nothing
# (None) when either word is missing instead of aborting a full run.
word_pair = ('basketball', 'tennis')
model.wv.similarity(*word_pair) if all(w in model.wv for w in word_pair) else None

KeyError: "word 'basketball' not in vocabulary"

In [None]:
# NOTE(review): the KeyError recorded for the previous cell says 'basketball'
# is out of vocabulary, so this lookup would fail on the same model; the value
# shown below presumably comes from a different model — TODO confirm.
model.wv.similarity('basketball', 'i')

-0.02298256

## Analysis

### 'woke'

#### Nearest semantic neighbours

In [None]:
# Nearest neighbours of 'woke' in the 2012 model. The word is spelled out
# because LEX is set to a different lexeme in this notebook; topn=10 replaces
# slicing the default 10 results.
# NOTE(review): model_2012 is not created anywhere in the visible notebook —
# it has to be trained or loaded before this cell can run.
model_2012.wv.most_similar('woke', topn=10)

NameError: name 'model_2012' is not defined

In [None]:
# Nearest neighbours of 'woke' in the 2013 model. The word is spelled out
# because LEX is set to a different lexeme in this notebook; topn=10 replaces
# slicing the default 10 results.
# NOTE(review): model_2013 is not created anywhere in the visible notebook —
# it has to be trained or loaded before this cell can run.
model_2013.wv.most_similar('woke', topn=10)

[('waking', 0.7163203358650208),
 ('wake', 0.6867837905883789),
 ('wakes', 0.640035092830658),
 ('woken', 0.601813018321991),
 ('picked', 0.5780839920043945),
 ('fucked', 0.5676211714744568),
 ('messed', 0.541072428226471),
 ('sobered', 0.5383647084236145),
 ('hooked', 0.5369008779525757),
 ('showed', 0.5246539115905762)]

In [None]:
# Nearest neighbours of 'woke' in the 2020 model. The word is spelled out
# because LEX is set to a different lexeme in this notebook; topn=10 replaces
# slicing the default 10 results.
# NOTE(review): model_2020 is not created anywhere in the visible notebook —
# it has to be trained or loaded before this cell can run.
model_2020.wv.most_similar('woke', topn=10)

[('liter', 0.5637393593788147),
 ('“woke', 0.5339571833610535),
 ('noic', 0.5338937044143677),
 ('wake', 0.5205501317977905),
 ('jerk', 0.5148869752883911),
 ('lol', 0.5020809769630432),
 ('“woke”', 0.48733627796173096),
 ('riser', 0.48462629318237305),
 ('woken', 0.4722602963447571),
 ('loudest', 0.4680590033531189)]

#### Semantic distances

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import numpy as np

In [None]:
# Vector for 'woke' in the 2012 model, fetched with the same indexing form
# that is used for `model.wv[LEX]` earlier in the notebook.
woke_2012 = model_2012.wv['woke']

In [None]:
# Vector for 'woke' in the 2013 model, fetched with the same indexing form
# that is used for `model.wv[LEX]` earlier in the notebook.
woke_2013 = model_2013.wv['woke']

In [None]:
# Vector for 'woke' in the 2020 model, fetched with the same indexing form
# that is used for `model.wv[LEX]` earlier in the notebook.
woke_2020 = model_2020.wv['woke']

In [None]:
# Cosine similarity between the 2012 and 2020 vectors of 'woke'. The function
# expects 2-D inputs, so each vector is given a leading axis of length 1.
cosine_similarity(woke_2012[np.newaxis, :], woke_2020[np.newaxis, :])

array([[0.22351377]], dtype=float32)

In [None]:
# Cosine similarity between the 2012 and 2013 vectors of 'woke'. The function
# expects 2-D inputs, so each vector is given a leading axis of length 1.
cosine_similarity(woke_2012[np.newaxis, :], woke_2013[np.newaxis, :])

array([[0.56318325]], dtype=float32)

### 'Anglo-Saxon'

#### diachronic

In [None]:
# NOTE(review): this only binds a second name to whatever `model` currently
# holds; nothing is retrained here. Presumably the notebook was re-run with
# TIME = 2015 before this cell was executed — TODO confirm.
model_anglo_saxon_2015 = model

In [None]:
# The 20 nearest neighbours of the target lexeme in the model bound to
# model_anglo_saxon_2015.
model_anglo_saxon_2015.wv.most_similar(LEX, topn=20)

[("'Anglo-Saxon'", 0.5941123962402344),
 ('saxon', 0.5766994953155518),
 ('anglo', 0.565537691116333),
 ('nordic', 0.5133814811706543),
 ('scandinavian', 0.5125651359558105),
 ('Anglo-Saxons', 0.5094699263572693),
 ('christian', 0.4999629557132721),
 ('celtic', 0.4792066812515259),
 ('norman', 0.4619556665420532),
 ('speaking', 0.4618777632713318),
 ('reformation', 0.45505085587501526),
 ('english', 0.44923704862594604),
 ('briton', 0.4431315064430237),
 ('germanic', 0.44106483459472656),
 ('oral', 0.4410490095615387),
 ('european', 0.44082000851631165),
 ('norse', 0.43608760833740234),
 ('scottish', 0.4293793737888336),
 ('umbrella', 0.42898502945899963),
 ('french', 0.4280967116355896)]

In [None]:
# NOTE(review): this only binds a second name to whatever `model` currently
# holds; nothing is retrained here. Presumably the notebook was re-run with
# TIME = 2020 before this cell was executed — TODO confirm.
model_anglo_saxon_2020 = model

In [None]:
# The 20 nearest neighbours of the target lexeme in the model bound to
# model_anglo_saxon_2020.
model_anglo_saxon_2020.wv.most_similar(LEX, topn=20)

[('anglo', 0.6091771125793457),
 ('saxon', 0.5825512409210205),
 ('scandinavian', 0.5350757837295532),
 ('nordic', 0.5227609872817993),
 ('norman', 0.5140793323516846),
 ('Anglo-Saxons', 0.5086712837219238),
 ('celtic', 0.4723518192768097),
 ('christian', 0.4697306156158447),
 ('germanic', 0.46731916069984436),
 ('briton', 0.45960360765457153),
 ('english', 0.455077201128006),
 ('danish', 0.4478153884410858),
 ('specifically', 0.44695770740509033),
 ('ethnicity', 0.4322441816329956),
 ('slavic', 0.42445483803749084),
 ('unknown', 0.4237750768661499),
 ('speaking', 0.4217537045478821),
 ('norse', 0.42002302408218384),
 ('viking', 0.41853898763656616),
 ('arab', 0.41817015409469604)]

#### social

In [None]:
# NOTE(review): this only binds a second name to whatever `model` currently
# holds; nothing is retrained here. Presumably the notebook was re-run with
# the 'politics' subreddit filter before this cell was executed — TODO confirm.
model_anglo_saxon_politics = model

In [None]:
# The 20 nearest neighbours of the target lexeme in the model bound to
# model_anglo_saxon_politics.
model_anglo_saxon_politics.wv.most_similar(LEX, topn=20)

[('white', 0.9457637071609497),
 ('appropo', 0.9342678785324097),
 ('american', 0.9291160106658936),
 ('normalizing', 0.9231254458427429),
 ('supremacy', 0.9224283695220947),
 ('wasp', 0.9215584993362427),
 ('males', 0.9192890524864197),
 ('uprising', 0.9160538911819458),
 ('christian', 0.9142899513244629),
 ('male', 0.9067474603652954),
 ('protestant', 0.906610369682312),
 ('non', 0.9024924039840698),
 ('heterosexual', 0.9013996124267578),
 ('supremacist', 0.8997038006782532),
 ('protestants', 0.8967183828353882),
 ('heritage', 0.8873857855796814),
 ('entails', 0.8841987252235413),
 ('anglo', 0.8823980093002319),
 ('cisgendered', 0.8809551000595093),
 ('swamp', 0.8800334930419922)]

In [None]:
# NOTE(review): this only binds a second name to whatever `model` currently
# holds; nothing is retrained here. Presumably the notebook was re-run with
# the 'AskHistorians' subreddit filter before this cell was executed — TODO confirm.
model_anglo_saxon_historians = model

In [None]:
# The 20 nearest neighbours of the target lexeme in the model bound to
# model_anglo_saxon_historians.
model_anglo_saxon_historians.wv.most_similar(LEX, topn=20)

[('saxon', 0.6367413997650146),
 ('medieval', 0.5913364291191101),
 ("'Anglo-Saxon'", 0.5791711211204529),
 ("thelweard's", 0.569437563419342),
 ('nantes', 0.5514806509017944),
 ('peterborough', 0.5486891269683838),
 ('norman', 0.5459719896316528),
 ('livonian', 0.5454285144805908),
 ('scandinavian', 0.5416566133499146),
 ('christian', 0.5314390659332275),
 ('compilers', 0.5277916193008423),
 ('shortcomings', 0.5209999084472656),
 ("fantosme's", 0.5180712938308716),
 ('tumble', 0.5163730382919312),
 ('anglo', 0.5038136839866638),
 ('throes', 0.4838976263999939),
 ('metrical', 0.48360365629196167),
 ('germanic', 0.4821174144744873),
 ('mediaeval', 0.48090994358062744),
 ('primarily', 0.47648146748542786)]