In [None]:
# Reload imported modules automatically before each cell executes, so edits to the
# project's own modules (the `neocov` imports below) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *

In [None]:
import pandas as pd

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Variables

In [None]:
# Input root, relative to the notebook's working directory.
DATA_DIR = '../data/'
# Directory holding the comment CSVs that are read per year below.
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'
# Output root for models and CSV exports.
# Fixed: this was '..out/' (missing slash), which names a directory called '..out'
# inside the working directory instead of a sibling of '../data/'.
OUT_DIR = '../out/'

## Read data

### Get file paths

In [None]:
# Year whose comments are loaded and used for training; also used in the saved model's file name.
YEAR = '2020'

In [None]:
# Collect the comment file paths for YEAR (helper is not defined here; presumably from the neocov star-imports).
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

### Read comments

In [None]:
%%time
# Read all comment files for the year; %%time reports how long the load takes.
comments = read_comm_csvs(comment_paths_year)

In [None]:
# Display the loaded comments.
comments

In [None]:
# Number of comments per subreddit, most frequent first.
comments.value_counts('subreddit')

## Pre-process comments

### Run preprocessing

In [None]:
%%time
# NOTE(review): this rebinds `comments` to the cleaned result, so re-running the cell
# feeds already-cleaned data back into `clean_comments` — confirm that is harmless.
comments = clean_comments(comments)

## Train models

### Create corpus

In [None]:
class Corpus:
    """Re-iterable wrapper around a collection of documents.

    Every call to `iter()` starts a fresh pass over `docs_clean` and yields its
    items one at a time, in order. The items are expected to be sentences
    (lists of str).
    """

    def __init__(self, docs_clean):
        # Keep a reference only; nothing is copied or consumed here.
        self.docs_clean = docs_clean

    def __iter__(self):
        # Delegate to the wrapped collection so each pass restarts from the beginning.
        yield from self.docs_clean

In [None]:
# Wrap the comment bodies for training (assumes 'body' holds token lists after cleaning — TODO confirm).
corpus = Corpus(comments['body'])

### Train model

In [None]:
from gensim.models import Word2Vec

In [None]:
def train_emb(corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=5):
    """Train a gensim Word2Vec model on `corpus` and return it.

    Args:
        corpus: Iterable of sentences, passed to `Word2Vec` as its first argument.
        MIN_COUNT: Forwarded as `min_count`.
        SIZE: Forwarded as `vector_size`.
        WORKERS: Forwarded as `workers`.
        WINDOW: Forwarded as `window`.

    Returns:
        The `Word2Vec` instance constructed with these settings.
    """
    hyperparams = dict(
        min_count=MIN_COUNT,
        vector_size=SIZE,
        workers=WORKERS,
        window=WINDOW,
    )
    return Word2Vec(corpus, **hyperparams)

In [None]:
%%time
# Train with the default hyperparameters of `train_emb`; %%time reports the training duration.
model = train_emb(corpus)

In [None]:
# Vocabulary size of the trained model.
len(model.wv.key_to_index)

### Save model

In [None]:
# Persist the model under OUT_DIR, named after the year it was trained on.
model.save(f'{OUT_DIR}models/{YEAR}.model')

### Load models

In [None]:
# Load the 2019 model from the location `model.save` writes to.
# Fixed: the path needs the f-prefix, otherwise the literal text '{OUT_DIR}' is used as a directory name.
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019.model')

In [None]:
# Load the 2020 model from the location `model.save` writes to.
# Fixed: the path needs the f-prefix, otherwise the literal text '{OUT_DIR}' is used as a directory name.
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020.model')

## Align models

In [None]:
import numpy as np

In [None]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Restrict two gensim (4.x API) word2vec models to their shared vocabulary, in place.

    Only the vocabulary shared by `m1` and `m2` is kept. If `words` is set (list or set),
    the vocabulary is additionally intersected with it. Rows are re-ordered from 0..N by
    descending frequency (= sum of the "count" attribute in both models), so that row i
    of `m1.wv.vectors` and row i of `m2.wv.vectors` belong to the same word.

    For each model the following are replaced:
        -- `wv.vectors`: rows of the kept words, in the new order
        -- `wv.key_to_index` / `wv.index_to_key`: the new word <-> row mapping
        -- `wv.expandos` (per-key attributes such as "count"): re-ordered with the rows,
           so `get_vecattr(word, "count")` keeps returning that word's own count
        -- `wv.norms`: reset so that normalised vectors are recomputed on next use

    Args:
        m1, m2: models exposing a gensim-4 style `.wv` (KeyedVectors).
        words: optional iterable of words to restrict the shared vocabulary to.

    Returns:
        The tuple `(m1, m2)`; both objects are modified in place.
    """
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)

    # No alignment necessary only if both models hold exactly the common vocabulary
    # AND already share the same row order; equal sets alone do not guarantee that
    # row i means the same word in both models.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
        return (m1, m2)

    # Read the counts while the original indices are still in place, then sort by
    # frequency (summed for both models), most frequent first.
    summed_counts = {
        w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count")
        for w in common_vocab
    }
    common_vocab = sorted(common_vocab, key=summed_counts.__getitem__, reverse=True)

    # Then for each model...
    for m in (m1, m2):
        # Old row index of every kept word, listed in the new order
        indices = np.asarray([m.wv.key_to_index[w] for w in common_vocab], dtype=int)

        # Replace the embedding matrix with the selected rows (fancy indexing copies)
        m.wv.vectors = m.wv.vectors[indices]

        # Per-key attributes are stored as arrays indexed by row: re-order them with the
        # vectors, otherwise counts would be looked up at stale positions.
        expandos = getattr(m.wv, "expandos", None)
        if expandos:
            for attr, values in list(expandos.items()):
                expandos[attr] = np.asarray(values)[indices]

        # Any cached norms belong to the old matrix; drop them so they are recomputed.
        if getattr(m.wv, "norms", None) is not None:
            m.wv.norms = None

        # Replace the word <-> index mappings
        m.wv.index_to_key = list(common_vocab)
        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}

        print(len(m.wv.key_to_index), len(m.wv.vectors))

    return (m1, m2)

In [None]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Procrustes-align `other_embed` onto `base_embed` so the same word can be compared across models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    Steps:
        1. Intersect the vocabularies via `intersection_align_gensim` (this modifies BOTH models in place).
        2. Take the SVD of `other_normed.T @ base_normed` and build the orthogonal matrix `u @ v`.
        3. Replace `other_embed.wv.vectors` with the original (un-normalised) vectors multiplied by that matrix.

    Args:
        base_embed: reference model; its vectors are only re-ordered/trimmed by step 1.
        other_embed: model whose vectors are rotated onto the base model's space.
        words: optional vocabulary restriction, forwarded to `intersection_align_gensim`.

    Returns:
        `other_embed`, modified in place.
    """
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # `get_normed_vectors()` relies on cached norms; recompute them explicitly so a cache
    # filled before the vectors were replaced (e.g. by an earlier similarity query) is not reused.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # cross-covariance between the two spaces
    m = other_vecs.T.dot(base_vecs)
    # np.linalg.svd returns (u, singular values, v-transposed)
    u, _, v = np.linalg.svd(m)
    # the orthogonal matrix that maps the other space onto the base space
    ortho = u.dot(v)
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed

In [None]:
# Align the 2020 model onto the 2019 model. Both models are modified in place
# (their vocabularies are reduced to the shared words), so the return value is not needed.
smart_procrustes_align_gensim(model_2019, model_2020)

190756 190756
190756 190756


<gensim.models.word2vec.Word2Vec at 0x141570640>

In [None]:
import pandas as pd

In [None]:
# Summarise the vocabulary size of each model and of their overlap.
# NOTE(review): this cell runs after the alignment step, which already trims both models
# to their shared vocabulary in place, so all three rows report the same number.
models_vocab = pd.DataFrame(
    data=[
        ['2019', len(model_2019.wv.key_to_index)],
        ['2020', len(model_2020.wv.key_to_index)],
        ['intersection', len(set(model_2019.wv.key_to_index).intersection(set(model_2020.wv.key_to_index)))]
    ],
    columns=['', 'words']
)

models_vocab

Unnamed: 0,Unnamed: 1,words
0,2019,190756
1,2020,190756
2,intersection,190756


In [None]:
# Export the vocabulary summary without the row index.
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

## Measure distances

In [None]:
from scipy import spatial

In [None]:
def measure_distances(model_1, model_2):
    """Compute the cosine distance between both models' vectors for every word of `model_1`.

    Every key in `model_1.wv.index_to_key` is looked up in both models, so the models
    must share that vocabulary (e.g. after alignment).

    Returns:
        DataFrame with one row per word and the columns
        'lex' (the word), 'dist_sem' (cosine distance between the two vectors),
        'freq_1' and 'freq_2' (the word's "count" attribute in model_1 / model_2).
    """
    kv_1, kv_2 = model_1.wv, model_2.wv
    rows = []
    for word in kv_1.index_to_key:
        rows.append([
            word,
            spatial.distance.cosine(kv_1[word], kv_2[word]),
            kv_1.get_vecattr(word, "count"),
            kv_2.get_vecattr(word, "count"),
        ])
    return pd.DataFrame(data=rows, columns=['lex', 'dist_sem', "freq_1", "freq_2"])

In [None]:
# Per-word cosine distance between the aligned 2019 and 2020 vectors.
distances = measure_distances(model_2019, model_2020)

In [None]:
# Rank all words by semantic distance, largest first (no frequency or token filtering yet).
distances\
    .sort_values('dist_sem', ascending=False)


Unnamed: 0,lex,dist_sem,freq_1,freq_2
181299,financiados,1.270406,8,9
165232,______________________________________________...,1.257892,9,10
181454,2ffireemblem,1.247719,8,9
189647,obedece,1.239514,7,8
126402,1281,1.218590,14,16
...,...,...,...,...
175,years,0.027202,175105,192696
171086,ppx_yo_dt_b_asin_title_o09_s00,0.025620,8,9
46607,imagestabilization,0.025614,85,92
144119,ppx_yo_dt_b_asin_title_o03_s00,0.018814,11,13


In [None]:
def get_sem_change_cands(distances, k=20, freq_min=1):
    """Select the `k` words with the largest semantic distance after basic filtering.

    Rows are kept only if
      * both 'freq_1' and 'freq_2' are strictly greater than `freq_min`,
      * 'lex' consists of alphabetic characters only, and
      * 'lex' is longer than 3 characters.

    Returns:
        The `k` remaining rows with the largest 'dist_sem', in descending order,
        with a fresh 0..n-1 index.
    """
    frequent = distances.loc[
        lambda d: (d['freq_1'] > freq_min) & (d['freq_2'] > freq_min)
    ]
    alphabetic = frequent.loc[lambda d: d['lex'].str.isalpha() == True]
    long_enough = alphabetic.loc[lambda d: d['lex'].str.len() > 3]
    top_k = long_enough.nlargest(k, 'dist_sem')
    return top_k.reset_index(drop=True)

In [None]:
# Top 100 candidates among words whose count exceeds 1000 in both models.
sem_change_cands = get_sem_change_cands(distances, k=100, freq_min=1000)
sem_change_cands

Unnamed: 0,lex,dist_sem,freq_1,freq_2
0,corona,0.927504,3553,3684
1,pandemic,0.912615,9504,9957
2,snapchatting,0.912304,2262,2345
3,dodo,0.864197,1651,1716
4,rubric,0.839424,1058,1109
...,...,...,...,...
95,neon,0.393886,1326,1391
96,villagers,0.393821,1274,1333
97,goose,0.391982,1197,1260
98,mute,0.391320,5323,5505


In [None]:
# Build the export table: add a 1-based rank column (renamed to an empty header),
# format the distance with two decimals as text, and use publication-style column names.
sem_change_cands_out = sem_change_cands\
    .nlargest(100, 'dist_sem')\
    .assign(index_1 = lambda df: df.index + 1)\
    .assign(dist_sem = lambda df: df['dist_sem'].round(2))\
    .assign(dist_sem = lambda df: df['dist_sem'].apply('{:.2f}'.format))\
    .rename({'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'}, axis=1)

sem_change_cands_out.head(20)

Unnamed: 0,Lexeme,SemDist,freq_1,freq_2,Unnamed: 5
0,corona,0.93,3553,3684,1
1,pandemic,0.91,9504,9957,2
2,snapchatting,0.91,2262,2345,3
3,dodo,0.86,1651,1716,4
4,rubric,0.84,1058,1109,5
5,nices,0.81,7457,7710,6
6,hyphens,0.81,1044,1096,7
7,asterisks,0.81,1085,1138,8
8,distancing,0.79,2910,3038,9
9,newbies,0.78,1566,1644,10


In [None]:
# Export rank, lexeme and distance only.
# Fixed: the path needs the f-prefix, otherwise the file is written below a directory
# literally named '{OUT_DIR}sem_change_cands.csv' relative to the working directory.
sem_change_cands_out.to_csv(
        f'{OUT_DIR}sem_change_cands.csv',
        columns=['', 'Lexeme', 'SemDist'],
        index=False
    )

## Inspect nearest neighbours

In [None]:
# Lexeme whose nearest neighbours are compared across the two models below.
LEX_NBS = 'distancing'

In [None]:
def get_nearest_neighbours_models(lex, freq_min, model_1, model_2, topn=100_000, k=10):
    """Return the `k` most similar, sufficiently frequent neighbours of `lex` in each model.

    Args:
        lex: Query word, passed to `model.wv.most_similar`.
        freq_min: Neighbours are kept only if their "count" attribute is strictly greater.
        model_1, model_2: Models to query; rows are labelled 1 and 2 in the 'model' column.
        topn: Number of raw neighbours requested from each model before frequency filtering.
        k: Number of neighbours returned per model (default 10).

    Returns:
        Tuple `(nbs_model_1, nbs_model_2)` of DataFrames with the columns
        'model', 'lex', 'similarity', 'freq', sorted by descending similarity.
        Row labels are positions in the combined candidate list of both models.
        Both frames are empty (with these columns) if no neighbour passes the filter.
    """
    columns = ['model', 'lex', 'similarity', 'freq']
    records = []
    for model_num, model in enumerate((model_1, model_2), start=1):
        for nb, similarity in model.wv.most_similar(lex, topn=topn):
            freq = model.wv.get_vecattr(nb, 'count')
            if freq > freq_min:
                records.append(
                    {'model': model_num, 'lex': nb, 'similarity': similarity, 'freq': freq}
                )

    # Without candidates there is nothing to rank; return typed-by-name empty frames
    # instead of failing on a frame that has no columns.
    if not records:
        return pd.DataFrame(columns=columns), pd.DataFrame(columns=columns)

    nbs_df = pd.DataFrame(records, columns=columns)
    # Select each model's rows directly; this keeps the 'model' column and the original
    # row labels without relying on groupby.apply retaining the grouping column.
    nbs_model_1 = nbs_df[nbs_df['model'] == 1].nlargest(k, 'similarity')
    nbs_model_2 = nbs_df[nbs_df['model'] == 2].nlargest(k, 'similarity')
    return nbs_model_1, nbs_model_2

In [None]:
# Nearest neighbours of LEX_NBS in the 2019 model (model 1) and the 2020 model (model 2),
# restricted to neighbours whose count exceeds 50.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS, 
    freq_min=50,
    model_1=model_2019, 
    model_2=model_2020
)

# Show both tables from a single cell.
display(
    nbs_model_1,
    nbs_model_2
)

Unnamed: 0,model,lex,similarity,freq
0,1,distanced,0.837815,309
1,1,disassociate,0.717895,93
2,1,detaching,0.685801,61
3,1,deluding,0.667654,104
4,1,bettering,0.633784,198
5,1,incriminate,0.629239,80
6,1,isolating,0.629057,685
7,1,distract,0.617911,1553
8,1,handicapping,0.6106,54
9,1,detach,0.603991,244


Unnamed: 0,model,lex,similarity,freq
36688,2,distanced,0.553989,326
36689,2,isolation,0.547227,2037
36690,2,gatherings,0.519332,921
36691,2,distance,0.511493,11355
36692,2,lockdowns,0.499619,991
36693,2,quarantines,0.487039,159
36694,2,lockdown,0.483064,4642
36695,2,masks,0.477628,8997
36696,2,precautions,0.469785,1237
36697,2,quarantine,0.468756,5225


In [None]:
# Export one neighbour table per year, named after the inspected lexeme.
# The row index is written too, since `index=False` is not passed.
nbs_model_1.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2019.csv')
nbs_model_2.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2020.csv')

# Inspect subreddits

## Read comments

In [None]:
# Year whose comments are inspected in this part of the notebook.
# Kept as a string, matching the YEAR = '2020' assignment used with the same
# path helper earlier in the notebook (the original assigned the int 2019 here).
# NOTE: this rebinds the global YEAR set earlier.
YEAR = '2019'

In [None]:
# Collect the comment file paths for the year selected above.
comments_paths = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

In [None]:
%%time
# Load the raw comments; this rebinds `comments`, replacing the frame loaded (and cleaned) earlier in the notebook.
comments = read_comm_csvs(comments_paths)
comments

CPU times: user 47.7 s, sys: 6.22 s, total: 54 s
Wall time: 54.8 s


Unnamed: 0,author,body,created_utc,id,subreddit
0,Avinse,Username Checks Out,2019-05-07 21:11:36,emrv0h9,AskReddit
1,KeepingDankMemesDank,"If this is a dank meme, **Upvote** this commen...",2019-05-07 21:11:37,emrv0jp,dankmemes
2,UhPhrasing,Just threaten them that you'll call the corpor...,2019-05-07 21:11:37,emrv0jq,golf
3,[deleted],[removed],2019-05-07 21:11:37,emrv0jr,Barca
4,EnergetikNA,"honestly, do you really wanna go through an en...",2019-05-07 21:11:37,emrv0js,soccer
...,...,...,...,...,...
9599974,DogBeersHadOne,Guy who made the crossbuck had one job. One go...,2019-06-19 21:59:59,erl9mvx,trains
9599975,VenomousCoffee,Page number? Picture of the page?,2019-06-19 21:59:59,erl9mvw,marvelstudios
9599976,Homerundude698,So sexy baby,2019-06-19 21:59:59,erl9mvv,gonewild30plus
9599977,CircusRama,Removed for Rule 8,2019-06-19 21:59:59,erl9mwa,fivenightsatfreddys


TODO: filter comments

- [ ] remove duplicates
- [ ] remove bots

## Get subreddit counts

In [None]:
def get_subr_counts(comments):
    """Count comments per subreddit.

    Returns:
        DataFrame indexed by 'subreddit' with a single column 'comments_num',
        sorted from the most to the least active subreddit.
    """
    per_subreddit = comments.groupby('subreddit')
    counted = per_subreddit.agg(comments_num = ('subreddit', 'count'))
    return counted.sort_values('comments_num', ascending=False)

In [None]:
# Comment counts per subreddit for the loaded year, largest first.
subr_counts = get_subr_counts(comments)

In [None]:
import altair as alt

In [None]:
def plot_subr_counts(subr_counts, k=20):
    """Build a horizontal bar chart of the first `k` rows of `subr_counts`.

    Args:
        subr_counts: Frame indexed by 'subreddit' with a 'comments_num' column;
            rows are taken in their existing order.
        k: Number of leading rows to plot.

    Returns:
        The Altair chart: 'comments_num' on x, 'subreddit' on y (sorted by descending x).
    """
    # Altair needs the subreddit as a regular column, not as the index.
    top_k = subr_counts.reset_index().iloc[:k]
    bars = alt.Chart(top_k).mark_bar()
    return bars.encode(
        x=alt.X('comments_num:Q'),
        y=alt.Y('subreddit:N', sort='-x'),
    )

In [None]:
# Chart the 20 subreddits with the most comments and render it inline.
subr_counts_plt = plot_subr_counts(subr_counts, k=20)
subr_counts_plt

In [None]:
# Suffix inserted into the exported chart's file name.
subr_counts_fname = 'Covid'

In [None]:
# Export the chart as SVG.
# NOTE(review): this writes to 'out/' relative to the working directory, whereas the other
# exports in this notebook build their paths from OUT_DIR — confirm which location is intended.
subr_counts_plt.save(f'out/subr_counts_plt_{subr_counts_fname}.svg', scale_factor=2.0)

In [None]:
# Spot-check 10 randomly drawn comments from a single subreddit.
# No random_state is set, so the sample differs between runs.
comments\
    .query('subreddit == "hdsportsfeedtv"')\
     .sample(10)