In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *
from neocov.type_emb import *
from neocov.communities import *

In [None]:
from gensim.models import Word2Vec
import pandas as pd
from pathlib import Path

In [None]:
# Root folders are plain strings with a trailing slash because later cells
# build paths by f-string concatenation (e.g. f'{OUT_DIR}models/...').
DATA_DIR = '../data/'
# Comment dumps that `get_comments_paths_year` searches together with YEAR.
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'
# Destination for trained models and the exported CSV/PNG artefacts.
OUT_DIR = '../out/'

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Variables

## Semantic change

### Read data

#### Get file paths

In [None]:
YEAR = '2019'

In [None]:
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

#### Read comments

In [None]:
%%time
comments = read_comm_csvs(comment_paths_year)

In [None]:
comments

### Pre-process comments

In [None]:
%%time
# Rebinds `comments` from the raw frame to the cleaned one, so the raw
# comments are no longer reachable after this cell has run.
# NOTE(review): re-running this cell alone would pass already-cleaned data
# back into `clean_comments` — confirm that is harmless.
comments = clean_comments(comments)

### Train models

#### Create corpus

In [None]:
corpus = Corpus(comments['body'])

#### Train model

In [None]:
%%time
model = train_model(corpus)

In [None]:
len(model.wv.key_to_index)

#### Save model

In [None]:
model.save(f'{OUT_DIR}models/{YEAR}.model')

#### Load models

In [None]:
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019.model')

In [None]:
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020.model')

### Align models

In [None]:
# Record each model's own vocabulary size before alignment; the later
# `models_vocab` table reports these next to the post-alignment size.
model_2019_vocab = len(model_2019.wv.key_to_index)
model_2020_vocab = len(model_2020.wv.key_to_index)

In [None]:
# Align the two yearly models so their vectors can be compared.
# The models are modified in place: afterwards `model_2019` only holds the
# vocabulary reported as 'intersection' in `models_vocab`, which is why the
# original sizes were recorded in the previous cell.
# NOTE(review): which model's vectors get rotated is decided inside neocov — confirm there.
smart_procrustes_align_gensim(model_2019, model_2020)

190756 190756
190756 190756


<gensim.models.word2vec.Word2Vec at 0x184aee710>

In [None]:
# Sanity check after alignment: both models must expose the same number of
# words and hold exactly one vector per word. Comparing all four sizes also
# catches a pair where only one side was trimmed.
assert (
    len(model_2019.wv.key_to_index)
    == len(model_2019.wv.vectors)
    == len(model_2020.wv.key_to_index)
    == len(model_2020.wv.vectors)
), 'aligned models must share one vocabulary size, with one vector per word'

In [None]:
# Vocabulary size of each yearly model before alignment, plus the size of
# the vocabulary left in the aligned 2019 model.
models_vocab = pd.DataFrame({
    'Model': ['2019', '2020', 'intersection'],
    'Words': [
        model_2019_vocab,
        model_2020_vocab,
        len(model_2019.wv.key_to_index),
    ],
})

models_vocab

Unnamed: 0,Model,Words
0,2019,252564
1,2020,277707
2,intersection,190756


In [None]:
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

### Measure distances

In [None]:
distances = measure_distances(model_2019, model_2020)

In [None]:
# Show all words ordered from the largest to the smallest semantic distance.
distances.sort_values(by='dist_sem', ascending=False)


Unnamed: 0,lex,dist_sem,freq_1,freq_2
182174,financiados,1.270406,7,9
164003,______________________________________________...,1.257892,9,10
181373,2ffireemblem,1.247719,8,9
190665,obedece,1.239514,7,8
126286,1281,1.218590,14,16
...,...,...,...,...
175,years,0.027202,175105,192696
174923,ppx_yo_dt_b_asin_title_o09_s00,0.025620,8,9
46509,imagestabilization,0.025614,85,93
144055,ppx_yo_dt_b_asin_title_o03_s00,0.018814,11,13


TODO: filter by true type frequency; `Gensim`'s type frequency seems incorrect; it probably reflects frequency ranks instead of total counts.

In [None]:
def get_sem_change_cands(distances, k=10, freq_min=1):
    """Pick the `k` lexemes with the largest semantic distance.

    A row of `distances` qualifies only if

    - both `freq_1` and `freq_2` are strictly greater than `freq_min`,
    - its `lex` consists of alphabetic characters only, and
    - its `lex` is longer than three characters.

    Returns the qualifying rows with the highest `dist_sem`, in descending
    order, re-indexed from 0.
    """
    in_freq_range = (distances['freq_1'] > freq_min) & (distances['freq_2'] > freq_min)
    frequent = distances[in_freq_range]
    # `== True` turns a missing result of `.str.isalpha()` into False.
    alphabetic = frequent[frequent['lex'].str.isalpha() == True]
    long_enough = alphabetic[alphabetic['lex'].str.len() > 3]
    top_k = long_enough.nlargest(k, 'dist_sem')
    return top_k.reset_index(drop=True)

In [None]:
k = 20
freq_min = 100

# Reuse the helper defined in the previous cell instead of repeating its
# filter chain, so the candidate criteria live in exactly one place.
sem_change_cands = get_sem_change_cands(distances, k=k, freq_min=freq_min)

sem_change_cands

Unnamed: 0,lex,dist_sem,freq_1,freq_2
0,maskless,1.100272,118,127
1,lockdowns,1.070362,940,991
2,sunsetting,1.039729,111,120
3,chaz,1.010383,190,202
4,childe,0.957373,209,222
5,cerb,0.957321,315,333
6,megalodon,0.937414,752,792
7,spreader,0.932299,164,175
8,corona,0.927504,3553,3684
9,ventilators,0.925241,384,405


In [None]:
# Shape the candidates for export: add a column holding index + 1 (later
# renamed to an empty header) and render the distance with two decimals.
sem_change_cands_out = (
    sem_change_cands
    .nlargest(100, 'dist_sem')
    .assign(
        index_1=lambda df: df.index + 1,
        dist_sem=lambda df: df['dist_sem'].round(2).map('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

In [None]:
# Export only the empty-header rank column, the lexeme and the formatted
# distance; the frequency columns are left out of the file.
sem_change_cands_out.to_csv(
        f'{OUT_DIR}sem_change_cands.csv',
        columns=['', 'Lexeme', 'SemDist'],
        index=False
    )

### Inspect nearest neighbours of lexemes

In [None]:
LEX_NBS = 'lockdown'

In [None]:
# Look up the neighbours of LEX_NBS in the 2019 and the 2020 model.
# Requires `model_2019` and `model_2020` from the "Load models" cells; the
# NameError recorded below comes from running this cell without them.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS, 
    freq_min=1,
    model_1=model_2019, 
    model_2=model_2020,
    k=10
)

display(
    nbs_model_1,
    nbs_model_2
)

NameError: name 'model_2019' is not defined

In [None]:
# One neighbour file per model, named after the inspected lexeme.
# NOTE(review): unlike the other CSV exports in this notebook, these calls do
# not pass index=False, so the DataFrame index is written as a first column —
# confirm that is intended.
nbs_model_1.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2019.csv')
nbs_model_2.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2020.csv')

## Social semantic variation

### Inspect subreddits

#### Read comments

In [None]:
# Build on DATA_DIR (like COMMENTS_DIAC_DIR does) so the data root is
# configured in one place only; resolves to the same '../data/comments/lexeme/'.
comments_dir_path = Path(f'{DATA_DIR}comments/lexeme/')

In [None]:
# Sort the matches: the order in which `glob` yields files is not guaranteed,
# and a fixed order keeps the loaded comments reproducible across runs.
# The pattern contains no placeholders, so it is a plain string, not an f-string.
comments_paths = sorted(comments_dir_path.glob('Covid*.csv'))

In [None]:
%%time
comments = read_comm_csvs(comments_paths)
comments

CPU times: user 34.2 s, sys: 3.62 s, total: 37.8 s
Wall time: 37.9 s


Unnamed: 0,author,body,created_utc,id,subreddit
0,Gloob_Patrol,I assume you work too so he's feeling like he ...,2020-09-08 18:53:06,g4guhl5,LongDistance
1,amtrusc,"Strep swab and culture negative, I’m sure? Cou...",2020-09-08 18:53:08,g4guhsm,tonsilstones
2,Ephuntz,&gt;Good point. My apologies. It's just becomi...,2020-09-08 18:53:09,g4guhua,Winnipeg
3,cstransfer,Have you noticed an increase of people going e...,2020-09-08 18:53:09,g4guhu4,financialindependence
4,IlliniWhoDat,"I haven't. I have seen it online, but haven't...",2020-09-08 18:53:13,g4gui6o,KoreanBeauty
...,...,...,...,...,...
3800760,willw,Last group pre COVID!,2020-07-01 21:59:48,fwmqfbj,jawsurgery
3800761,Daikataro,"If everyone is infected with COVID, new cases ...",2020-07-01 21:59:49,fwmqff2,politics
3800762,StabYourBloodIntoMe,&gt; If the mortality rate is actually decreas...,2020-07-01 21:59:50,fwmqfib,dataisbeautiful
3800763,Shorse_rider,I was a freelancer until covid and earned more...,2020-07-01 21:59:55,fwmqfuw,AskWomen


TODO: filter comments

- [ ] remove duplicates
- [ ] remove bots

#### Get subreddit counts

In [None]:
subr_counts = get_subr_counts(comments)

In [None]:
subr_counts_plt = plot_subr_counts(subr_counts, k=20)
subr_counts_plt

In [None]:
subr_counts_plt.save(f'{OUT_DIR}subr_counts.png', scale_factor=2.0)

### Train models

In [None]:
# Derived from DATA_DIR, like COMMENTS_DIAC_DIR, so the data root is set in
# one place; the value is still '../data/comments/subr/'.
COMMENTS_DIR_SUBR = f'{DATA_DIR}comments/subr/'

In [None]:
SUBR = 'Coronavirus'

In [None]:
fpaths = get_comments_paths_subr(COMMENTS_DIR_SUBR, SUBR)

In [None]:
%%time
comments = read_comm_csvs(fpaths)

CPU times: user 24 s, sys: 2.71 s, total: 26.7 s
Wall time: 26.8 s


In [None]:
%%time
comments = clean_comments(comments)

conv_to_lowerc       (4121144, 5) 0:00:03.492902      
rm_punct             (4121144, 5) 0:00:26.037508      
tokenize             (4121144, 5) 0:03:10.999946      
rem_short_comments   (3462555, 5) 0:00:58.288521      
CPU times: user 1min 4s, sys: 1min 29s, total: 2min 33s
Wall time: 4min 57s


In [None]:
comments

Unnamed: 0,author,body,created_utc,id,subreddit
0,bikbar1,"[gt, but, it, s, still, impossible, to, hide, ...",2020-09-06 10:11:45,g47wejw,Coronavirus
1,righteousprovidence,"[my, guess, is, americans, don, t, see, weldin...",2020-09-06 10:13:17,g47whmx,Coronavirus
2,liriodendron1,"[i, dont, want, compensation, i, want, it, to,...",2020-09-06 10:13:27,g47whzg,Coronavirus
3,ArbitraryBaker,"[except, the, testing, is, flawed, too, have, ...",2020-09-06 10:14:02,g47wj5l,Coronavirus
4,mogambis,"[little, did, he, know, she, is, an, it]",2020-09-06 10:15:33,g47wm3e,Coronavirus
...,...,...,...,...,...
4121139,LouQuacious,"[it, s, the, 21st, century, no, excuse, for, n...",2020-12-14 22:59:46,gfv1k7s,Coronavirus
4121140,immibis,"[covid, has, a, 1, in, 500, side, effect, of, ...",2020-12-14 22:59:50,gfv1kho,Coronavirus
4121141,starlordbg,"[i, would, personally, wait, a, few, years, to...",2020-12-14 22:59:53,gfv1kqa,Coronavirus
4121142,ihadanamebutforgot,"[cool, dude, lemme, know, when, science, eradi...",2020-12-14 22:59:57,gfv1kzb,Coronavirus


In [None]:
corpus = Corpus(comments['body'])

In [None]:
%%time
model = train_model(corpus)

CPU times: user 27min 59s, sys: 45.9 s, total: 28min 45s
Wall time: 6min 48s


In [None]:
len(model.wv.key_to_index)

94816

In [None]:
model.save(f'{OUT_DIR}models/{SUBR}.model')

### Load models

In [None]:
SUBRS = ['Coronavirus', 'conspiracy']

In [None]:
model_1 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[0]}.model')

In [None]:
model_2 = Word2Vec.load(f'{OUT_DIR}models/{SUBRS[1]}.model')

### Align models

In [None]:
# Record each subreddit model's own vocabulary size before alignment; the
# later `models_vocab` table reports these next to the post-alignment size.
model_1_vocab = len(model_1.wv.key_to_index)
model_2_vocab = len(model_2.wv.key_to_index)

In [None]:
# Align the two subreddit models so their vectors can be compared.
# The models are modified in place: afterwards `model_1` only holds the
# vocabulary reported as 'intersection' in `models_vocab`, which is why the
# original sizes were recorded in the previous cell.
# NOTE(review): which model's vectors get rotated is decided inside neocov — confirm there.
smart_procrustes_align_gensim(model_1, model_2)

67181 67181
67181 67181


<gensim.models.word2vec.Word2Vec at 0x1c5394850>

In [None]:
# Sanity check after alignment: both models must expose the same number of
# words and hold exactly one vector per word. Comparing all four sizes also
# catches a pair where only one side was trimmed.
assert (
    len(model_1.wv.key_to_index)
    == len(model_1.wv.vectors)
    == len(model_2.wv.key_to_index)
    == len(model_2.wv.vectors)
), 'aligned models must share one vocabulary size, with one vector per word'

In [None]:
# Vocabulary size of each subreddit model before alignment, plus the size of
# the vocabulary left in the aligned first model.
models_vocab = pd.DataFrame({
    'Model': [SUBRS[0], SUBRS[1], 'intersection'],
    'Words': [
        model_1_vocab,
        model_2_vocab,
        len(model_1.wv.key_to_index),
    ],
})

models_vocab

Unnamed: 0,Model,Words
0,Coronavirus,94816
1,conspiracy,112599
2,intersection,67181


In [None]:
models_vocab.to_csv(f'{OUT_DIR}models_subrs_vocab.csv', index=False)

### Measure distances

In [None]:
distances = measure_distances(model_1, model_2)

#### Words that differ the most between the two communities

In [None]:
freq_min = 100

# Keep rows whose `freq_1` and `freq_2` both exceed `freq_min`, then show the
# 20 with the largest semantic distance.
(
    distances
    .query('freq_1 > @freq_min and freq_2 > @freq_min')
    .sort_values(by='dist_sem', ascending=False)
    .head(20)
)


Unnamed: 0,lex,dist_sem,freq_1,freq_2
21288,ptb,1.14401,103,144
270,sticky,1.035551,76776,69776
18320,refraction,1.021251,142,196
9409,accumulative,1.011545,539,667
1262,pms,1.010546,11535,11405
3996,soliciting,0.989504,2233,2472
4719,resubmit,0.944866,1763,1928
895,ss,0.944818,18100,17373
16111,ets,0.933242,184,252
17845,waiters,0.929376,150,206


#### Nearest neighbours for target lexemes in the two communities

In [None]:
LEX = 'vaccine'

In [None]:
# Look up the neighbours of LEX in each community's model, one frame per
# model, and show the two frames one after the other.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX, 
    freq_min=100,
    model_1=model_1, 
    model_2=model_2,
    k=10
)

display(nbs_model_1, nbs_model_2)

Unnamed: 0,model,lex,similarity,freq
0,1,vaccines,0.754159,41005
1,1,vaccin,0.745905,108
2,1,vaccination,0.633033,7667
3,1,vaccinations,0.569226,3305
4,1,jab,0.53142,713
5,1,drug,0.519127,19090
6,1,novavax,0.515097,158
7,1,cure,0.507441,8142
8,1,vax,0.491517,2940
9,1,eua,0.490937,702


Unnamed: 0,model,lex,similarity,freq
21542,2,vaccines,0.770874,37084
21543,2,vaccination,0.723819,7780
21544,2,vaccinations,0.656477,3624
21545,2,vax,0.64908,3208
21546,2,vac,0.586291,206
21547,2,immunization,0.543347,701
21548,2,inoculation,0.538037,319
21549,2,jab,0.530465,850
21550,2,rubella,0.528231,333
21551,2,vaccinated,0.526829,11762


#### Biggest discrepancies in nearest neighbours for target lexemes

In [None]:
# Same lookup as above, but with a higher frequency threshold and a k that
# exceeds the shared vocabulary size (67181 words after alignment).
# NOTE(review): presumably this returns the similarity of LEX to every word
# that passes the frequency filter — confirm in `get_nearest_neighbours_models`.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX, 
    freq_min=150,
    model_1=model_1, 
    model_2=model_2,
    k=100_000
)

In [None]:
# Put both models' rows for the same lexeme side by side; every column other
# than `lex` gets the suffix of the model it came from.
nbs_diffs = nbs_model_1.merge(
    nbs_model_2,
    on='lex',
    suffixes=('_1', '_2'),
)

In [None]:
# Absolute gap between the two similarities, largest first. The index is
# renumbered before the length filter, so it keeps gaps for dropped rows.
nbs_diffs = (
    nbs_diffs
    .assign(sim_diff=lambda df: (df['similarity_1'] - df['similarity_2']).abs())
    .sort_values(by='sim_diff', ascending=False)
    .reset_index(drop=True)
    .query('lex.str.len() >= 4')
)

In [None]:
topn = 10

# Lexemes more similar to LEX in the first model than in the second,
# limited to the `topn` largest gaps.
subr_1_nbs = (
    nbs_diffs
    .loc[lambda df: df['similarity_1'] > df['similarity_2']]
    .nlargest(topn, 'sim_diff')
)

# The reverse case: lexemes more similar to LEX in the second model.
subr_2_nbs = (
    nbs_diffs
    .loc[lambda df: df['similarity_2'] > df['similarity_1']]
    .nlargest(topn, 'sim_diff')
)

display(subr_1_nbs, subr_2_nbs)

Unnamed: 0,model_1,lex,similarity_1,freq_1,model_2,similarity_2,freq_2,sim_diff
7,1,100m,0.328007,439,2,0.039867,557,0.28814
12,1,beta,0.352028,842,2,0.070071,999,0.281957
14,1,vladimir,0.166982,279,2,-0.108522,375,0.275504
20,1,sputnik,0.367858,279,2,0.113972,376,0.253886
21,1,vanilla,0.104741,168,2,-0.147005,230,0.251746
23,1,lamp,0.179237,224,2,-0.070934,305,0.250171
25,1,fades,0.220316,153,2,-0.027544,211,0.24786
26,1,paintings,0.071031,230,2,-0.176508,312,0.247539
29,1,oxford,0.35403,4128,2,0.114557,4378,0.239473
31,1,fade,0.202643,490,2,-0.034802,610,0.237445


Unnamed: 0,model_1,lex,similarity_1,freq_1,model_2,similarity_2,freq_2,sim_diff
2,1,neuralink,-0.00444,210,2,0.341604,285,0.346044
3,1,optional,-0.055179,731,2,0.262431,871,0.31761
5,1,mandated,-0.07473,2455,2,0.226452,2730,0.301182
6,1,dysphoria,-0.081338,210,2,0.207875,286,0.289212
8,1,coronavirus,0.004444,218095,2,0.289353,261193,0.284909
9,1,cv19,0.072748,539,2,0.357341,667,0.284593
10,1,untested,0.177692,796,2,0.462222,950,0.28453
11,1,locale,-0.117035,575,2,0.165845,704,0.28288
13,1,disrespecting,-0.17982,231,2,0.097078,314,0.276898
15,1,pediatric,0.032584,357,2,0.307942,467,0.275358
