# overview

In [None]:
#| hide
# Reload imported modules automatically before each cell runs, so edits to the
# project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#| all_notest

In [None]:
#| hide
from socsemvar.read_data import *
from socsemvar.preprocessing import *
from socsemvar.embeddings import *

import pandas as pd
from pathlib import Path
import altair as alt

# Semantic change detection (Table 2)

In [None]:
# Load the two yearly word2vec models that are compared in this section.
models = load_models(
    ['2019', '2020'],
    models_dir='../../models',
)
models

{'2019': <gensim.models.word2vec.Word2Vec>,
 '2020': <gensim.models.word2vec.Word2Vec>}

Vocabulary sizes for the two models before Procrustes alignment:

In [None]:
# One row per yearly model: its name and its vocabulary size, formatted with
# thousands separators.
pd.DataFrame(
    data=[
        [year, f"{len(models[year].wv.key_to_index):,}"]
        for year in ('2019', '2020')
    ],
    columns=['Model', 'VocabSize'],
)

Unnamed: 0,Model,VocabSize
0,2019,252564
1,2020,277707


In [None]:
# Procrustes-align the two yearly models. The call works in place on the objects
# stored in `models`: the vocabulary table in the following cell shows both models
# reduced to the same (shared) vocabulary size afterwards.
# NOTE(review): which model is rotated onto which depends on the helper's argument
# order — confirm against the definition of `smart_procrustes_align_gensim`.
smart_procrustes_align_gensim(models['2019'], models['2020'])

190756 190756
190756 190756


<gensim.models.word2vec.Word2Vec>

Intersecting vocabulary size after alignment:

In [None]:
# Same table as before the alignment: model name and vocabulary size (with
# thousands separators), now read from the aligned models.
pd.DataFrame(
    data=[
        [year, f"{len(models[year].wv.key_to_index):,}"]
        for year in ('2019', '2020')
    ],
    columns=['Model', 'VocabSize'],
)

Unnamed: 0,Model,VocabSize
0,2019,190756
1,2020,190756


We measure the semantic distance (~ cosine distance) between the 2019 and the 2020 models for all words contained in the aligned vocabulary.

In [None]:
# Per-word semantic distances between the aligned 2019 and 2020 models.
# NOTE(review): the exact measure lives in `measure_distances`; the accompanying
# text describes it as approximately a cosine distance.
distances = measure_distances(models['2019'], models['2020'])

<a id='semantic-neologisms'></a>

20 words that show the highest semantic distance between 2019 and 2020. This output is presented in Table 2 in the paper.

In [None]:
# Show the 20 candidates with the highest semantic distance (Table 2 in the paper,
# per the accompanying text).
get_change_candidates(20, distances)

Unnamed: 0,lex,dist_sem
0,lockdowns,1.02
1,maskless,1.0
2,sunsetting,1.0
3,childe,0.98
4,megalodon,0.98
5,newf,0.96
6,corona,0.93
7,filtrate,0.92
8,chaz,0.9
9,klee,0.89


Extended list for the Appendix (Table 3)

In [None]:
# Extended list of 50 candidates for the appendix.
# NOTE(review): `propNouns=False` presumably excludes proper nouns from the
# candidates — confirm against the definition of `get_change_candidates`.
get_change_candidates(50, distances, propNouns=False)

Unnamed: 0,lex,dist_sem
0,lockdowns,1.02
1,maskless,1.0
2,sunsetting,1.0
3,newf,0.96
4,corona,0.93
5,filtrate,0.92
6,chaz,0.9
7,rona,0.89
8,cerb,0.87
9,vacuo,0.86


# Covid-related communities (Figure 1)

In this section, we determine those communities which are most actively engaged in Covid-related discourse.

In [None]:
# Collect all Covid comment dumps. `Path.glob` yields matches in arbitrary,
# file-system-dependent order, so the paths are sorted to ensure the files are
# always handed to the reader in the same order on every machine and run.
comments_dir_path = Path('../../data/covid/')
comments_paths = sorted(comments_dir_path.glob('Covid*.csv'))
# Fail early with a clear message instead of an obscure error further down when
# the data directory is missing or empty.
assert comments_paths, f'no Covid*.csv files found in {comments_dir_path}'
comments = read_multi_comments_csvs(comments_paths)
comments

  comments = pd.read_csv(


Unnamed: 0,author,body,created_utc,id,subreddit
0,Gloob_Patrol,I assume you work too so he's feeling like he ...,2020-09-08 18:53:06,g4guhl5,LongDistance
1,amtrusc,"Strep swab and culture negative, I’m sure? Cou...",2020-09-08 18:53:08,g4guhsm,tonsilstones
2,Ephuntz,&gt;Good point. My apologies. It's just becomi...,2020-09-08 18:53:09,g4guhua,Winnipeg
3,cstransfer,Have you noticed an increase of people going e...,2020-09-08 18:53:09,g4guhu4,financialindependence
4,IlliniWhoDat,"I haven't. I have seen it online, but haven't...",2020-09-08 18:53:13,g4gui6o,KoreanBeauty
...,...,...,...,...,...
3800760,willw,Last group pre COVID!,2020-07-01 21:59:48,fwmqfbj,jawsurgery
3800761,Daikataro,"If everyone is infected with COVID, new cases ...",2020-07-01 21:59:49,fwmqff2,politics
3800762,StabYourBloodIntoMe,&gt; If the mortality rate is actually decreas...,2020-07-01 21:59:50,fwmqfib,dataisbeautiful
3800763,Shorse_rider,I was a freelancer until covid and earned more...,2020-07-01 21:59:55,fwmqfuw,AskWomen


In [None]:
# Number of comments per subreddit, most active communities first.
# `size()` counts the rows of each group, which is the same as counting the
# grouping column itself (it is never null inside its own group).
subreddit_counts = (
    comments
    .groupby('subreddit')
    .size()
    .to_frame('comments_num')
    .sort_values('comments_num', ascending=False)
)

<a id='covid-communities'></a>

Plot the top 15 communities that are most actively engaged in Covid-related discourse.

In [None]:
# Horizontal bar chart of the 15 subreddits with the most comments; bars are
# ordered by comment count (descending) via `sort='-x'`.
subreddits_chart = (
    alt.Chart(subreddit_counts.reset_index().head(15))
    .mark_bar()
    .encode(
        x=alt.X('comments_num:Q', title='Number of Covid-related comments'),
        y=alt.Y('subreddit:N', title='Community', sort='-x'),
    )
)

subreddits_chart

# Semantic axes (Figure 2)

In [None]:
# Load the models for the 'Coronavirus' and 'conspiracy' communities.
# This rebinds `models`, which held the yearly (2019/2020) models up to here.
models = load_models(['Coronavirus', 'conspiracy'], models_dir='../../models')

In [None]:
# Target words that are projected onto the semantic axes below.
lexs = [
    'corona',
    'rona',
    'moderna',
    'sars',
    'spreader',
    'maskless',
    'distancing',
    'quarantines',
    'pandemic',
    'science',
    'research',
    'masks',
    'lockdowns',
    'vaccines',
]

evaluative dimension: *good* vs *bad*

In [None]:
# Evaluative axis, spanned by the pole words 'good' and 'bad'.
pole_words = ['good', 'bad']

# Axis similarities for the target words in both community models, aggregated.
# NOTE(review): the meaning of `k=10` is defined by `get_axis_sims` — confirm there.
proj_sims = aggregate_proj_sims(
    get_axis_sims(lexs, models, pole_words, k=10)
)
# Long format for plotting: the per-model columns are melted into 'model'/'SemSim'.
proj_sims_melted = proj_sims.melt(
    id_vars=['lex', 'SimDiff'],
    var_name='model',
    value_name='SemSim',
)
sem_axis_evaluative_plot = plot_sem_axis(proj_sims_melted, pole_words)
sem_axis_evaluative_plot

moral-based dimension: *loyalty* vs *betrayal*

In [None]:
# Moral axis, spanned by the pole words 'loyalty' and 'betrayal'.
pole_words = ['loyalty', 'betrayal']

# Axis similarities for the target words in both community models, aggregated.
# NOTE(review): the meaning of `k=10` is defined by `get_axis_sims` — confirm there.
proj_sims = get_axis_sims(lexs, models, pole_words, k=10)
proj_sims = aggregate_proj_sims(proj_sims)
# Long format for plotting: the per-model columns are melted into 'model'/'SemSim'.
proj_sims_melted = proj_sims.melt(id_vars=['lex', 'SimDiff'], var_name='model', value_name='SemSim')
# Stored under its own name: the copy-pasted `sem_axis_evaluative_plot` would
# silently replace the good/bad chart from the previous cell with this one.
sem_axis_moral_plot = plot_sem_axis(proj_sims_melted, pole_words)
sem_axis_moral_plot

# Maps of socio-semantic variation (Figure 3)

Note that the plots in this notebook are not identical to the ones in the paper since the dimensionality reduction via t-SNE leads to differences in results between runs.

In [None]:
# Load the 'Coronavirus' and 'conspiracy' models again so that the alignment in
# the next cell starts from freshly loaded models.
models = load_models(['Coronavirus', 'conspiracy'], models_dir='../../models')

In [None]:
# Procrustes-align the two community models so that their vectors can be compared
# in one space.
# NOTE(review): the earlier yearly alignment shows this helper modifying the models
# it is given; which model is rotated onto which depends on its argument order —
# confirm against the definition of `smart_procrustes_align_gensim`.
smart_procrustes_align_gensim(models['Coronavirus'], models['conspiracy'])

67181 67181
67181 67181


<gensim.models.word2vec.Word2Vec>

In [None]:
# Stack the neighbourhood of 'vaccines' from every community model into one frame.
# NOTE(review): `k=750` is presumably the number of neighbours fetched per model —
# confirm against `get_nbs_vecs`.
nbs_vecs = pd.concat(
    [
        get_nbs_vecs('vaccines', model_name, model, k=750)
        for model_name, model in models.items()
    ]
)

## common neighbours

In [None]:
# NOTE(review): perplexity=0.1 is far below the 5-50 range usually recommended for
# t-SNE (the "differences" map further down uses 70) — confirm this is intentional.
nbs_vecs_2d = dim_red_nbs_vecs(nbs_vecs, perplexity=0.1)
# Keep the 10 most similar neighbours per community. Sorting by community and
# descending similarity and then taking the head of each group selects the same
# rows, in the same order, as `groupby('subreddit').apply(nlargest)`, but it keeps
# the `subreddit` column without relying on `apply` operating on the grouping
# columns, which pandas has deprecated and warns about.
nbs_sim = (
    nbs_vecs_2d
    .sort_values(['subreddit', 'sim'], ascending=[True, False])
    .groupby('subreddit')
    .head(10)
    .reset_index(drop=True)
)

  .apply(lambda df: df.nlargest(10, 'sim'))


In [None]:
# Text scatter of the selected neighbours in the 2-D t-SNE space; each word is
# drawn at its coordinates and coloured by the community it comes from.
map_sims_plot = (
    alt.Chart(nbs_sim)
    .mark_text()
    .encode(
        x=alt.X('x_tsne:Q'),
        y=alt.Y('y_tsne:Q'),
        text=alt.Text('lex'),
        color=alt.Color('subreddit:N'),
    )
)

map_sims_plot

## differences in neighbours

In [None]:
# Project into 2-D under a new name instead of overwriting `nbs_vecs`: rebinding the
# input made this cell non-idempotent (a second run would feed the already-reduced
# frame back into the projection) and changed what earlier cells see on a re-run.
nbs_vecs_2d_diff = dim_red_nbs_vecs(nbs_vecs, perplexity=70)
# Keep only words that occur in exactly one community's neighbourhood
# (`keep=False` drops every word that appears more than once), then take the 20
# most similar of them per community. Sort + `groupby.head` selects the same rows,
# in the same order, as `groupby('subreddit').apply(nlargest)` while keeping the
# `subreddit` column and avoiding pandas' deprecation warning for `apply`
# operating on the grouping columns.
nbs_diff = (
    nbs_vecs_2d_diff
    .drop_duplicates(subset='lex', keep=False)
    .sort_values(['subreddit', 'sim'], ascending=[True, False])
    .groupby('subreddit')
    .head(20)
    .reset_index(drop=True)
)

  .apply(lambda df: df.nlargest(20, 'sim'))


In [None]:
# Text scatter of the community-specific neighbours in the 2-D t-SNE space; each
# word is drawn at its coordinates and coloured by the community it comes from.
map_diffs_plot = (
    alt.Chart(nbs_diff)
    .mark_text()
    .encode(
        x=alt.X('x_tsne:Q'),
        y=alt.Y('y_tsne:Q'),
        text=alt.Text('lex:N'),
        color=alt.Color('subreddit:N'),
    )
)


map_diffs_plot