In [None]:
# Auto-reload imported modules so that live edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *
from neocov.type_emb import *
from neocov.communities import *

In [None]:
from gensim.models import Word2Vec
import pandas as pd

In [None]:
# Root folder for input data, relative to the notebook's working directory.
DATA_DIR = '../data/'
# Folder handed to `get_comments_paths_year` together with a year.
COMMENTS_DIAC_DIR = f'{DATA_DIR}comments/by_date/'
# Root folder for everything this notebook writes (models, CSV tables, plots).
OUT_DIR = '../out/'

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Variables

## Semantic change

### Read data

#### Get file paths

In [None]:
# Year whose comments are processed below; also used as the file stem of the
# saved model.
YEAR = '2019'

In [None]:
# Ask the project helper for the comment file paths belonging to YEAR.
comment_paths_year = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

#### Read comments

In [None]:
%%time
# Load the files found above into `comments`; its `body` column is what the
# corpus is built from further down.
comments = read_comm_csvs(comment_paths_year)

In [None]:
# Show the loaded comments for a quick visual check.
comments

### Pre-process comments

In [None]:
%%time
# NOTE(review): this rebinds `comments` to the cleaned result, so re-running
# the cell feeds already-cleaned data back into `clean_comments` — confirm the
# helper is safe to apply twice.
comments = clean_comments(comments)

### Train models

#### Create corpus

In [None]:
# Wrap the `body` column of the cleaned comments as the training corpus.
corpus = Corpus(comments['body'])

#### Train model

In [None]:
%%time
# Train on the corpus. The returned object exposes `.wv` and `.save`, and the
# saved file is later reloaded with `Word2Vec.load`.
model = train_model(corpus)

In [None]:
# Number of entries in the trained model's vocabulary index.
len(model.wv.key_to_index)

#### Save model

In [None]:
# Persist the model under OUT_DIR/models/, named after YEAR.
model.save(f'{OUT_DIR}models/{YEAR}.model')

#### Load models

In [None]:
# Reload the saved 2019 model from disk.
model_2019 = Word2Vec.load(f'{OUT_DIR}models/2019.model')

In [None]:
# NOTE(review): this notebook only trains and saves the model for YEAR as set
# above, so 2020.model must already exist on disk — presumably from an earlier
# run with YEAR = '2020'; confirm.
model_2020 = Word2Vec.load(f'{OUT_DIR}models/2020.model')

### Align models

In [None]:
# Record each model's vocabulary size before alignment; both values go into
# the summary table built after the alignment step.
model_2019_vocab = len(model_2019.wv.key_to_index)
model_2020_vocab = len(model_2020.wv.key_to_index)

In [None]:
# Align the two models. The return value is discarded and the following cells
# read `model_2019` / `model_2020` again, so the alignment is presumably
# applied in place — TODO confirm.
smart_procrustes_align_gensim(model_2019, model_2020)

In [None]:
# Sanity check after alignment: the 2019 vocabulary has exactly as many keys
# as the 2020 model has vector rows.
assert len(model_2019.wv.key_to_index) == len(model_2020.wv.vectors)

In [None]:
# Summary table of vocabulary sizes: each model before alignment, plus the
# size of the 2019 vocabulary after alignment (labelled 'intersection').
models_vocab = pd.DataFrame({
    'Model': ['2019', '2020', 'intersection'],
    'Words': [
        model_2019_vocab,
        model_2020_vocab,
        len(model_2019.wv.key_to_index),
    ],
})

models_vocab

In [None]:
# Write the vocabulary-size table to OUT_DIR, omitting the index column.
models_vocab.to_csv(f'{OUT_DIR}models_vocab.csv', index=False)

### Measure distances

In [None]:
# Compare the two aligned models. The cells below rely on the result having
# the columns `lex`, `freq_1`, `freq_2` and `dist_sem`.
distances = measure_distances(model_2019, model_2020)

In [None]:
# Rank every lexeme by semantic distance, largest first.
distances.sort_values(by='dist_sem', ascending=False)


TODO: filter by true type frequency. The type frequencies reported by `Gensim` seem incorrect; they probably reflect frequency ranks rather than total counts.

In [None]:
def get_sem_change_cands(distances, k=10, freq_min=1):
    """Return the `k` rows of `distances` with the largest `dist_sem`.

    A row is kept only if
    - both `freq_1` and `freq_2` are strictly greater than `freq_min`,
    - `lex` consists of alphabetic characters only, and
    - `lex` is longer than three characters.

    The result is ordered by `dist_sem` (descending) and gets a fresh
    0-based index.
    """
    # Apply the filters one after another, each on the previous step's output.
    frequent = distances.loc[
        lambda df: (df['freq_1'] > freq_min) & (df['freq_2'] > freq_min)
    ]
    # `== True` turns missing values from `.str.isalpha()` into False.
    alphabetic = frequent.loc[lambda df: df['lex'].str.isalpha() == True]
    long_enough = alphabetic.loc[lambda df: df['lex'].str.len() > 3]
    return long_enough.nlargest(k, 'dist_sem').reset_index(drop=True)

In [None]:
# Selection parameters: how many candidates to keep, and the frequency a
# lexeme must exceed in both models to be considered.
k = 20
freq_min = 1_000

# Call the `get_sem_change_cands` helper defined in the previous cell instead
# of repeating its filter chain here, so the selection logic lives in one place.
sem_change_cands = get_sem_change_cands(distances, k=k, freq_min=freq_min)

sem_change_cands

In [None]:
# Build the export table: a 1-based rank column (blank header), the lexeme and
# the semantic distance rounded and formatted to two decimals.
# NOTE(review): `sem_change_cands` was already cut to `k` rows when it was
# created, so `nlargest(100, ...)` only has an effect if k > 100 — confirm
# whether the export was meant to hold the top 100 candidates.
sem_change_cands_out = (
    sem_change_cands
    .nlargest(100, 'dist_sem')
    .assign(
        index_1=lambda df: df.index + 1,
        dist_sem=lambda df: df['dist_sem'].round(2).apply('{:.2f}'.format),
    )
    .rename(columns={'index_1': '', 'lex': 'Lexeme', 'dist_sem': 'SemDist'})
)

sem_change_cands_out.head(20)

In [None]:
# Export only the rank (blank header), lexeme and formatted distance columns;
# the DataFrame index is not written.
sem_change_cands_out.to_csv(
        f'{OUT_DIR}sem_change_cands.csv',
        columns=['', 'Lexeme', 'SemDist'],
        index=False
    )

### Inspect nearest neighbours of lexemes

In [None]:
# Lexeme whose nearest neighbours are compared across the two models; also
# used in the names of the neighbour CSV files.
LEX_NBS = 'distancing'

In [None]:
# Fetch one neighbour table per model for LEX_NBS (freq_min=50 is forwarded
# to the helper) and show them one after the other.
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    lex=LEX_NBS, freq_min=50, model_1=model_2019, model_2=model_2020
)

display(nbs_model_1, nbs_model_2)

In [None]:
# Save one neighbour table per model under OUT_DIR/neighbours/, named after
# the lexeme and the model year.
nbs_model_1.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2019.csv')
nbs_model_2.to_csv(f'{OUT_DIR}neighbours/{LEX_NBS}_2020.csv')

## Social semantic variation

### Inspect subreddits

#### Read comments

In [None]:
# Keep YEAR a string, matching its earlier definition in this notebook, so
# `get_comments_paths_year` receives the same type in both sections.
YEAR = '2019'

In [None]:
# Ask the project helper for the comment file paths belonging to YEAR.
comments_paths = get_comments_paths_year(COMMENTS_DIAC_DIR, YEAR)

In [None]:
%%time
# Reload the comments for YEAR. This rebinds `comments`, replacing the cleaned
# frame produced in the semantic-change section.
comments = read_comm_csvs(comments_paths)
comments

CPU times: user 48.6 s, sys: 6.54 s, total: 55.1 s
Wall time: 56 s


Unnamed: 0,author,body,created_utc,id,subreddit
0,Avinse,Username Checks Out,2019-05-07 21:11:36,emrv0h9,AskReddit
1,KeepingDankMemesDank,"If this is a dank meme, **Upvote** this commen...",2019-05-07 21:11:37,emrv0jp,dankmemes
2,UhPhrasing,Just threaten them that you'll call the corpor...,2019-05-07 21:11:37,emrv0jq,golf
3,[deleted],[removed],2019-05-07 21:11:37,emrv0jr,Barca
4,EnergetikNA,"honestly, do you really wanna go through an en...",2019-05-07 21:11:37,emrv0js,soccer
...,...,...,...,...,...
9599974,DogBeersHadOne,Guy who made the crossbuck had one job. One go...,2019-06-19 21:59:59,erl9mvx,trains
9599975,VenomousCoffee,Page number? Picture of the page?,2019-06-19 21:59:59,erl9mvw,marvelstudios
9599976,Homerundude698,So sexy baby,2019-06-19 21:59:59,erl9mvv,gonewild30plus
9599977,CircusRama,Removed for Rule 8,2019-06-19 21:59:59,erl9mwa,fivenightsatfreddys


TODO: filter comments

- [ ] remove duplicates
- [ ] remove bots

#### Get subreddit counts

In [None]:
# Derive `subr_counts` from the comments via the project helper; it is the
# input to the plot in the next cell.
subr_counts = get_subr_counts(comments)

In [None]:
# Plot the subreddit counts; k=20 is forwarded to the helper (presumably the
# number of subreddits shown — TODO confirm).
subr_counts_plt = plot_subr_counts(subr_counts, k=20)
subr_counts_plt

In [None]:
# Save the chart as a PNG in OUT_DIR; scale_factor=2.0 is passed to `.save`.
subr_counts_plt.save(f'{OUT_DIR}subr_counts.png', scale_factor=2.0)

In [None]:
# Spot-check ten AskReddit comments. The fixed random_state makes the sample
# identical on every run, so the displayed rows are reproducible.
(
    comments
    .query('subreddit == "AskReddit"')
    .sample(10, random_state=42)
)

Unnamed: 0,author,body,created_utc,id,subreddit
2720703,Blueowl789,oh yeah? quote it,2019-12-07 22:45:53,fa0ql3q,AskReddit
6612440,ghostoflops,"Lurking for jerking off, participating for con...",2019-09-01 21:07:19,eyrlxlg,AskReddit
8541016,WhenAllElseFail,"so i put my hands up, they're playing my song,...",2019-08-14 21:46:21,eww2p0q,AskReddit
6482914,Tyr_ranical,"Wait is this reply about stuff here, or a thin...",2019-03-19 22:29:49,eiwy4fz,AskReddit
599903,JazzUnlikeTheCaroot,You as a director 😀,2019-07-14 21:59:58,etsbx0o,AskReddit
6102560,_luckybandit_,DashieXP or Dashie games He stopped uploading...,2019-05-01 21:34:58,em9fj76,AskReddit
7595345,neovangelis,"Periscope for ""wizards""",2019-08-19 21:58:51,exfzg04,AskReddit
2155937,Byrinthion,"I’m not a developer haha, I’m a writer. So I d...",2019-03-14 22:47:57,eijpp3m,AskReddit
2698189,AutoModerator,**PLEASE READ THIS MESSAGE IN ITS ENTIRETY BEF...,2019-12-07 22:41:05,fa0q2w2,AskReddit
7483454,neovangelis,The Native Americans who came before them woul...,2019-08-19 21:30:52,exfwxci,AskReddit
