In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# all_data

In [None]:
from neocov.read_data import *
from neocov.preproc import *

In [None]:
import ast

import pandas as pd

# NeoCov

> Semantic change and social semantic variation of Covid-related English neologisms on Reddit.

## Read data

### Get file paths

In [None]:
# Directory holding the comment files; handed to `get_comments_paths_year` below.
COMMENTS_DIR = '../data/comments/by_date/'

In [None]:
# Year to process: selects the comment files and names the saved docs and model files.
YEAR = '2019'

In [None]:
# Paths of the comment files for YEAR under COMMENTS_DIR, as resolved by the
# project helper (exact matching rules live in the helper — not visible here).
comment_paths_year = get_comments_paths_year(COMMENTS_DIR, YEAR)

### Read comments

In [None]:
%%time
# Read the comment CSVs listed in `comment_paths_year` into `comments`
# (about 9.6M rows for 2019; roughly 3.5 min wall time in the recorded run).
comments = read_comm_csvs(comment_paths_year)

CPU times: user 54.9 s, sys: 35.7 s, total: 1min 30s
Wall time: 3min 38s


In [None]:
# Display the loaded comments (pandas truncates the middle rows).
comments

Unnamed: 0,author,body,created_utc,id,subreddit
0,Avinse,Username Checks Out,2019-05-07 21:11:36,emrv0h9,AskReddit
1,KeepingDankMemesDank,"If this is a dank meme, **Upvote** this commen...",2019-05-07 21:11:37,emrv0jp,dankmemes
2,UhPhrasing,Just threaten them that you'll call the corpor...,2019-05-07 21:11:37,emrv0jq,golf
3,[deleted],[removed],2019-05-07 21:11:37,emrv0jr,Barca
4,EnergetikNA,"honestly, do you really wanna go through an en...",2019-05-07 21:11:37,emrv0js,soccer
...,...,...,...,...,...
9599974,DogBeersHadOne,Guy who made the crossbuck had one job. One go...,2019-06-19 21:59:59,erl9mvx,trains
9599975,VenomousCoffee,Page number? Picture of the page?,2019-06-19 21:59:59,erl9mvw,marvelstudios
9599976,Homerundude698,So sexy baby,2019-06-19 21:59:59,erl9mvv,gonewild30plus
9599977,CircusRama,Removed for Rule 8,2019-06-19 21:59:59,erl9mwa,fivenightsatfreddys


In [None]:
# Number of comments per subreddit, most active first.
comments.value_counts(subset='subreddit')

subreddit
AskReddit             429516
politics              146023
memes                  99027
teenagers              89685
dankmemes              84107
                       ...  
no_u                       1
CuteBobby                  1
no_drama                   1
WorldBoxGodSandbox         1
FatFurryPorn               1
Length: 66885, dtype: int64

## Pre-process comments

### Run preprocessing

In [None]:
%%time
# Run the project's cleaning helper over the raw comment bodies.
# Slow step: about 36 min wall time in the recorded run.
# NOTE(review): presumably returns one token list per comment (the reloaded docs
# further down are token lists) — confirm against the helper's definition.
docs_clean = clean_docs(comments['body'])

CPU times: user 2min 12s, sys: 6min 28s, total: 8min 41s
Wall time: 36min 17s


In [None]:
# Small sample (first 100 rows) for quick experiments with the preprocessing helpers.
comments_sm = comments.head(100)

In [None]:
# Try the lower-casing helper on the sample's comment bodies.
comments_sm['body'].apply(conv_to_lowerc)

0                                   username checks out
1     if this is a dank meme, **upvote** this commen...
2     just threaten them that you'll call the corpor...
3                                             [removed]
4     honestly, do you really wanna go through an en...
                            ...                        
95    thank you! \n\ni had someone ask me in person ...
96    people always imagine robots taking over the h...
97                 sexy before and after!   good job...
98                                jk, i only want frank
99    not sure if this belongs here but here we go.\...
Name: body, Length: 100, dtype: object

### Save to disk

#### `csv`

In [None]:
%%time
# Write the cleaned docs to a per-year CSV; `index=False` leaves the index out of the file.
docs_clean.to_csv(f'../data/docs_clean/{YEAR}.csv', index=False)

CPU times: user 51.3 s, sys: 36.7 s, total: 1min 28s
Wall time: 3min 55s


#### `feather`

In [None]:
# `to_feather` is a DataFrame method, so wrap the Series in a one-column frame first.
docs_clean_fr = docs_clean.to_frame()

In [None]:
# Sanity check: a single row of the frame comes back as a Series.
type(docs_clean_fr.iloc[0])

pandas.core.series.Series

In [None]:
# `to_feather` exists only on DataFrame (calling it on the Series raised
# AttributeError), so write the one-column frame `docs_clean_fr` built above.
# Feather expects a default RangeIndex, hence the index reset before writing.
docs_clean_fr.reset_index(drop=True).to_feather(f'../data/docs_clean/{YEAR}.feather')

In [None]:
# Read the cleaned docs back from the per-year feather file in the project data
# directory, instead of a machine-specific file under `~/Desktop`.
docs_clean_feath = pd.read_feather(f'../data/docs_clean/{YEAR}.feather')

In [None]:
# NOTE(review): this re-displays the raw `comments` frame; `docs_clean_feath` from
# the previous cell may have been the intended object — confirm.
comments

### Load from disk

In [None]:
%%time
docs_clean = pd.read_csv(f'../data/docs_clean/{YEAR}.csv', index_col=0, header=None)

CPU times: user 21 s, sys: 1.52 s, total: 22.5 s
Wall time: 22.9 s


In [None]:
%%time
docs_clean = pd.read_csv(f'../data/docs_clean/{YEAR}.csv', converters={'body': pd.eval})

KeyboardInterrupt: 

In [None]:
# Inspect what the CSV load produced.
docs_clean

Series([], Name: ['if', 'this', 'is', 'a', 'dank', 'meme', 'upvote', 'this', 'comment', 'if', 'this', 'is', 'not', 'a', 'dank', 'meme', 'downvote', 'this', 'comment', 'if', 'this', 'post', 'breaks', 'the', 'rules', 'report', 'it', 'and', 'downvote', 'this', 'comment', 'thank', 'you', 'for', 'helping', 'us', 'in', 'keeping', 'r', 'dankmemes', 'dank', 'hit', 'us', 'up', 'https', 'www', 'reddit', 'com', 'message', 'compose', 'to', 'r', 'dankmemes', 'if', 'you', 'have', 'any', 'questions', 'i', 'm', 'a', 'bot'], dtype: float64)

#### From `parquet`

In [None]:
%%time
# NOTE(review): this reads from a home-directory path in another project tree
# (`~/promo/socemb/...`) rather than the `../data/docs_clean/` directory used by
# the other cells, and no cell in this notebook writes a parquet file — confirm
# the intended source before relying on a fresh run.
docs_clean = pd.read_parquet(f'~/promo/socemb/data/docs_clean/{YEAR}.parquet')

CPU times: user 18.8 s, sys: 6.62 s, total: 25.5 s
Wall time: 55.2 s


In [None]:
%%time
docs_clean['body'] = docs_clean['body'].apply(lambda x: x.tolist())

In [None]:
# Keep only the `body` column, so `docs_clean` is a Series from here on.
docs_clean = docs_clean['body']

In [None]:
# Inspect the `docs_clean` Series that feeds the corpus below.
docs_clean

## Train models

### Create corpus

In [None]:
# Wrap the cleaned documents in the project's `Corpus` object, which is what
# `train_emb` consumes below.
corpus = Corpus(docs_clean)

### Train model

In [None]:
from gensim.models import Word2Vec

In [None]:
%%time
# Train the embedding model on the corpus (about 24 min wall time in the recorded run).
# The returned object is used below through `.wv` and `.save`.
model = train_emb(corpus)

CPU times: user 54min 54s, sys: 3min 16s, total: 58min 11s
Wall time: 24min 2s


In [None]:
# Vocabulary size: number of keys in the trained model's word vectors.
len(model.wv.key_to_index)

244740

In [None]:
# NOTE(review): repeats the previous cell's vocabulary-size check verbatim.
len(model.wv.key_to_index)

244740

### Save model

In [None]:
# Persist the trained model under a per-year file name.
model.save(f'../out/models/{YEAR}.model')

### Load models

In [None]:
# Load through the imported `Word2Vec` class (the bare `gensim` name is not
# imported explicitly in this notebook) and from the same `../out/models/`
# directory the model is saved to.
model_2019 = Word2Vec.load('../out/models/2019.model')

In [None]:
# Load through the imported `Word2Vec` class (the bare `gensim` name is not
# imported explicitly in this notebook) and from the same `../out/models/`
# directory the models are saved to.
model_2020 = Word2Vec.load('../out/models/2020.model')