In [2]:
import logging
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
from numpy import isin

SEED = 42
np.random.seed(SEED)

news_marlesson = pd.read_csv('datasets/kaggle_marlesson_noticias_folha_sp.csv')

In [3]:
news_marlesson.drop(columns=['subcategory'], inplace=True)

In [4]:
news_marlesson.head()

Unnamed: 0,title,text,date,category,link
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,http://www1.folha.uol.com.br/mercado/2017/10/1...


In [5]:
print('sem drop de NaN: ', news_marlesson.shape)
news_marlesson.dropna(subset=['text'],inplace=True)
print('com drop de NaN: ',news_marlesson.shape )


sem drop de NaN:  (167053, 5)
com drop de NaN:  (166288, 5)


In [6]:
news_marlesson.category.value_counts()

poder                           22022
colunas                         21619
mercado                         20970
esporte                         19730
mundo                           17130
cotidiano                       16967
ilustrada                       15617
opiniao                          4525
paineldoleitor                   4011
saopaulo                         3955
tec                              2260
tv                               2123
educacao                         2118
turismo                          1903
ilustrissima                     1409
ciencia                          1335
equilibrioesaude                 1312
sobretudo                        1057
bbc                               980
folhinha                          875
empreendedorsocial                841
comida                            828
asmais                            547
ambiente                          491
seminariosfolha                   379
serafina                          331
o-melhor-de-

In [7]:
s = news_marlesson['category'].value_counts()
news_marlesson = news_marlesson[news_marlesson.isin(s.index[s >= 1000]).values]

In [8]:
news_marlesson.head()

Unnamed: 0,title,text,date,category,link
0,"Lula diz que está 'lascado', mas que ainda tem...",Com a possibilidade de uma condenação impedir ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...
1,"'Decidi ser escrava das mulheres que sofrem', ...","Para Oumou Sangaré, cantora e ativista malines...",2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...
2,Três reportagens da Folha ganham Prêmio Petrob...,Três reportagens da Folha foram vencedoras do ...,2017-09-10,poder,http://www1.folha.uol.com.br/poder/2017/10/192...
3,Filme 'Star Wars: Os Últimos Jedi' ganha trail...,A Disney divulgou na noite desta segunda-feira...,2017-09-10,ilustrada,http://www1.folha.uol.com.br/ilustrada/2017/10...
4,CBSS inicia acordos com fintechs e quer 30% do...,"O CBSS, banco da holding Elopar dos sócios Bra...",2017-09-10,mercado,http://www1.folha.uol.com.br/mercado/2017/10/1...


$ si = [fmax + (ti - tmin)] $

In [9]:
def clean_text(doc):
    result = ""

    for token in doc:
        is_valid = not token.is_stop and token.is_alpha

        if is_valid:
            result += f"{token} "

    return result.rstrip()

In [10]:
news_marlesson['tamanho_noticia'] = news_marlesson.text.str.len()

In [11]:
print(news_marlesson['tamanho_noticia'].mean(), news_marlesson['tamanho_noticia'].max());

2718.365037516478 61154


In [12]:
news_marlesson.sort_values(by=['category', 'tamanho_noticia'], ascending=True, inplace=True);

In [13]:
news_splited = np.array_split(news_marlesson, 100)

In [14]:
for news in news_splited:
    news_to_clean = (news.lower() for news in news["text"])

In [15]:
nlp = spacy.load("pt_core_news_sm")

In [16]:
news_cleaned = [clean_text(doc) for doc in nlp.pipe(news_to_clean,  batch_size = 10000, n_process = -1)]

In [17]:
news_to_list = pd.DataFrame({'news': news_cleaned})
news_to_list.drop_duplicates(inplace=True)

In [18]:
news_list = [news.split(' ') for news in news_to_list.news]

In [19]:
def train_model(sg, window, vector_size, path):
    logging.basicConfig(format='%(asctime)s : - %(message)s', level=logging.INFO)

    model = Word2Vec(sg = sg, window = window, min_count = 5, vector_size = vector_size, alpha = 0.03, min_alpha=0.007)
    model.build_vocab(news_list, progress_per=5000)
    model.train(news_list, total_examples = model.corpus_count, epochs = 1000)
    model.wv.save_word2vec_format(path, binary=False)

# Treinando o modelo

# SG

## Window 5

In [20]:
train_model(sg=1, window=5, vector_size=300, path='models/marlesson/sg/5/300.txt')

2022-11-17 22:46:37,362 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-17T22:46:37.362532', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-17 22:46:37,364 : - collecting all words and their counts
2022-11-17 22:46:37,365 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-17 22:46:37,388 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-17 22:46:37,389 : - Creating a fresh vocabulary
2022-11-17 22:46:37,417 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-17T22:46:37.417405', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [21]:
train_model(sg=1, window=5, vector_size=600, path='models/marlesson/sg/5/600.txt')

2022-11-17 22:52:42,907 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-17T22:52:42.906949', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-17 22:52:42,909 : - collecting all words and their counts
2022-11-17 22:52:42,911 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-17 22:52:42,950 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-17 22:52:42,958 : - Creating a fresh vocabulary
2022-11-17 22:52:42,989 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-17T22:52:42.989814', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [22]:
train_model(sg=1, window=5, vector_size=1000, path='models/marlesson/sg/5/1000.txt')

2022-11-17 23:05:09,588 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-17T23:05:09.588587', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-17 23:05:09,591 : - collecting all words and their counts
2022-11-17 23:05:09,593 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-17 23:05:09,626 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-17 23:05:09,627 : - Creating a fresh vocabulary
2022-11-17 23:05:09,666 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-17T23:05:09.666256', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand

## Window 50

In [23]:
train_model(sg=1, window=50, vector_size=300, path='models/marlesson/sg/50/300.txt')

2022-11-17 23:26:10,253 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-17T23:26:10.253005', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-17 23:26:10,255 : - collecting all words and their counts
2022-11-17 23:26:10,257 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-17 23:26:10,295 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-17 23:26:10,296 : - Creating a fresh vocabulary
2022-11-17 23:26:10,327 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-17T23:26:10.327763', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [24]:
train_model(sg=1, window=50, vector_size=600, path='models/marlesson/sg/50/600.txt')

2022-11-18 00:04:22,030 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-18T00:04:22.030169', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 00:04:22,032 : - collecting all words and their counts
2022-11-18 00:04:22,033 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 00:04:22,056 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 00:04:22,057 : - Creating a fresh vocabulary
2022-11-18 00:04:22,085 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T00:04:22.085641', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [25]:
train_model(sg=1, window=50, vector_size=1000, path='models/marlesson/sg/50/1000.txt')

2022-11-18 01:06:13,855 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-18T01:06:13.855442', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 01:06:13,857 : - collecting all words and their counts
2022-11-18 01:06:13,858 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 01:06:13,884 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 01:06:13,885 : - Creating a fresh vocabulary
2022-11-18 01:06:13,909 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T01:06:13.909839', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand

## Window 100

In [26]:
train_model(sg=1, window=100, vector_size=300, path='models/marlesson/sg/100/300.txt')

2022-11-18 02:55:20,831 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-18T02:55:20.831772', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 02:55:20,834 : - collecting all words and their counts
2022-11-18 02:55:20,835 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 02:55:20,861 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 02:55:20,862 : - Creating a fresh vocabulary
2022-11-18 02:55:20,880 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T02:55:20.880290', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [27]:
train_model(sg=1, window=100, vector_size=600, path='models/marlesson/sg/100/600.txt')

2022-11-18 03:39:15,270 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-18T03:39:15.270454', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 03:39:15,272 : - collecting all words and their counts
2022-11-18 03:39:15,272 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 03:39:15,292 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 03:39:15,293 : - Creating a fresh vocabulary
2022-11-18 03:39:15,313 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T03:39:15.313333', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [28]:
train_model(sg=1, window=100, vector_size=1000, path='models/marlesson/sg/100/1000.txt')

2022-11-18 05:05:24,892 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-18T05:05:24.892694', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 05:05:24,894 : - collecting all words and their counts
2022-11-18 05:05:24,895 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 05:05:24,916 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 05:05:24,918 : - Creating a fresh vocabulary
2022-11-18 05:05:24,945 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T05:05:24.945294', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand

# CBOW


## Window 5

In [29]:
train_model(sg=0, window=5, vector_size=300, path='models/marlesson/cbow/5/300.txt')

2022-11-18 06:46:16,567 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-18T06:46:16.567137', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 06:46:16,569 : - collecting all words and their counts
2022-11-18 06:46:16,571 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 06:46:16,587 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 06:46:16,588 : - Creating a fresh vocabulary
2022-11-18 06:46:16,615 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T06:46:16.615200', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [38]:
train_model(sg=0, window=5, vector_size=600, path='models/marlesson/cbow/5/600.txt')

2022-11-18 10:04:17,905 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-18T10:04:17.905450', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 10:04:17,907 : - collecting all words and their counts
2022-11-18 10:04:17,908 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 10:04:17,926 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 10:04:17,927 : - Creating a fresh vocabulary
2022-11-18 10:04:17,944 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T10:04:17.944875', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [39]:
train_model(sg=0, window=5, vector_size=1000, path='models/marlesson/cbow/5/1000.txt')

2022-11-18 10:06:02,777 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-18T10:06:02.777735', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 10:06:02,779 : - collecting all words and their counts
2022-11-18 10:06:02,779 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 10:06:02,799 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 10:06:02,800 : - Creating a fresh vocabulary
2022-11-18 10:06:02,819 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T10:06:02.819775', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand

## Window 50

In [32]:
train_model(sg=0, window=50, vector_size=300, path='models/marlesson/cbow/50/300.txt')

2022-11-18 06:53:10,656 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-18T06:53:10.656199', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 06:53:10,658 : - collecting all words and their counts
2022-11-18 06:53:10,659 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 06:53:10,677 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 06:53:10,678 : - Creating a fresh vocabulary
2022-11-18 06:53:10,701 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T06:53:10.701565', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [33]:
train_model(sg=0, window=50, vector_size=600, path='models/marlesson/cbow/50/600.txt')

2022-11-18 06:56:09,062 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-18T06:56:09.062177', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 06:56:09,063 : - collecting all words and their counts
2022-11-18 06:56:09,065 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 06:56:09,079 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 06:56:09,079 : - Creating a fresh vocabulary
2022-11-18 06:56:09,096 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T06:56:09.096836', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [34]:
train_model(sg=0, window=50, vector_size=1000, path='models/marlesson/cbow/50/1000.txt')

2022-11-18 07:00:10,778 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-18T07:00:10.778658', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 07:00:10,780 : - collecting all words and their counts
2022-11-18 07:00:10,781 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 07:00:10,800 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 07:00:10,801 : - Creating a fresh vocabulary
2022-11-18 07:00:10,830 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T07:00:10.830670', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand

## Window 100

In [35]:
train_model(sg=0, window=100, vector_size=300, path='models/marlesson/cbow/100/300.txt')

2022-11-18 07:06:09,437 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-18T07:06:09.437544', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 07:06:09,438 : - collecting all words and their counts
2022-11-18 07:06:09,439 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 07:06:09,461 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 07:06:09,462 : - Creating a fresh vocabulary
2022-11-18 07:06:09,494 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T07:06:09.494564', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [36]:
train_model(sg=0, window=100, vector_size=600, path='models/marlesson/cbow/100/600.txt')

2022-11-18 07:09:11,705 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=600, alpha=0.03>', 'datetime': '2022-11-18T07:09:11.705383', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 07:09:11,707 : - collecting all words and their counts
2022-11-18 07:09:11,707 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 07:09:11,722 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 07:09:11,722 : - Creating a fresh vocabulary
2022-11-18 07:09:11,741 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T07:09:11.741546', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standa

In [37]:
train_model(sg=0, window=100, vector_size=1000, path='models/marlesson/cbow/100/1000.txt')

2022-11-18 07:14:10,959 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=1000, alpha=0.03>', 'datetime': '2022-11-18T07:14:10.959301', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'created'}
2022-11-18 07:14:10,960 : - collecting all words and their counts
2022-11-18 07:14:10,961 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-18 07:14:10,982 : - collected 20754 word types from a corpus of 107972 raw words and 1589 sentences
2022-11-18 07:14:10,982 : - Creating a fresh vocabulary
2022-11-18 07:14:11,003 : - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4486 unique words (21.62% of original 20754, drops 16268)', 'datetime': '2022-11-18T07:14:11.003807', 'gensim': '4.2.0', 'python': '3.7.13 (default, Oct 18 2022, 18:57:03) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.74.2-microsoft-stand