In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import gensim
import os

In [1]:
def tokenize_doc(doc):
    sents = sent_tokenize(doc)
    processed_sents = [word_tokenize(sent) for sent in sents]
    return processed_sents

## Treinando o primeiro modelo - Guimarães

In [3]:
BASE_DIR = 'guimaraes'
guimaraes_texts = [os.path.join(BASE_DIR, file) for file in os.listdir(BASE_DIR)]

In [4]:
guimaraes_texts

['guimaraes/Primeiras Estórias (completo).txt',
 'guimaraes/Noites do Sertão - João Guimarães Rosa.txt',
 'guimaraes/Ave, Palavra - João Guimarães Rosa.txt',
 'guimaraes/Manuelzão e Miguilim - João Guimarães Rosa.txt',
 'guimaraes/No Urubuquaquá, No Pinhém - João Guimarães Rosa.txt',
 'guimaraes/1 Guimarães Rosa - Sagarana.txt',
 'guimaraes/Tutameia - João Guimarães Rosa.txt',
 'guimaraes/Estas Estórias - Joao Guimaraes Rosa.txt']

In [14]:
corpus = []
for text in guimaraes_texts:
    with open(text, "r") as f:
        t = f.read()
        corpus.append(t.replace("\n", ""))

In [15]:
processed_docs = []
for doc in corpus:
    processed_sents = tokenize_doc(doc)
    processed_docs = processed_docs + processed_sents

In [16]:
bigram = Phraser(Phrases(processed_docs, min_count=2, threshold=30))
gensim_corpus = bigram[processed_docs]

In [17]:
w2v = Word2Vec(min_count=4, size=100, window=4) 
w2v.build_vocab(gensim_corpus)

In [18]:
w2v.train(gensim_corpus, total_examples=w2v.corpus_count, epochs=30)


(12347534, 22551960)

In [19]:
w2v.wv.most_similar(positive=["mulher"])


[('irmã', 0.6933686137199402),
 ('filha', 0.676548182964325),
 ('família', 0.6579251885414124),
 ('noiva', 0.6362385749816895),
 ('moça', 0.6296038627624512),
 ('mãe', 0.6248952746391296),
 ('rapariga', 0.622147798538208),
 ('Manuela', 0.6203331351280212),
 ('menina', 0.603524923324585),
 ('senhora', 0.6034640669822693)]

In [20]:
w2v.wv.most_similar(positive=["homem"])

[('rapaz', 0.5376620292663574),
 ('sujeito', 0.5371572375297546),
 ('filho', 0.5266872644424438),
 ('morto', 0.5226889848709106),
 ('pobre', 0.5208336710929871),
 ('rico', 0.5175313353538513),
 ('vaqueiro', 0.4968743622303009),
 ('companheiro', 0.4865562915802002),
 ('pressentimento', 0.47348666191101074),
 ('rapazinho', 0.4694969654083252)]

In [31]:
w2v.wvsave_word2vec_format("word2vec_guimaraes.txt")

## Fine tuning Clarice Lispector

In [25]:
BASE_DIR_FINE_TUNING = 'clarice'
clarice_texts = [os.path.join(BASE_DIR_FINE_TUNING, file) for file in os.listdir(BASE_DIR_FINE_TUNING)]
corpus_ft = []
for text in clarice_texts:
    with open(text, "r") as f:
        t = f.read()
        corpus_ft.append(t.replace("\n", ""))

processed_docs_ft = []
for doc in corpus_ft:
    processed_sents = tokenize_doc(doc)
    processed_docs_ft = processed_docs_ft + processed_sents



In [32]:
bigram_ft = Phraser(Phrases(processed_docs_ft, min_count=2, threshold=30))
gensim_corpus_ft = bigram_ft[processed_docs_ft]

In [39]:
model_ft = Word2Vec(size=100,min_count=4, window=4)
model_ft.build_vocab(gensim_corpus_ft)
model_ft.intersect_word2vec_format("word2vec_guimaraes.txt", lockf=1.0)
model_ft.train(processed_docs_ft, total_examples=model_ft.corpus_count, epochs=30)

(7525147, 12756390)

In [40]:
model_ft.wv.most_similar(positive=["homem"])

[('velho', 0.49869149923324585),
 ('cão', 0.4902336299419403),
 ('rapaz', 0.4862063527107239),
 ('quati', 0.4799765944480896),
 ('morto', 0.4369499683380127),
 ('gato', 0.42170870304107666),
 ('dono', 0.419739305973053),
 ('moço', 0.3973882794380188),
 ('bicho', 0.39719587564468384),
 ('animal', 0.39257094264030457)]