In [1]:
import os
import string
import numpy as np
import pandas as pd
import traceback
import json
from gensim.models.word2vec import Word2Vec, Text8Corpus
from gensim.models import KeyedVectors
from pprint import pprint
from tqdm import tqdm

In [2]:
def load_stop_words(stop_path="data/stop_words_english.txt"):

    stop_words = {}
    with open(stop_path, 'r') as fd:
        word_list = fd.read().splitlines()
        word_list.pop()
        for word in word_list:
            stop_words[word] = True
    return stop_words


def get_text(text_clean_path = "data/text_clean", text_raw_path = "data/text8", chunk_size=1000):
    text = None
    stop_words = load_stop_words()
    # Se o dataset já existir.
    if os.path.exists(text_clean_path):
        with open(text_clean_path, 'r') as fd:
            text = [fd.read().split(' ')]
            return text
    # Se não, pré processa ele.
    else:
        docs = []
        with open(text_raw_path, 'r') as fd:
            sentences = fd.read().split('\n')
            for sent in sentences:
                clean_sent = sent.translate(str.maketrans('','', string.punctuation)).split(' ')
                # Quebrando os documentos em chunks de 1000 palavras.
                cont = 0
                while cont < len(clean_sent):
                    chunck = []
                    # Para cada chunk.
                    for word in clean_sent[cont : cont + chunk_size]:
                        # Verifica se a palavra é stopword.
                        if word not in stop_words:
                            chunck.append(word)
                    cont += chunk_size
                    docs.append(chunck)
        return docs

In [3]:
sentences = get_text()
len(sentences)

17006

# Model Training

In [4]:
def train_model(text, window, vsize, sg, min_count):
    model_path = f"models/window_size-{window}_vector_size-{vsize}_sg-{sg}_min_count-{min_count}"
    model = Word2Vec(sentences=text, vector_size=vsize, window=window, min_count=min_count, sg=sg, workers=10)
    model.save(model_path)

In [5]:
window_sizes = [1,3,5,7,9,11,13,15,19,23,25]
vector_sizes = [50, 75, 100, 150, 200, 300, 500, 1000, 2000]
min_counts = [1, 2, 3, 5, 7, 10]

## Flag de controle para rodar os modelos.

In [6]:
"""
Mude a flag TRAIN_MODELS para true somente se quiser
treinar os modelos.
"""

TRAIN_MODELS = False

In [7]:
if TRAIN_MODELS:
    
    # Variando o tamanho da janela.
    for window in tqdm(window_sizes):
        train_model(sentences, window, 100, 1, 1)
    
    # Variando o tamanho do vetor.
    for vsize in tqdm(vector_sizes):
        train_model(sentences, 5, vsize, 1, 1)
    
    # Variando o modelo (skip or cbow)
    for sg in tqdm([0,1]):
        train_model(sentences, 5, 100, sg, 1)
    
    # Variando o min_count.
    for min_count in tqdm(min_counts):
        train_model(sentences, 5, 100, 1, min_count)


# Model Evaluation

In [11]:
def prep_questions(qpath="data/questions-words.txt"):

    dquestions = {}
    with open(qpath, 'r') as fd:
        key = None
        tests = fd.read().lower().split('\n')
        rest = tests.pop()
        for test in tests:
            categ = test[0]
            if categ == ':':
                dquestions[test] = []
                key = test
            else:
                dquestions[key].append(test.split(' '))
    return dquestions

def are_words_in(words, model_words):

    for word in words:
        if word not in model_words.wv.key_to_index:
            print(f"{word} does not exist")
            return False
        return True

def ranking_words_most_similar(words, model, topn=5):
    
    try:
        w1, w2, w3, _ = words
        return list(model.wv.most_similar(positive=[w1, w3], negative=[w2], topn=topn))
    except Exception as err:
        return []


def evaluate_model(dquestions, model, topn=5):
    
    score_categ = {}
    number_exceptions = 0
    # Para cada categoria de teste.
    for categ in dquestions:
        hits = 0
        misses = 0
        close_to = 0
        score = 0
        score_categ[categ] = {}
        # Para cada test.
        for test in dquestions[categ]:
            target = test[-1]
            ranking = ranking_words_most_similar(test, model, topn=topn)
            if ranking:
                #random_word = np.random.randint(len(ranking))
                #target = ranking[random_word][0]
                #if np.random.randint(2) == 0:
                #    target = test[-1]
                first_word = ranking.pop(0)
                # Verificando se a primeira palavra é o target esperado.
                if first_word[0] == target:
                    hits += 1
                    score += first_word[1]
                # Se não verifique se a palavra está no ranking.
                else:
                    rwords = [w[0] for w in ranking ]
                    if target in rwords:
                        close_to += 1
                        distance = rwords.index(target) + 1
                        score += (1 / distance) * ranking[distance - 1][1]
                    else:
                        misses += 1
            else:
                number_exceptions +=1
        score_categ[categ]["hits"] = hits
        score_categ[categ]["misses"] = misses
        score_categ[categ]["close_to"] = close_to
        score_categ[categ]["score"] = score
    print(f"Exceptions: {number_exceptions}")
    return score_categ

In [12]:
dquestions = prep_questions()
#models_paths = np.random.choice([ f for f in os.listdir("models/") if f.find("npy") == -1 ], 3)
models_paths = [ f for f in os.listdir("models/") if f.find("npy") == -1 ]
models_score = {}
for f in models_paths:
    print(f"\nModel: {f}")
    model = KeyedVectors.load(f"models/{f}")
    models_score[f] = evaluate_model(dquestions, model)
    output = f"outputs/models/{f}.json"
    with open(output, 'w') as fd:
        json.dump(models_score[f], fd, indent=6)



Model: window_size-5_vector_size-2000_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-100_sg-1_min_count-2
Exceptions: 3195

Model: window_size-5_vector_size-50_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-1000_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-100_sg-0_min_count-1
Exceptions: 2878

Model: window_size-25_vector_size-100_sg-1_min_count-1
Exceptions: 2878

Model: window_size-15_vector_size-100_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-200_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-100_sg-1_min_count-10
Exceptions: 5562

Model: window_size-3_vector_size-100_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-100_sg-1_min_count-7
Exceptions: 4698

Model: window_size-5_vector_size-500_sg-1_min_count-1
Exceptions: 2878

Model: window_size-7_vector_size-100_sg-1_min_count-1
Exceptions: 2878

Model: window_size-5_vector_size-75_sg-1_min_count-1
Except