# Processamento de Linguagem Natural - Trabalho Prático 1
### Thaís Ferreira da Silva - 2021092571

### Import

In [None]:
# Imports do gensim - para word2vec
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

# Imports essenciais
from scipy.spatial.distance import cosine
import os
import random

### Funções Auxiliares

In [None]:
def generate_hyperparameter_combinations(param_grid):
    """Expand a parameter grid into every combination of its values.

    Args:
        param_grid: Mapping from hyperparameter name to an iterable of
            candidate values.

    Returns:
        A list of dicts, one per point of the Cartesian product of the
        candidate values, each mapping every hyperparameter name to one of
        its candidates. An empty grid yields ``[{}]`` (the single empty
        combination); a grid in which any parameter has no candidates
        yields ``[]``.
    """
    from itertools import product

    # Collect names and candidate lists separately instead of unpacking
    # ``zip(*items)``, which raises ValueError when the grid is empty.
    names = list(param_grid)
    candidate_lists = [param_grid[name] for name in names]
    return [dict(zip(names, combo)) for combo in product(*candidate_lists)]


In [None]:
def train_and_save_model(corpus, params, output_dir):
    """Train a Word2Vec model with the given hyperparameters and save it.

    The model file is named after its hyperparameters
    (``word2vec_vs<vector_size>_win<window>_sg<sg>_ep<epochs>.model``) and
    written inside ``output_dir``.

    Args:
        corpus: Sentences handed to ``Word2Vec`` as its ``sentences`` argument.
        params: Dict with the keys ``'vector_size'``, ``'window'``, ``'sg'``
            and ``'epochs'``.
        output_dir: Directory that receives the saved model; created if it
            does not exist yet.

    Returns:
        The trained ``Word2Vec`` model.

    Raises:
        KeyError: If ``params`` lacks one of the required keys.
    """
    model_name = f"word2vec_vs{params['vector_size']}_win{params['window']}_sg{params['sg']}_ep{params['epochs']}"
    print(f"Treinando modelo: {model_name}")

    # Make sure the destination exists before training, so a missing
    # directory cannot make the save fail after the whole training run.
    os.makedirs(output_dir, exist_ok=True)

    model = Word2Vec(
        sentences=corpus,
        vector_size=params['vector_size'],
        window=params['window'],
        sg=params['sg'],
        epochs=params['epochs'],
    )

    model_path = os.path.join(output_dir, f"{model_name}.model")
    model.save(model_path)
    print(f"Modelo salvo: {model_path}")

    return model

In [None]:
def get_analogies(model, questions_words_path):
    """Load the analogy questions that the model's vocabulary fully covers.

    Lines starting with ``:`` (section headers) are skipped. Every other line
    is split on whitespace and lower-cased. A line is kept only if it has
    exactly four words and all four are present in ``model.wv``; keeping a
    partially covered line would hand callers a shorter list whose positions
    no longer mean "a, b, c, d".

    Args:
        model: Object whose ``wv`` attribute supports ``word in model.wv``.
        questions_words_path: Path to the analogy questions text file.

    Returns:
        A list of four-word lists, in random order (shuffled with
        ``random.shuffle``).
    """
    final_analogies = []
    with open(questions_words_path, 'r') as f:
        for line in f:
            if line.startswith(':'):
                continue

            words = [word.lower() for word in line.split()]

            # Blank or malformed lines cannot form an analogy.
            if len(words) != 4:
                continue

            # Drop the whole question when any word is out of vocabulary,
            # instead of silently removing just that word.
            if all(word in model.wv for word in words):
                final_analogies.append(words)

    random.shuffle(final_analogies)
    return final_analogies

### Pré-processamento dos dados de treino

In [None]:
# Location of the text8 training corpus, wrapped in gensim's Text8Corpus
# so it can be passed to Word2Vec as the sentence source.
text8_path = './text8'
corpus = Text8Corpus(text8_path)

# Sanity check: show the first 50 items of the first sentence the corpus yields.
sentence = next(iter(corpus))
print(sentence[:50])


### Treinamento do modelo Word2Vec

In [None]:
# Directory that receives one saved model file per trained configuration.
output_dir = './word2vec_models'
os.makedirs(output_dir, exist_ok=True)

# Grid-search space. A wider grid (currently disabled) is kept for reference:
#     vector_size: [50, 100, 200]
#     window:      [3, 5, 7]
#     sg:          [0, 1]
#     epochs:      [5, 10, 15]
param_grid = dict(
    vector_size=[50],     # size of the word vectors
    window=[3, 5, 7],     # size of the context window
    sg=[0, 1],            # CBOW (0) or Skip-gram (1)
    epochs=[5],           # number of training iterations
)

combinations = generate_hyperparameter_combinations(param_grid)

# Train and save one model per combination; after the loop `model` refers
# to the last model trained.
for i, params in enumerate(combinations):
    print(f"\nTreinando combinação {i+1}/{len(combinations)}: {params}\n")
    model = train_and_save_model(corpus, params, output_dir)

### Avaliação do modelo treinado

In [None]:
def evaluate_analogies(model, analogies):
    """Score a model on "a is to b as c is to d" analogy questions.

    For every analogy ``[a, b, c, d]`` the query vector
    ``wv[b] - wv[a] + wv[c]`` is built, its 20 most similar words are
    retrieved with ``similar_by_vector``, and the first of those that is not
    ``a``, ``b`` or ``c`` is taken as the prediction.

    Args:
        model: Object whose ``wv`` attribute supports lookup by word
            (``wv[word]``) and ``similar_by_vector``.
        analogies: Iterable of word lists. Entries that do not contain
            exactly four words are skipped.

    Returns:
        A tuple ``(accuracy, cosine_vec)``:
            accuracy: Fraction of the evaluated analogies whose prediction
                equals ``d``; ``0.0`` when nothing could be evaluated.
            cosine_vec: Cosine distances between the vectors of ``d`` and of
                the predicted word, one per analogy that yielded a prediction.
    """
    hits = 0
    evaluated = 0
    cosine_vec = []

    for analogy in analogies:
        # A malformed entry has no well-defined a, b, c, d positions.
        if len(analogy) != 4:
            continue
        evaluated += 1

        a, b, c, expected = analogy
        result_vector = model.wv[b] - model.wv[a] + model.wv[c]
        predicted = model.wv.similar_by_vector(result_vector, topn=20, restrict_vocab=None)

        # The query words themselves tend to rank near the top, so skip them.
        predicted_word = next((word for word, _ in predicted if word not in (a, b, c)), None)
        if predicted_word is None:
            # No usable candidate: counts as a miss, with no distance to record.
            continue

        if predicted_word == expected:
            hits += 1
        cosine_vec.append(cosine(model.wv[expected], model.wv[predicted_word]))

    # Computed once after the loop so that empty input still returns a value.
    accuracy = hits / evaluated if evaluated else 0.0
    return accuracy, cosine_vec

In [None]:
# model = Word2Vec.load('./word2vec_models/word2vec_vs50_win2_sg1_ep2.model')
# questions_words_path = './mini-questions-words.txt'

# analogies = get_analogies(model, questions_words_path)

# accuracy = evaluate_analogies(model, analogies)

# print(f"Accuracy: {accuracy}")


In [None]:
# Evaluate the analogy questions on every trained model found in the
# output directory.
questions_words_path = './mini-questions-words.txt'

# Sorted so the report order does not depend on the arbitrary order in which
# os.listdir returns directory entries.
models = sorted(name for name in os.listdir(output_dir) if name.endswith('.model'))

for model_name in models:
    model = Word2Vec.load(os.path.join(output_dir, model_name))

    # Build the question list from the model being evaluated rather than from
    # whatever `model` an earlier cell left behind: this keeps the cell
    # runnable on its own and keeps the vocabulary filter matched to the model.
    analogies = get_analogies(model, questions_words_path)
    print("numero de analogias ", len(analogies))

    accuracy, cosine_vec = evaluate_analogies(model, analogies)
    print(f"vector_size: {model.vector_size}, window: {model.window}, sg: {model.sg}, epochs: {model.epochs} ---> Accuracy: {accuracy}")
    # formatted_cosine_vec = [f"{value:.5f}" for value in cosine_vec]
    # print("Cosine Vec:", formatted_cosine_vec)
