# Processamento de Linguagem Natural - Trabalho Prático 1
### Thaís Ferreira da Silva - 2021092571

### Import

In [28]:
# Imports do gensim - para word2vec
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

# Imports essenciais
from scipy.spatial.distance import cosine
import os
import random

### Funções Auxiliares

In [29]:
def generate_hyperparameter_combinations(param_grid):
    from itertools import product
    keys, values = zip(*param_grid.items())
    combinations = [dict(zip(keys, v)) for v in product(*values)]
    return combinations


In [30]:
def train_and_save_model(corpus, params, output_dir):
    model_name = f"word2vec_vs{params['vector_size']}_win{params['window']}_sg{params['sg']}_ep{params['epochs']}"
    print(f"Treinando modelo: {model_name}")
    
    model = Word2Vec(
        sentences=corpus,
        vector_size=params['vector_size'],
        window=params['window'],
        sg=params['sg'],
        epochs=params['epochs']
    )
    
    model_path = os.path.join(output_dir, f"{model_name}.model")
    model.save(model_path)
    print(f"Modelo salvo: {model_path}")
    
    return model

In [None]:
def get_analogies(model, questions_words_path):
    final_analogies = []
    with open(questions_words_path, 'r') as f:
        for line in f:
            if line.startswith(':'):
                continue

            words = [word.lower().strip() for word in line.split()]

            final_words = [word for word in words if word in model.wv]
            #final_words = [word if word in model.wv else None for word in words]
            if len(final_words) != 4:
                continue
            
            final_analogies.append(final_words)
    
    random.shuffle(final_analogies)
    return final_analogies

In [None]:
def evaluate_models(model, analogies):
    avg_distance = 0
    correct = 0
    count = 0
    accuracy = 0

    #print("numero de analogias ", len(analogies))

    for analogy in analogies:
        if len(analogy) == 4:
            count += 1
            result_vector = model.wv[analogy[1]] - model.wv[analogy[0]] + model.wv[analogy[2]]
            predicted = model.wv.similar_by_vector(result_vector, topn=20, restrict_vocab=None)
            predicted_word = next((word for word, _ in predicted if word not in analogy[:3]), None)
            if analogy[3] == predicted_word:
                correct += 1
            # else:
            #     print(f"Analogias: {analogy}, Previsão: {predicted}, Palavra prevista: {predicted_word}, Esperado: {analogy[3]}\n")
            cosine_distance = cosine(model.wv[analogy[3]], model.wv[predicted_word])
            avg_distance += cosine_distance
            
    accuracy = correct/len(analogies)
    avg_distance /= count
    return accuracy, avg_distance

### Inicialização

In [33]:
text8_path = './text8'
corpus = Text8Corpus(text8_path)

sentence = next(iter(corpus))
print(sentence[:15])

output_dir = './word2vec_models_v1'
os.makedirs(output_dir, exist_ok=True)

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including']


### Treinamento do modelo Word2Vec

In [19]:
# Hiperparâmetros para o GridSearch
param_grid = {
    'vector_size': [50, 100, 200],      # Tamanho do vetor de palavras
    'window': [3, 5, 7],                # Tamanho da janela de contexto
    'sg': [0, 1],                      # CBOW (0) ou Skip-gram (1)
    'epochs': [5, 10, 15],             # Número de iterações de treinamento
}

combinations = generate_hyperparameter_combinations(param_grid)

for i, params in enumerate(combinations):
    print(f"\nTreinando combinação {i+1}/{len(combinations)}: {params}")
    model = train_and_save_model(corpus, params, output_dir)


Treinando combinação 1/54: {'vector_size': 50, 'window': 3, 'sg': 0, 'epochs': 5}
Treinando modelo: word2vec_vs50_win3_sg0_ep5
Modelo salvo: ./word2vec_models/word2vec_vs50_win3_sg0_ep5.model

Treinando combinação 2/54: {'vector_size': 50, 'window': 3, 'sg': 0, 'epochs': 10}
Treinando modelo: word2vec_vs50_win3_sg0_ep10
Modelo salvo: ./word2vec_models/word2vec_vs50_win3_sg0_ep10.model

Treinando combinação 3/54: {'vector_size': 50, 'window': 3, 'sg': 0, 'epochs': 15}
Treinando modelo: word2vec_vs50_win3_sg0_ep15
Modelo salvo: ./word2vec_models/word2vec_vs50_win3_sg0_ep15.model

Treinando combinação 4/54: {'vector_size': 50, 'window': 3, 'sg': 1, 'epochs': 5}
Treinando modelo: word2vec_vs50_win3_sg1_ep5
Modelo salvo: ./word2vec_models/word2vec_vs50_win3_sg1_ep5.model

Treinando combinação 5/54: {'vector_size': 50, 'window': 3, 'sg': 1, 'epochs': 10}
Treinando modelo: word2vec_vs50_win3_sg1_ep10
Modelo salvo: ./word2vec_models/word2vec_vs50_win3_sg1_ep10.model

Treinando combinação 6/54

### Avaliação dos modelos

In [43]:
model = Word2Vec.load('./word2vec_models_v1/word2vec_vs50_win3_sg0_ep5.model')
questions_words_path = './mini-questions-words.txt'

analogies = get_analogies(model, questions_words_path)

accuracy, avg_distance = evaluate_models(model, analogies)
print(f"vector_size: {model.vector_size}, window: {model.window}, sg: {model.sg}, epochs: {model.epochs} ---> Accuracy: {accuracy}, avg_distance: {avg_distance}")


vector_size: 50, window: 3, sg: 0, epochs: 5 ---> Accuracy: 0.13636363636363635, avg_distance: 0.22274907286017107


In [48]:
questions_words_path = './mini-questions-words.txt'

models = os.listdir(output_dir)
models = [model for model in models if model.endswith('.model')]

model_metrics = []

num_modelo = 0

for model_name in models:
    num_modelo +=1
    model = Word2Vec.load(os.path.join(output_dir, model_name))
    analogies = get_analogies(model, questions_words_path)
    accuracy, avg_distance = evaluate_models(model, analogies)
    #avg_distance -> quanto menor melhor
    print(f"Modelo: [{num_modelo}], vector_size: {model.vector_size}, window: {model.window}, sg: {model.sg}, epochs: {model.epochs} ---> Accuracy: {accuracy}, avg_distance: {avg_distance}")
    model_metrics.append({
        'model_name': model_name,
        'accuracy': accuracy,
        'avg_distance': avg_distance,
        'vector_size': model.vector_size,
        'window': model.window,
        'sg': model.sg,
        'epochs': model.epochs
    })

word2vec_vs50_win3_sg0_ep15.model
Modelo: [1], vector_size: 50, window: 3, sg: 0, epochs: 15 ---> Accuracy: 0.13636363636363635, avg_distance: 0.23116602516469917
Modelo: [2], vector_size: 50, window: 5, sg: 0, epochs: 10 ---> Accuracy: 0.22727272727272727, avg_distance: 0.24709466110980002
Modelo: [3], vector_size: 50, window: 7, sg: 1, epochs: 10 ---> Accuracy: 0.5454545454545454, avg_distance: 0.09843173911125498
Modelo: [4], vector_size: 50, window: 7, sg: 0, epochs: 5 ---> Accuracy: 0.18181818181818182, avg_distance: 0.25522425029787316
Modelo: [5], vector_size: 200, window: 3, sg: 0, epochs: 5 ---> Accuracy: 0.13636363636363635, avg_distance: 0.3191292889482323
Modelo: [6], vector_size: 100, window: 7, sg: 1, epochs: 10 ---> Accuracy: 0.7727272727272727, avg_distance: 0.06609374694487317
Modelo: [7], vector_size: 100, window: 3, sg: 0, epochs: 15 ---> Accuracy: 0.3181818181818182, avg_distance: 0.26987056081447164
Modelo: [8], vector_size: 200, window: 7, sg: 1, epochs: 10 ---> A

In [47]:
top_10_models = sorted(model_metrics, key=lambda x: x['avg_distance'])[:10]
for model_info in top_10_models:
    print(f"Model: {model_info['model_name']}, vector_size: {model_info['vector_size']}, window: {model_info['window']}, sg: {model_info['sg']}, epochs: {model_info['epochs']} ---> Accuracy: {model_info['accuracy']}, avg_distance: {model_info['avg_distance']}")

Model: word2vec_vs100_win3_sg1_ep15.model, vector_size: 100, window: 3, sg: 1, epochs: 15 ---> Accuracy: 0.8636363636363636, avg_distance: 0.041363916258210606
Model: word2vec_vs100_win7_sg1_ep15.model, vector_size: 100, window: 7, sg: 1, epochs: 15 ---> Accuracy: 0.8636363636363636, avg_distance: 0.04642604055254532
Model: word2vec_vs100_win5_sg1_ep15.model, vector_size: 100, window: 5, sg: 1, epochs: 15 ---> Accuracy: 0.8181818181818182, avg_distance: 0.050449529920780334
Model: word2vec_vs100_win7_sg1_ep10.model, vector_size: 100, window: 7, sg: 1, epochs: 10 ---> Accuracy: 0.7727272727272727, avg_distance: 0.06609374694487316
Model: word2vec_vs50_win7_sg1_ep15.model, vector_size: 50, window: 7, sg: 1, epochs: 15 ---> Accuracy: 0.6363636363636364, avg_distance: 0.07246233883362131
Model: word2vec_vs50_win3_sg1_ep15.model, vector_size: 50, window: 3, sg: 1, epochs: 15 ---> Accuracy: 0.5909090909090909, avg_distance: 0.07521606618799816
Model: word2vec_vs50_win3_sg1_ep10.model, vector