# Processamento de Linguagem Natural - Trabalho Prático 1
### Thaís Ferreira da Silva - 2021092571

### Import

In [1]:
# Imports do gensim - para word2vec
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

#Imports do NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Imports essenciais
import os
import random

### Pré-processamento dos dados de treino

In [2]:
# Locations of the training corpus and of the analogy question file,
# both given relative to the current working directory.
text8_path = './text8'
questions_words_path = os.path.abspath('./mini-questions-words.txt')
# Wrap the text8 file in gensim's Text8Corpus; iterating it yields lists of tokens.
corpus = Text8Corpus(text8_path)

# Peek at the first 50 tokens of the first sentence yielded by the corpus
sentence = next(iter(corpus))
print(sentence[:50])



['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the']


### Treinamento do modelo Word2Vec

In [None]:
def generate_hyperparameter_combinations(param_grid):
    """Expand a hyperparameter grid into every combination of its values.

    Args:
        param_grid: Mapping from hyperparameter name to an iterable of
            candidate values for that hyperparameter.

    Returns:
        A list of dicts, one per element of the Cartesian product of the
        candidate values, each mapping every hyperparameter name to one
        chosen value. Names keep the mapping's iteration order. An empty
        grid yields ``[{}]`` (the single "no parameters" combination), and a
        grid containing an empty value list yields ``[]``.
    """
    from itertools import product

    names = list(param_grid)
    # Look the value lists up by name instead of unpacking zip(*items()):
    # the unpacking form raises ValueError when the grid is empty.
    value_lists = [param_grid[name] for name in names]
    return [dict(zip(names, choice)) for choice in product(*value_lists)]


In [4]:
def train_and_save_model(corpus, params, output_dir):
    """Train a Word2Vec model with the given hyperparameters and save it.

    Args:
        corpus: Sentences passed to ``Word2Vec`` as ``sentences``.
        params: Dict holding the keys ``'vector_size'``, ``'window'``,
            ``'sg'`` and ``'epochs'``; the values are forwarded to
            ``Word2Vec`` and also encoded in the saved file name.
        output_dir: Directory that receives ``<model_name>.model``. It is
            created (including parents) when it does not exist yet.

    Returns:
        The trained ``Word2Vec`` model.

    Raises:
        KeyError: If ``params`` lacks one of the required keys.
    """
    model_name = (
        f"word2vec_vs{params['vector_size']}_win{params['window']}"
        f"_sg{params['sg']}_ep{params['epochs']}"
    )
    print(f"Treinando modelo: {model_name}")

    # Create the destination before training so a bad path fails fast instead
    # of after the (potentially long) training run. An empty output_dir means
    # "current directory" for os.path.join, and os.makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = Word2Vec(
        sentences=corpus,
        vector_size=params['vector_size'],
        window=params['window'],
        sg=params['sg'],
        epochs=params['epochs']
    )

    model_path = os.path.join(output_dir, f"{model_name}.model")
    model.save(model_path)
    print(f"Modelo salvo: {model_path}")

    return model

In [None]:
# Directory that receives the trained models; created here if missing.
output_dir = './word2vec_models'
os.makedirs(output_dir, exist_ok=True)

# Full hyperparameter grid for the grid search (currently commented out)
# param_grid = {
#     'vector_size': [50, 100, 200],      # Size of the word vectors
#     'window': [3, 5, 7],                # Size of the context window
#     'sg': [0, 1],                      # CBOW (0) or Skip-gram (1)
#     'epochs': [5, 10, 15],             # Number of training iterations
# }

# Active grid: one value per hyperparameter, i.e. a single combination.
param_grid = {
    'vector_size': [50],      # Size of the word vectors
    'window': [2],                # Size of the context window
    'sg': [1],                      # CBOW (0) or Skip-gram (1)
    'epochs': [1],             # Number of training iterations
}

# Generate every combination of the hyperparameters above
combinations = generate_hyperparameter_combinations(param_grid)

In [6]:
# Train and save one model per hyperparameter combination. `model` is rebound
# on every iteration, so only the last trained model remains in scope.
for i, params in enumerate(combinations):
    print(f"\nTreinando combinação {i+1}/{len(combinations)}: {params}\n")
    model = train_and_save_model(corpus, params, output_dir)


Treinando combinação 1/1: {'vector_size': 50, 'window': 2, 'sg': 1, 'epochs': 1}

Treinando modelo: word2vec_vs50_win2_sg1_ep1
Modelo salvo: ./word2vec_models/word2vec_vs50_win2_sg1_ep1.model


### Avaliação do modelo treinado

In [7]:
# English stop-word list from NLTK; preprocessWord drops any word found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' data to
# be downloaded beforehand — confirm.
stop_words = stopwords.words('english')
# WordNet lemmatizer applied by preprocessWord to every non-stop word.
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocessWord(word):
    """Normalize a single word for vocabulary lookup.

    The word is lower-cased and stripped of surrounding whitespace. Stop
    words (per the module-level ``stop_words``) are discarded; every other
    word is passed through the module-level ``lemmatizer``.

    Args:
        word: The raw word, or ``None``.

    Returns:
        The lemmatized word, or ``None`` when the input is ``None`` or the
        normalized word is a stop word.
    """
    # Nothing to normalize: hand the None straight back.
    if word is None:
        return word

    normalized = word.lower().strip()
    if normalized in stop_words:
        return None

    return lemmatizer.lemmatize(normalized)

In [53]:
def get_analogies(model, questions_words_path, max_analogies=500):
    """Load a shuffled sample of analogy questions usable with ``model``.

    Every line of the file that does not start with ':' is split on
    whitespace and each token is normalized with ``preprocessWord``. The
    fourth token is the target word; the first three are the input words.

    A line is kept only when the target is in ``model.wv`` and at least one
    input word is in ``model.wv``. Input words that were dropped by
    ``preprocessWord`` or are missing from the vocabulary are left out, so a
    kept entry may hold fewer than three input words.

    Args:
        model: Object exposing a ``wv`` attribute that supports ``in``
            membership tests on words.
        questions_words_path: Path to the analogy question file.
        max_analogies: Maximum number of analogies returned (default 500).

    Returns:
        A list of at most ``max_analogies`` entries in random order, each of
        the form ``[*input_words, target]``.
    """
    final_analogies = []
    with open(questions_words_path, 'r') as f:
        for line in f:
            # Lines starting with ':' are skipped (not analogy questions).
            if line.startswith(':'):
                continue

            words = [preprocessWord(word) for word in line.split()]

            # Blank or truncated lines have no fourth token; indexing them
            # would raise IndexError, so skip them.
            if len(words) < 4:
                continue

            target = words[3]
            if target is None or target not in model.wv:
                continue

            # Only the first three tokens are inputs. Filtering the whole
            # line would also pick up the target, which would then appear
            # twice in the entry and leak into the input words.
            input_words = [
                word for word in words[:3]
                if word is not None and word in model.wv
            ]

            if input_words:
                final_analogies.append(input_words + [target])

    random.shuffle(final_analogies)
    return final_analogies[:max_analogies]

In [54]:
def distanceToMostSimilarWord(model, input_words: list, true_word: str) -> float:
    """Distance between the expected word and the model's top prediction.

    Args:
        model: Object whose ``wv`` attribute provides ``most_similar`` and
            ``distance``.
        input_words: Words passed to ``model.wv.most_similar``.
        true_word: The word the prediction is compared against.

    Returns:
        ``model.wv.distance`` between ``true_word`` and the first word
        returned by ``most_similar``.
    """
    ranked_candidates = model.wv.most_similar(input_words)
    # Each candidate is indexed as (word, ...); only the top word is needed.
    top_candidate = ranked_candidates[0]
    predicted_word = top_candidate[0]
    return model.wv.distance(true_word, predicted_word)


In [1]:
# Reload the model saved by the training step and point at the analogy file.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'Word2Vec' is not defined" — it was executed before the
# import cell in a fresh kernel; run the notebook top to bottom.
model = Word2Vec.load('./word2vec_models/word2vec_vs50_win2_sg1_ep1.model')
questions_words_path = './mini-questions-words.txt'

# Build the evaluation sample (get_analogies shuffles and caps the list).
analogies = get_analogies(model, questions_words_path)
print(analogies)

# Accumulators initialised to zero; they are not updated within this cell.
avg_distance = 0
counter = 0

print("numero de analogias ", len(analogies))


NameError: name 'Word2Vec' is not defined