# Processamento de Linguagem Natural - Trabalho Prático 1
### Thaís Ferreira da Silva - 2021092571

### Import

In [53]:
# Imports do gensim - para word2vec
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

#Imports do NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Imports essenciais
import os
import random

### Preprocessamento do dados de treino

In [54]:
text8_path = './text8'
questions_words_path = os.path.abspath('./mini-questions-words.txt')
corpus = Text8Corpus(text8_path)

# Visualizando as primeiras palavras do corpus
sentence = next(iter(corpus))
print(sentence[:50])



['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the']


### Treinamento do modelo Word2Vec

In [55]:
def generate_hyperparameter_combinations(param_grid):
    from itertools import product
    keys, values = zip(*param_grid.items())
    combinations = [dict(zip(keys, v)) for v in product(*values)]
    return combinations


In [56]:
def train_and_save_model(corpus, params, output_dir):
    model_name = f"word2vec_vs{params['vector_size']}_win{params['window']}_sg{params['sg']}_ep{params['epochs']}"
    print(f"Treinando modelo: {model_name}")
    
    model = Word2Vec(
        sentences=corpus,
        vector_size=params['vector_size'],
        window=params['window'],
        sg=params['sg'],
        epochs=params['epochs']
    )
    
    model_path = os.path.join(output_dir, f"{model_name}.model")
    model.save(model_path)
    print(f"Modelo salvo: {model_path}")
    
    return model

In [57]:
output_dir = './word2vec_models'
os.makedirs(output_dir, exist_ok=True)

# Hiperparâmetros para o GridSearch
# param_grid = {
#     'vector_size': [50, 100, 200],      # Tamanho do vetor de palavras
#     'window': [3, 5, 7],                # Tamanho da janela de contexto
#     'sg': [0, 1],                      # CBOW (0) ou Skip-gram (1)
#     'epochs': [5, 10, 15],             # Número de iterações de treinamento
# }

param_grid = {
    'vector_size': [50],      # Tamanho do vetor de palavras
    'window': [2],                # Tamanho da janela de contexto
    'sg': [1],                      # CBOW (0) ou Skip-gram (1)
    'epochs': [2],             # Número de iterações de treinamento
}

# Gerar combinações de hiperparâmetros
combinations = generate_hyperparameter_combinations(param_grid)

In [58]:
for i, params in enumerate(combinations):
    print(f"\nTreinando combinação {i+1}/{len(combinations)}: {params}\n")
    model = train_and_save_model(corpus, params, output_dir)


Treinando combinação 1/1: {'vector_size': 50, 'window': 2, 'sg': 1, 'epochs': 2}

Treinando modelo: word2vec_vs50_win2_sg1_ep2
Modelo salvo: ./word2vec_models/word2vec_vs50_win2_sg1_ep2.model


### Avaliação do modelo treinado

In [59]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [60]:
def preprocessWord(word):
    if word is not None:
        word = word.lower().strip()
        if word in stop_words:
            return None
        word = lemmatizer.lemmatize(word)

    return word

In [61]:
def get_analogies(model, questions_words_path):
    final_analogies = []
    with open(questions_words_path, 'r') as f:
        for line in f:
            if line.startswith(':'):
                continue

            words = [preprocessWord(word) for word in line.split()]

            final_words = [word for word in words if word in model.wv]
            final_analogies.append(final_words)
    
    random.shuffle(final_analogies)
    return final_analogies

In [62]:
model = Word2Vec.load('./word2vec_models/word2vec_vs50_win2_sg1_ep2.model')
questions_words_path = './mini-questions-words.txt'

analogies = get_analogies(model, questions_words_path)
#print(analogies)

avg_distance = 0
counter = 0

print("numero de analogias ", len(analogies))

for analogy in analogies:
    result_vector = model.wv[analogy[0]] - model.wv[analogy[1]] + model.wv[analogy[2]]
    predicted = model.wv.similar_by_vector(result_vector, topn=20, restrict_vocab=None)
    predicted_word = next((word for word, _ in predicted if word not in analogy[:3]), None)
    if analogy[3] == predicted_word:
        counter += 1
    else:
        print(analogy, predicted)
        #verificar se alaogy[3] esta em predicted
    avg_distance += model.wv.distance(analogy[3], predicted_word)

print(f"Accuracy: {counter/len(analogies)}")





numero de analogias  22
['athens', 'greece', 'berlin', 'germany'] [('berlin', 0.8309546709060669), ('manhattan', 0.8087501525878906), ('edinburgh', 0.8067633509635925), ('philadelphia', 0.8064784407615662), ('bristol', 0.806315004825592), ('fort', 0.7960829138755798), ('birmingham', 0.7891896367073059), ('cemetery', 0.7830525040626526), ('richmond', 0.7792342305183411), ('pittsburgh', 0.7782689929008484), ('leicester', 0.7735268473625183), ('abbey', 0.7693635821342468), ('chicago', 0.7659742832183838), ('brooklyn', 0.7647572755813599), ('palace', 0.7644233703613281), ('hotel', 0.7625153064727783), ('baltimore', 0.7625142335891724), ('nottingham', 0.7623006105422974), ('conservatory', 0.7622580528259277), ('wembley', 0.7621303200721741)]
['athens', 'greece', 'moscow', 'russia'] [('wembley', 0.8364171981811523), ('moscow', 0.8303816318511963), ('cincinnati', 0.8212544918060303), ('pittsburgh', 0.8208373785018921), ('philadelphia', 0.8169144988059998), ('palace', 0.8120541572570801), ('na