Dependências

In [2]:
%pip install gensim
%pip install nltk

from sklearn.model_selection import ParameterGrid
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import random
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Pré-processamento do corpus: são aplicadas as operações de lowercasing, remoção de stopwords e lematização (redução de cada palavra à sua forma de dicionário, uma técnica semelhante ao stemming). Os tokens resultantes são agrupados em sentenças de 10 tokens.

In [3]:
def preprocessWord(word):
    """Normalize a single word for vocabulary lookup.

    The word is lowercased and stripped of surrounding whitespace. If the
    result is a member of the module-level ``stop_words`` the function
    returns ``None``; otherwise it returns the result of passing the word
    through the module-level ``lemmatizer``.
    """
    normalized = word.lower().strip()
    is_stopword = normalized in stop_words
    return None if is_stopword else lemmatizer.lemmatize(normalized)

# Load the corpus and flatten it into one continuous stream of tokens.
corpus = LineSentence('corpus.txt')
docs = [doc for doc in corpus]
tokens = []
for doc in docs:
    tokens.extend(doc)
print(len(tokens))

nltk.download('stopwords')
# WordNetLemmatizer looks words up in the WordNet corpus; download it so the
# notebook also runs on a machine where it has never been installed.
nltk.download('wordnet')
stop_words = stopwords.words('english')
# Set copy used for the bulk filter below: membership tests on the list are
# linear in its length, which adds up over millions of tokens.
stop_word_set = set(stop_words)
lemmatizer = WordNetLemmatizer()

print("Lowercasing...")
tokens = [token.lower() for token in tokens]

print("Removing stopwords...")
tokens = [token for token in tokens if token not in stop_word_set]

print("Lemmatizing...")
tokens = [lemmatizer.lemmatize(token) for token in tokens]

print("Grouping into sentences...")
sentences = []
sentence = []
counter = 0
words_per_sentence = 10
for token in tokens:
    counter += 1
    if counter > words_per_sentence:
        # The current sentence is full: store it and start a new one
        # with the token that did not fit.
        sentences.append(sentence)
        sentence = [token]
        counter = 1
        continue

    sentence.append(token)

# Flush the last sentence. Without this, the final (possibly shorter)
# group of tokens would be silently dropped from the training data.
if sentence:
    sentences.append(sentence)

print(len(tokens))
# Explicit encoding so non-ASCII tokens are written the same way on every
# platform instead of depending on the locale's default encoding.
with open('sentences.txt', 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(' '.join(sentence) + '\n')

17005207


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vip/victor.henrique/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lowercasing...
Removing stopwords...
Lemmatizing...
Grouping into sentences...
10890638


Removendo do conjunto de avaliação as palavras desconhecidas (ausentes do vocabulário do modelo) e amostrando aleatoriamente até 500 das analogias restantes, para que a etapa de avaliação seja mais rápida. Um conjunto de avaliação com mais de 500 analogias não traz impacto significativo na avaliação do modelo.

In [4]:
def getAnalogiesForModel(model, path="questions-words.txt", sample_size=500, seed=None):
    """Build a random sample of analogies that the model can be evaluated on.

    Each non-header line of the analogy file is expected to hold four
    whitespace-separated words: three context words followed by the target.
    Every word is normalized with ``preprocessWord``. A line is kept only if
    its target is in the model vocabulary and at least one context word
    survives (i.e. is not a stopword and is in the vocabulary).

    Args:
        model: Trained model exposing its vocabulary through ``model.wv``.
        path: Location of the analogy file. Lines starting with ``:`` are
            section headers and are skipped.
        sample_size: Maximum number of analogies to return. ``None`` returns
            all of them.
        seed: When given, the shuffle uses a dedicated ``random.Random(seed)``
            so the sample is reproducible. When ``None``, the global
            ``random`` state is used.

    Returns:
        A list of analogies, each a list of the surviving context words
        followed by the target word.
    """
    final_analogies = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.startswith(":"):
                continue

            fields = line.split()
            # Blank or truncated lines have no target word; skip them
            # instead of failing with an IndexError.
            if len(fields) < 4:
                continue

            # Check for None first: a stopword target can never be looked up.
            target = preprocessWord(fields[3])
            if target is None or target not in model.wv:
                continue

            final_words = []
            for raw_word in fields[:3]:
                # Normalize each context word exactly once.
                word = preprocessWord(raw_word)
                if word is not None and word in model.wv:
                    final_words.append(word)

            if not final_words:
                continue

            final_analogies.append(final_words + [target])

    # Randomly truncate the analogies
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(final_analogies)
    return final_analogies[:sample_size]

Implementando o esquema de avaliação proposto

In [5]:
def distanceToMostSimilarWord(model, input_words: list, true_word: str) -> float:
    """Return the distance between ``true_word`` and the model's top prediction.

    The prediction is the word in the first entry returned by
    ``model.wv.most_similar`` for ``input_words``; the distance is computed
    with ``model.wv.distance``.
    """
    keyed_vectors = model.wv
    top_hit = keyed_vectors.most_similar(input_words)[0]
    predicted_word = top_hit[0]
    return keyed_vectors.distance(true_word, predicted_word)

def evaluateModel(model):
    """Score a model as the mean distance between targets and predictions.

    The analogies come from ``getAnalogiesForModel``. For each one, the
    distance between the expected target and the model's most similar word
    for the context words is computed, and the distances are averaged.

    Returns:
        The mean distance, or ``float("nan")`` when no analogy is usable for
        this model. NaN compares false against every number, so a caller
        tracking the smallest score never selects such a model.
    """
    analogies = getAnalogiesForModel(model)
    if not analogies:
        # Avoid a ZeroDivisionError when nothing could be evaluated.
        return float("nan")

    total_distance = 0
    for analogy in analogies:
        words = analogy[:-1]
        target = analogy[-1]
        total_distance += distanceToMostSimilarWord(model, words, target)

    return total_distance / len(analogies)

Realizando uma busca em grade (grid search) para encontrar uma combinação ótima de hiperparâmetros.

In [None]:
# Hyperparameter combinations to try.
param_grid = {
    'sg': [0, 1], # 1 = skip-gram, 0 = CBOW
    'vector_size': [50, 100, 200],
    'window': [2, 5, 10],
    'min_count': [1, 2, 3],
}

grid = list(ParameterGrid(param_grid))

# The evaluation samples analogies with the global `random` state. Re-seeding
# before every evaluation makes the draw reproducible, so the candidates are
# not ranked on unrelated random samples. The pool of usable analogies still
# depends on each model's vocabulary.
EVAL_SEED = 42

# Lower scores are better (mean distance between target and prediction).
best_score = float('inf')
best_params = None

for params in grid:
    model = Word2Vec(sentences=sentences, **params)
    random.seed(EVAL_SEED)
    score = evaluateModel(model)

    if score < best_score:
        best_score = score
        best_params = params

        print(f"New best score found => {params}: {score}")

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

Escolhendo o melhor modelo encontrado através do grid-search

In [6]:
# Train the final model with the hyperparameters chosen from the grid search
# (sg=1 selects skip-gram).
model = Word2Vec(
    sentences=sentences,
    sg=1,
    vector_size=50,
    window=2,
    min_count=1,
)

In [7]:
# Evaluate the selected model: mean distance between each expected target and
# the model's prediction (lower is better).
score = evaluateModel(model)
print(score)

0.25681810003519057


Por curiosidade, abaixo estão a melhor e a pior predição do modelo dentro do conjunto de avaliação utilizado.

In [10]:
# Find the analogies with the smallest and the largest distance between the
# expected target and the model's prediction.
analogies = getAnalogiesForModel(model)
min_distance = float('inf')
min_case = [[], ""]
max_distance = float('-inf')
max_case = [[], ""]

for analogy in analogies:
    words = analogy[:-1]
    target = analogy[-1]

    distance = distanceToMostSimilarWord(model, words, target)

    # Strict comparisons: on ties the first analogy encountered is kept.
    if distance < min_distance:
        min_distance = distance
        min_case[0] = words
        min_case[1] = target

    if distance > max_distance:
        max_distance = distance
        max_case[0] = words
        max_case[1] = target

if analogies:
    print(f"Min distance: {min_distance}")
    print(f"Context: {min_case[0]}, target: {min_case[1]}, prediction: {model.wv.most_similar(min_case[0])[0][0]}")
    print()
    print(f"Max distance: {max_distance}")
    print(f"Context: {max_case[0]}, target: {max_case[1]}, prediction: {model.wv.most_similar(max_case[0])[0][0]}")
else:
    # Nothing to report; querying most_similar with an empty context would fail.
    print("No usable analogies for this model.")

Min distance: 0.0
Context: ['see', 'seeing', 'look'], target: looking, prediction: looking

Max distance: 0.5434837341308594
Context: ['playing', 'played', 'increasing'], target: increased, prediction: play
