In [1]:
import os,sys
src = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(src)
from src.baseline_LSTM_POS import *
import copy

In [2]:
sentences, pos_tags = load_data("../UD_French-Sequoia/fr_sequoia-ud-train.conllu")
import random
def replace_with_unk(sentences, probability=0.1):
    """
    Replace words in the sentences with 'UNK' based on a given probability.
    Each word has an independent chance of being replaced.
    The structure of the sentences is maintained.
    
    :param sentences: A list of sentences, where each sentence is a list of words.
    :param probability: The probability of a word being replaced by 'UNK'.
    :return: New list of sentences with some words replaced by 'UNK'.
    """
    new_sentences = []
    for sentence in sentences:
        new_sentence = []
        for word in sentence:
            if random.random() < probability:
                new_sentence.append('UNK')
            else:
                new_sentence.append(word)
        new_sentences.append(new_sentence)
    return new_sentences


sentences=replace_with_unk(sentences, probability=0.001)

In [3]:
word_counts = Counter(word for sentence in sentences for word in sentence)
word_to_ix = {word: i+1 for i, word in enumerate(word_counts)}  # +1 pour le padding
word_to_ix['<PAD>'] = 0
word_to_ix['<OOV>'] = len(word_to_ix)

tag_counts = Counter(tag for tags in pos_tags for tag in tags)
tag_to_ix = {tag: i for i, tag in enumerate(tag_counts)}

In [4]:
embedding_dim = 64
hidden_dim = 128
epochs=50
batch_size=16

In [5]:
dataset = POSDataset(sentences, pos_tags, word_to_ix, tag_to_ix)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = POSModel(len(word_to_ix), embedding_dim, hidden_dim, len(tag_to_ix))
loss_function = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

  from .autonotebook import tqdm as notebook_tqdm


<h2> Training

In [6]:
for epoch in range(epochs):
    total_loss = 0
    for sentence_in, targets in data_loader:
        model.zero_grad()
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores.view(-1, len(tag_to_ix)), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}")

Epoch 1, Loss: 2.411568760871887
Epoch 2, Loss: 1.867871401991163


KeyboardInterrupt: 

In [None]:
loss, accuracy, f1 = evaluate_model(model, data_loader, loss_function, tag_to_ix)
print(f"Train Accuracy : {accuracy:.4f}")
print(f"Train loss : {loss:.4f}")
print(f"Train F1 score : {f1:.4f}")

Train Accuracy : 0.9121
Train loss : 0.3114
Train F1 score : 0.6903


<h2> Testing

In [None]:
test_sentences, test_pos_tags = load_data("../UD_French-Sequoia/fr_sequoia-ud-test.conllu")
test_dataset = POSDataset(test_sentences, test_pos_tags, word_to_ix, tag_to_ix)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
loss, accuracy, f1 = evaluate_model(model, test_data_loader, loss_function, tag_to_ix)
print(f"Test Accuracy : {accuracy:.4f}")
print(f"Test loss : {loss:.4f}")
print(f"Test F1 score : {f1:.4f}")

Test Accuracy : 0.8742
Test loss : 0.4222
Test F1 score : 0.6943
