In [2]:
pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [10]:
import torch
from torch.utils.data import Dataset
from conllu import parse_incr

def load_data(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for tokenlist in parse_incr(file):
            words = [token['form'] for token in tokenlist]
            pos_tags = [token['upostag'] for token in tokenlist]
            heads = [token['head'] for token in tokenlist]
            labels = [token['deprel'] for token in tokenlist]
            sentences.append((words, pos_tags, heads, labels))
    return sentences



In [11]:
def build_vocab(data):
    words = set()
    pos_tags = set()
    labels = set()
    for sentence in data:
        words.update(sentence[0])
        pos_tags.update(sentence[1])
        labels.update(sentence[3])
    word2idx = {word: i + 1 for i, word in enumerate(words)}  # +1 to start index from 1
    word2idx['<UNK>'] = 0  # Unknown words
    pos2idx = {tag: i + 1 for i, tag in enumerate(pos_tags)}  # +1 to start index from 1
    pos2idx['<UNK>'] = 0  # Unknown POS tags
    label2idx = {label: i for i, label in enumerate(labels)}
    return word2idx, pos2idx, label2idx

# Load data
dev_data = load_data('dev.gold.conll')
train_data = load_data('train.gold.conll')

# Build vocabularies
word2idx, pos2idx, label2idx = build_vocab(train_data)


In [7]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class DependencyParsingDataset(Dataset):
    def __init__(self, data, word2idx, pos2idx, label2idx):
        self.data = data
        self.word2idx = word2idx
        self.pos2idx = pos2idx
        self.label2idx = label2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, pos_tags, heads, labels = self.data[idx]
        word_idxs = [self.word2idx.get(word.lower(), self.word2idx['<UNK>']) for word in words]
        pos_idxs = [self.pos2idx.get(pos, self.pos2idx['<UNK>']) for pos in pos_tags]
        head_idxs = [int(head) for head in heads]  # Ensure head indices are integers
        label_idxs = [self.label2idx[label] for label in labels]
        return torch.tensor(word_idxs, dtype=torch.long), \
               torch.tensor(pos_idxs, dtype=torch.long), \
               torch.tensor(head_idxs, dtype=torch.long), \
               torch.tensor(label_idxs, dtype=torch.long)

def collate_fn(batch):
    word_idxs, pos_idxs, head_idxs, label_idxs = zip(*batch)
    word_idxs = pad_sequence(word_idxs, batch_first=True, padding_value=0)
    pos_idxs = pad_sequence(pos_idxs, batch_first=True, padding_value=0)
    head_idxs = pad_sequence(head_idxs, batch_first=True, padding_value=0)
    label_idxs = pad_sequence(label_idxs, batch_first=True, padding_value=0)
    return word_idxs, pos_idxs, head_idxs, label_idxs


In [26]:
import torch
import torch.nn as nn

class DependencyParser(nn.Module):
    def __init__(self, vocab_size, pos_size, num_labels, embedding_dim, hidden_dim):
        super(DependencyParser, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embeddings = nn.Embedding(pos_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.heads_linear = nn.Linear(hidden_dim * 2, vocab_size)  # Predicting heads
        self.labels_linear = nn.Linear(hidden_dim * 2, num_labels)  # Predicting labels

    def forward(self, word_idxs, pos_idxs):
        word_embeds = self.word_embeddings(word_idxs)
        pos_embeds = self.pos_embeddings(pos_idxs)
        embeddings = torch.cat((word_embeds, pos_embeds), dim=2)
        lstm_out, _ = self.lstm(embeddings)
        head_space = self.heads_linear(lstm_out)
        label_space = self.labels_linear(lstm_out)
        return head_space, label_space


In [None]:
from torch.utils.data import DataLoader

def train_model(model, dataset, epochs, learning_rate):
    data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    head_loss_function = nn.CrossEntropyLoss(ignore_index=0)  # Assuming padding index is 0
    label_loss_function = nn.CrossEntropyLoss(ignore_index=0)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for word_idxs, pos_idxs, head_idxs, label_idxs in data_loader:
            optimizer.zero_grad()
            head_outputs, label_outputs = model(word_idxs, pos_idxs)
            loss_heads = head_loss_function(head_outputs.transpose(1, 2), head_idxs)
            loss_labels = label_loss_function(label_outputs.transpose(1, 2), label_idxs)
            loss = loss_heads + loss_labels
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")


# Example usage
dataset = DependencyParsingDataset(train_data, word2idx, pos2idx, label2idx)
model = DependencyParser(len(word2idx), len(pos2idx), len(label2idx), 100, 200)
train_model(model, dataset, 1, 0.01)


In [None]:
def evaluate(model, dataset, device=torch.device('cpu')):
    model.eval()
    total_tokens = 0
    correct_heads = 0
    with torch.no_grad():
        for word_idxs, pos_idxs, head_idxs, label_idxs in DataLoader(dataset, batch_size=1, collate_fn=collate_fn):
            word_idxs, pos_idxs = word_idxs.to(device), pos_idxs.to(device)
            head_outputs, label_outputs = model(word_idxs, pos_idxs)
            _, predicted_heads = torch.max(head_outputs, dim=2)
            correct_heads += (predicted_heads == head_idxs.to(device)).sum().item()
            total_tokens += head_idxs.numel()
    uas = correct_heads / total_tokens
    return uas


In [24]:
# from sklearn.metrics import precision_score, recall_score, f1_score
# from sklearn.metrics import accuracy_score

# def evaluate(model, dataset, device=torch.device('cpu')):
#     model.eval()  # Set the model to evaluation mode
#     model.to(device)

#     all_true_heads = []
#     all_pred_heads = []
#     all_true_labels = []
#     all_pred_labels = []

#     with torch.no_grad():
#         for word_idxs, pos_idxs, head_idxs, label_idxs in DataLoader(dataset, batch_size=1, collate_fn=collate_fn):
#             word_idxs, pos_idxs, head_idxs, label_idxs = word_idxs.to(device), pos_idxs.to(device), head_idxs.to(device), label_idxs.to(device)
#             outputs = model(word_idxs, pos_idxs)
#             _, predicted_heads = torch.max(outputs, dim=2)  # Assuming outputs are logits for each head position
#             predicted_labels = predicted_heads  # Adjust if your model also predicts labels differently

#             # Flatten the tensors for metric calculation
#             all_true_heads.extend(head_idxs.view(-1).cpu().numpy())
#             all_pred_heads.extend(predicted_heads.view(-1).cpu().numpy())
#             all_true_labels.extend(label_idxs.view(-1).cpu().numpy())
#             all_pred_labels.extend(predicted_labels.view(-1).cpu().numpy())

#     uas = accuracy_score(all_true_heads, all_pred_heads)
#     las = accuracy_score(all_true_labels, all_pred_labels)
#     precision = precision_score(all_true_labels, all_pred_labels, average='micro')
#     recall = recall_score(all_true_labels, all_pred_labels, average='micro')
#     f1 = f1_score(all_true_labels, all_pred_labels, average='micro')

#     return uas, las, precision, recall, f1



In [45]:
# from sklearn.metrics import accuracy_score

# def evaluate(model, dataset, device=torch.device('cpu')):
#     model.eval()
#     all_true_heads = []
#     all_pred_heads = []

#     with torch.no_grad():
#         for word_idxs, pos_idxs, head_idxs, label_idxs in DataLoader(dataset, batch_size=1, collate_fn=collate_fn):
#             outputs = model(word_idxs.to(device), pos_idxs.to(device))
#             _, predicted_heads = torch.max(outputs, dim=2)
#             all_true_heads.extend(head_idxs.view(-1).cpu().numpy())
#             all_pred_heads.extend(predicted_heads.view(-1).cpu().numpy())

#     uas = accuracy_score(all_true_heads, all_pred_heads)  # Simple accuracy for heads
#     return uas


In [25]:
test_data = load_data('test.gold.conll')
test_dataset = DependencyParsingDataset(test_data, word2idx, pos2idx, label2idx)

uas, las, precision, recall, f1 = evaluate(model, test_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")

dev_dataset = DependencyParsingDataset(dev_data, word2idx, pos2idx, label2idx)
uas, las, precision, recall, f1 = evaluate(model, dev_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")

# uas = evaluate(model,test_dataset)
# print(f"UAS: {uas*100:.2f}%")



UAS: 2.37%
LAS: 94.63%
Precision: 94.63%
Recall: 94.63%
F1 Score: 94.63%
UAS: 2.46%
LAS: 94.38%
Precision: 94.38%
Recall: 94.38%
F1 Score: 94.38%
