<a href="https://colab.research.google.com/github/vishwaj1/NLP/blob/main/GA_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [2]:
import torch
from torch.utils.data import Dataset
from conllu import parse_incr

def load_data(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for tokenlist in parse_incr(file):
            words = [token['form'] for token in tokenlist]
            pos_tags = [token['upostag'] for token in tokenlist]
            heads = [token['head'] for token in tokenlist]
            labels = [token['deprel'] for token in tokenlist]
            sentences.append((words, pos_tags, heads, labels))
    return sentences



In [3]:
def build_vocab(data):
    words = set()
    pos_tags = set()
    labels = set()
    for sentence in data:
        words.update(sentence[0])
        pos_tags.update(sentence[1])
        labels.update(sentence[3])
    word2idx = {word: i + 1 for i, word in enumerate(words)}  # +1 to start index from 1
    word2idx['<UNK>'] = 0  # Unknown words
    pos2idx = {tag: i + 1 for i, tag in enumerate(pos_tags)}  # +1 to start index from 1
    pos2idx['<UNK>'] = 0  # Unknown POS tags
    label2idx = {label: i for i, label in enumerate(labels)}
    return word2idx, pos2idx, label2idx

# Load data
train_data = load_data('train.gold.conll')

# Build vocabularies
word2idx, pos2idx, label2idx = build_vocab(train_data)


In [4]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class DependencyParsingDataset(Dataset):
    def __init__(self, data, word2idx, pos2idx, label2idx):
        self.data = data
        self.word2idx = word2idx
        self.pos2idx = pos2idx
        self.label2idx = label2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, pos_tags, heads, labels = self.data[idx]
        word_idxs = [self.word2idx.get(word.lower(), self.word2idx['<UNK>']) for word in words]
        pos_idxs = [self.pos2idx.get(pos, self.pos2idx['<UNK>']) for pos in pos_tags]
        head_idxs = [int(head) for head in heads]  # Ensure head indices are integers
        label_idxs = [self.label2idx[label] for label in labels]
        return torch.tensor(word_idxs, dtype=torch.long), \
               torch.tensor(pos_idxs, dtype=torch.long), \
               torch.tensor(head_idxs, dtype=torch.long), \
               torch.tensor(label_idxs, dtype=torch.long)

def collate_fn(batch):
    word_idxs, pos_idxs, head_idxs, label_idxs = zip(*batch)
    word_idxs = pad_sequence(word_idxs, batch_first=True, padding_value=0)
    pos_idxs = pad_sequence(pos_idxs, batch_first=True, padding_value=0)
    head_idxs = pad_sequence(head_idxs, batch_first=True, padding_value=0)
    label_idxs = pad_sequence(label_idxs, batch_first=True, padding_value=0)
    return word_idxs, pos_idxs, head_idxs, label_idxs


In [5]:
import torch
import torch.nn as nn



class DependencyParser(nn.Module):
    def __init__(self, vocab_size, pos_size, num_labels, embedding_dim, hidden_dim):
        super(DependencyParser, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embeddings = nn.Embedding(pos_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, word_idxs, pos_idxs):
        # Ensure that word_idxs and pos_idxs are of the shape [batch_size, sequence_length]
        word_embeds = self.word_embeddings(word_idxs)  # [batch_size, seq_length, embedding_dim]
        pos_embeds = self.pos_embeddings(pos_idxs)     # [batch_size, seq_length, embedding_dim]

        # Concatenation should work as both have three dimensions
        embeds = torch.cat((word_embeds, pos_embeds), dim=2)  # Concatenate along the last dimension

        lstm_out, _ = self.lstm(embeds)  # LSTM expects input of shape [batch_size, seq_length, features]
        label_space = self.hidden2label(lstm_out)
        return label_space



In [6]:
from torch.utils.data import DataLoader

def train_model(model, dataset, epochs, learning_rate):
    data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)  # Assuming '0' is used for padding

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for word_idxs, pos_idxs, head_idxs, label_idxs in data_loader:
            optimizer.zero_grad()
            outputs = model(word_idxs, pos_idxs)
            loss = loss_function(outputs.view(-1, outputs.shape[-1]), label_idxs.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")



dataset = DependencyParsingDataset(train_data, word2idx, pos2idx, label2idx)
model = DependencyParser(len(word2idx), len(pos2idx), len(label2idx), 100, 200)
train_model(model, dataset, 5, 0.01)


Epoch 0, Loss: 0.25191973025899334
Epoch 1, Loss: 0.18227184348436723
Epoch 2, Loss: 0.16593886163220348
Epoch 3, Loss: 0.1564775746809431
Epoch 4, Loss: 0.14959591199236222


In [7]:
pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

def evaluate(model, dataset, device=torch.device('cpu')):
    model.eval()  # Set the model to evaluation mode
    model.to(device)

    all_true_heads = []
    all_pred_heads = []
    all_true_labels = []
    all_pred_labels = []

    with torch.no_grad():
        for word_idxs, pos_idxs, head_idxs, label_idxs in DataLoader(dataset, batch_size=1, collate_fn=collate_fn):
            word_idxs, pos_idxs, head_idxs, label_idxs = word_idxs.to(device), pos_idxs.to(device), head_idxs.to(device), label_idxs.to(device)
            outputs = model(word_idxs, pos_idxs)
            _, predicted_heads = torch.max(outputs, dim=2)
            predicted_labels = predicted_heads
            # Flatten the tensors for metric calculation
            all_true_heads.extend(head_idxs.view(-1).cpu().numpy())
            all_pred_heads.extend(predicted_heads.view(-1).cpu().numpy())
            all_true_labels.extend(label_idxs.view(-1).cpu().numpy())
            all_pred_labels.extend(predicted_labels.view(-1).cpu().numpy())

    uas = accuracy_score(all_true_heads, all_pred_heads)
    las = accuracy_score(all_true_labels, all_pred_labels)


    return uas, las



In [15]:
test_data = load_data('test.gold.conll')
test_dataset = DependencyParsingDataset(test_data, word2idx, pos2idx, label2idx)

print("Results for test data")
uas, las = evaluate(model, test_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")


print("\nResults for dev data")
dev_data = load_data('dev.gold.conll')
dev_dataset = DependencyParsingDataset(dev_data, word2idx, pos2idx, label2idx)
uas, las = evaluate(model, dev_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")





Results for test data
UAS: 2.54%
LAS: 2.13%

Results for dev data
UAS: 2.54%
LAS: 2.08%


# German Dataset

In [16]:
def load_data_g(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for tokenlist in parse_incr(file):
            if any(token['head'] is None for token in tokenlist):  # Check if any head is None
                continue  # Skip this sentence or handle it differently
            words = [token['form'] for token in tokenlist]
            pos_tags = [token['upostag'] for token in tokenlist]
            heads = [token['head'] for token in tokenlist if token['head'] is not None]
            labels = [token['deprel'] for token in tokenlist]
            sentences.append((words, pos_tags, heads, labels))
    return sentences


In [17]:
def build_vocab_g(data):
    words = set()
    pos_tags = set()
    labels = set()
    for sentence in data:
        words.update(sentence[0])
        pos_tags.update(sentence[1])
        labels.update(sentence[3])
    word2idx = {word: i + 1 for i, word in enumerate(words)}  # +1 to start index from 1
    word2idx['<UNK>'] = 0  # Unknown words
    pos2idx = {tag: i + 1 for i, tag in enumerate(pos_tags)}  # +1 to start index from 1
    pos2idx['<UNK>'] = 0  # Unknown POS tags
    label2idx = {label: i for i, label in enumerate(labels)}
    return word2idx, pos2idx, label2idx

# Load data
#dev_data = load_data('dev.gold.conll')
train_data = load_data_g('de_gsd-ud-train.conllu')

# Build vocabularies
word2idx, pos2idx, label2idx = build_vocab_g(train_data)


In [18]:
class DependencyParsingDataset_g(Dataset):
    def __init__(self, data, word2idx, pos2idx, label2idx):
        self.data = data
        self.word2idx = word2idx
        self.pos2idx = pos2idx
        self.label2idx = label2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, pos_tags, heads, labels = self.data[idx]
        word_idxs = [self.word2idx.get(word.lower(), self.word2idx['<UNK>']) for word in words]
        pos_idxs = [self.pos2idx.get(pos, self.pos2idx['<UNK>']) for pos in pos_tags]
        head_idxs = [int(head) if head is not None else -1 for head in heads]  # Replace None with -1
        label_idxs = [self.label2idx[label] for label in labels]
        return torch.tensor(word_idxs, dtype=torch.long), \
               torch.tensor(pos_idxs, dtype=torch.long), \
               torch.tensor(head_idxs, dtype=torch.long), \
               torch.tensor(label_idxs, dtype=torch.long)


In [19]:
def train_model_g(model, dataset, epochs, learning_rate):
    data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss(ignore_index=-1)  # Assuming -1 is used for missing values

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for word_idxs, pos_idxs, head_idxs, label_idxs in data_loader:
            optimizer.zero_grad()
            outputs = model(word_idxs, pos_idxs)
            loss = loss_function(outputs.view(-1, outputs.shape[-1]), label_idxs.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")

dataset = DependencyParsingDataset_g(train_data, word2idx, pos2idx, label2idx)
model = DependencyParser(len(word2idx), len(pos2idx), len(label2idx), 100, 200)
train_model_g(model, dataset, 5, 0.01)

Epoch 0, Loss: 0.2211680657333798
Epoch 1, Loss: 0.1293135745066499
Epoch 2, Loss: 0.10583970916886178
Epoch 3, Loss: 0.09089360660503781
Epoch 4, Loss: 0.07876359896054344


In [23]:
test_data = load_data_g('de_gsd-ud-test.conllu')
test_dataset = DependencyParsingDataset(test_data, word2idx, pos2idx, label2idx)

print("Results for test data")
uas, las = evaluate(model, test_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")

print("\nResults for dev data")
dev_data = load_data_g('de_gsd-ud-dev.conllu')
dev_dataset = DependencyParsingDataset(dev_data, word2idx, pos2idx, label2idx)
uas, las = evaluate(model, dev_dataset)
print(f"UAS: {uas*100:.2f}%")
print(f"LAS: {las*100:.2f}%")


Results for test data
UAS: 2.57%
LAS: 2.11%

Results for dev data
UAS: 2.28%
LAS: 1.97%
