### **NER Custom Model**
#### Bi-LSTM with CRF

In [32]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch.optim as optim
from TorchCRF import CRF
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

In [86]:
def read_dataset():
    data = pd.read_csv('./ner_dataset/ner_dataset.csv', encoding='latin1')

    # remove white spaces from column names
    data.columns = data.columns.str.strip()

    print(data.columns)
    # print(data.columns)
    # Group by 'Sentence #' and aggregate
    grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ' '.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x),       # Collect tags into a list
        'Intent': lambda x: x     # Collect intents into a list
    }).reset_index()  # Reset index to make 'Sentence #' a regular column

    # Display the grouped and aggregated data
    # print(grouped_data.columns)
    return data, grouped_data

def prepare_data(dataframe):
    dataset = []
    for index, row in dataframe.iterrows():
        sentence = row['Word']
        tags = row['Tag']
        intents = row['Intent'][0]
        dataset.append((sentence, tags, intents))

    return dataset

data, grouped_data = read_dataset()

prepared_dataset = prepare_data(grouped_data)

# get maximum length of sentence
max_sentence_length = max([len(sentence.split()) for sentence, _, _ in prepared_dataset])

print(max_sentence_length)

print(prepared_dataset[0])

Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')
20
(' is  approved  equals  clustering  algorithms', [' B-VAR', ' I-VAR', ' O', ' B-VAL', ' I-VAL'], ' variable_declaration')


In [89]:
train_data, val_data = train_test_split(prepared_dataset, test_size=0.1, random_state=42)

In [88]:
class NERDataset(Dataset):
    def __init__(self, data, word_to_ix, tag_to_ix, intent_to_ix):
        self.data = data
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.intent_to_ix = intent_to_ix

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, tags, intent = self.data[idx]
        words = sentence.split()
        word_indices = [self.word_to_ix[w] for w in words]
        tag_indices = [self.tag_to_ix[t] for t in tags]
        intent_index = self.intent_to_ix[intent]
        return torch.tensor(word_indices, dtype=torch.long), torch.tensor(tag_indices, dtype=torch.long), torch.tensor(intent_index, dtype=torch.long)

print(len(data['Word'].values))

# Create vocabulary and tag dictionaries
words = set(data['Word'].values)
tags = set(data['Tag'].values)
intents = set(data['Intent'].values)

# print(words)

word_to_ix = {word.strip(): i for i, word in enumerate(words)}
tag_to_ix = {tag: i for i, tag in enumerate(tags)}
intent_to_ix = {intent: i for i, intent in enumerate(intents)}

# Add a special token for OOV words and padding
word_to_ix['<OOV>'] = len(word_to_ix)
word_to_ix['<PAD>'] = len(word_to_ix)
tag_to_ix['<PAD>'] = len(tag_to_ix)

word_vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)
intent_vocab_size = len(intent_to_ix)

print(f'Vocabulary size: {word_vocab_size}, Tag set size: {tagset_size}, Intent set size: {intent_vocab_size}')

# print(word_to_ix['with'])

# print(word_to_ix)
# print(tag_to_ix)

# print(len(word_to_ix))
# print(len(tag_to_ix))
# print(len(intent_to_ix))

# Create datasets and dataloaders
train_dataset = NERDataset(train_data, word_to_ix, tag_to_ix, intent_to_ix)
val_dataset = NERDataset(val_data, word_to_ix, tag_to_ix, intent_to_ix)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

1655
Vocabulary size: 305, Tag set size: 7, Intent set size: 1


In [90]:
def prepare_data(data):
    sentences, tags, intents = zip(*data)
    max_len = max(len(s.split()) for s in sentences)
    sentence_tensors = []
    tag_tensors = []
    for sentence, tag in zip(sentences, tags):
        words = sentence.split()
        word_indices = [word_to_ix.get(w, word_to_ix['<OOV>']) for w in words]
        word_indices += [word_to_ix['<PAD>']] * (max_len - len(word_indices))
        tag_indices = [tag_to_ix[t] for t in tag]
        tag_indices += [tag_to_ix['<PAD>']] * (max_len - len(tag_indices))
        sentence_tensors.append(torch.tensor(word_indices, dtype=torch.long))
        tag_tensors.append(torch.tensor(tag_indices, dtype=torch.long))
    intent_indices = [intent_to_ix[intent] for intent in intents]
    sentence_tensors = torch.stack(sentence_tensors)
    tag_tensors = torch.stack(tag_tensors)
    intent_indices = torch.tensor(intent_indices, dtype=torch.long)
    return sentence_tensors, tag_tensors, intent_indices

train_sentences, train_tags, train_intents = prepare_data(train_data)
val_sentences, val_tags, val_intents = prepare_data(val_data)

In [76]:
# Parameters
word_vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)
intent_vocab_size = len(intent_to_ix)

word_embedding_dim = 100
intent_embedding_dim = 50

hidden_dim = 128
lstm_input_dim = word_embedding_dim + intent_embedding_dim
# word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
# intent_embedding = nn.Embedding(intent_vocab_size, intent_embedding_dim)

In [None]:
# LSTM Layer

# lstm = nn.LSTM(lstm_input_dim, hidden_dim, bidirectional=True, batch_first=True)

In [None]:
# Hidden (Linear) layer and CRF
# hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

# crf = CRF(tagset_size, batch_first=True)

In [77]:
class NERModel(nn.Module):
    def __init__(self, word_vocab_size, word_embedding_dim, intent_vocab_size, intent_embedding_dim, lstm_input_dim, hidden_dim, tagset_size):
        super(NERModel, self).__init__()
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
        self.intent_embedding = nn.Embedding(intent_vocab_size, intent_embedding_dim)
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size)

    def forward(self, sentence, intent):
        word_embeds = self.word_embedding(sentence)
        intent_embed = self.intent_embedding(intent).unsqueeze(1).repeat(1, word_embeds.size(1), 1)
        lstm_input = torch.cat((word_embeds, intent_embed), dim=2)
        lstm_out, _ = self.lstm(lstm_input)
        emissions = self.hidden2tag(lstm_out)
        emissions = emissions.transpose(0, 1)
        return emissions

    def loss(self, emissions, tags, mask):
        tags = tags.transpose(0, 1)
        mask = mask.transpose(0, 1)
        return -self.crf(emissions, tags, mask=mask, reduction='mean')

    def decode(self, emissions, mask):
        emissions = emissions.transpose(0, 1)
        mask = mask.transpose(0, 1)
        return self.crf.decode(emissions, mask=mask)

model = NERModel(word_vocab_size, word_embedding_dim, intent_vocab_size, intent_embedding_dim, lstm_input_dim, hidden_dim, tagset_size)

In [91]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)

def mask_padding(tensor, padding_idx):
    return tensor != padding_idx

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    emissions = model(train_sentences, train_intents)
    mask = mask_padding(train_sentences, word_to_ix['<PAD>'])
    loss = model.loss(emissions, train_tags, mask)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    
    model.eval()
    with torch.no_grad():
        emissions = model(val_sentences, val_intents)
        mask = mask_padding(val_sentences, word_to_ix['<PAD>'])
        predictions = model.decode(emissions, mask)
        # Calculate evaluation metrics here using `predictions` and `val_tags`

# The model is now trained, and you can perform evaluation as needed

IndexError: index out of range in self

In [None]:
# Decoding example
model.eval()
with torch.no_grad():
    for batch in batchify(val_data, batch_size):
        sentences, tags, intents = prepare_batch(batch)
        emissions = model(sentences, intents)
        mask = sentences != 0
        predictions = model.decode(emissions, mask)
        # Here, you can compare predictions with the true tags