In [6]:
import torch
from bilstm_crf import BiLSTMCRF
from torch.optim import SGD
from utils import *

torch.manual_seed(1)

ModuleNotFoundError: No module named 'utils'

### **CONSTANTS**

In [150]:
# constants
# Marker tag strings. They are not referenced in the visible cells of this
# notebook -- presumably consumed by the CRF layer; TODO confirm.
START_TAG, STOP_TAG = "<START>", "<STOP>"

# Hyperparameters handed to the BiLSTMCRF constructor when the model is built.
NUMBER_OF_INTENTS = 1
HIDDEN_DIM = 64
# (sic) the misspelling is kept: the model-construction cell uses this exact name.
INTENT_EMBEDIING_DIM = 50
WORD_EMBEDDING_DIM = 50

### **PREPARE DATA**

In [111]:
# read data
def _parse_list_field(field):
    """Turn a textual list such as "['a', 'b']" into ['a', 'b'].

    The first and last characters (the brackets) are dropped, the remainder is
    split on commas, and each item is stripped of surrounding whitespace and of
    every single-quote character.
    """
    return [item.strip().replace('\'', '') for item in field[1:-1].split(',')]


def read_data(file):
    """Read a tab-separated dataset of (words, tags) examples.

    Each non-blank line must hold two tab-separated fields, both written as
    bracketed, comma-separated lists of single-quoted strings, e.g.::

        ['int', 'x']<TAB>['B-type', 'B-var']

    Any further tab-separated fields on a line are ignored.

    Args:
        file: Path of the dataset file to read.

    Returns:
        A list of ``(words, tags)`` tuples, one per non-blank line, where both
        elements are lists of strings.

    Raises:
        IndexError: If a non-blank line has no tab-separated second field.

    Note:
        Parsing is purely textual: an item that itself contains a comma is
        split in two, and apostrophes inside an item are removed.
    """
    training_data = []
    with open(file, 'r') as f:
        for line in f:
            # Strip only the line terminator. Blindly dropping the last
            # character would eat the closing bracket of a final line that has
            # no trailing newline and corrupt its last tag.
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no example.
                continue
            fields = line.split('\t')
            training_data.append(
                (_parse_list_field(fields[0]), _parse_list_field(fields[1]))
            )

    return training_data

# Load the dataset file into a list of (words, tags) pairs.
training_data = read_data('dataset.txt')

In [122]:
# convert data to indices
# Each distinct word / tag receives the next free integer id, in order of
# first appearance in the training data.
intent_to_index = {"variable declaration" : 0}
word_to_index, tag_to_index = {}, {}
for sentence, tags in training_data:
    for token in sentence:
        # setdefault leaves an existing id untouched and only inserts new keys.
        word_to_index.setdefault(token, len(word_to_index))
    for label in tags:
        tag_to_index.setdefault(label, len(tag_to_index))

In [117]:
# Sizes of the two index maps; both are later passed to the model constructor.
vocab_size, number_of_tags = len(word_to_index), len(tag_to_index)

print("Vocabulary Size: ", vocab_size)
print("Number of Tags: ", number_of_tags)

Vocabulary Size:  304
Number of Tags:  6


In [118]:
# Assertions
# Every sentence must carry exactly one tag per word. All offending examples
# are reported (1-based position in training_data) before the check fails, so
# the dataset can be repaired in a single pass.
mismatched_examples = [
    (i, len(sentence), len(tags))
    for i, (sentence, tags) in enumerate(training_data, start=1)
    if len(sentence) != len(tags)
]
for i, sentence_length, tags_length in mismatched_examples:
    print(f"Example {i}: Sentence Length: {sentence_length}, Tags Length: {tags_length}")

assert not mismatched_examples, (
    f"{len(mismatched_examples)} example(s) have a different number of words and tags"
)

### **MODEL DEFINITION**

In [180]:
# Build the tagger from the vocabulary / tag counts and the hyperparameter
# constants defined earlier in the notebook.
model = BiLSTMCRF(
    vocab_size,
    word_embedding_dim=WORD_EMBEDDING_DIM,
    intent_embedding_dim=INTENT_EMBEDIING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=number_of_tags,
    n_intents=NUMBER_OF_INTENTS,
)
# Use the `SGD` name imported from torch.optim in the first cell. The module
# name `optim` is never imported in this notebook, so `optim.SGD` raises a
# NameError on a fresh kernel.
optimizer = SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [181]:
# Check predictions before training
# Sanity check on one training example: print its gold tag indices, then the
# model's output for the same sentence.
sample_words, sample_tags = training_data[50]
precheck_tag = prepare_sequence(sample_tags, tag_to_index)
print(precheck_tag)

with torch.no_grad():  # inference only, so gradient tracking is switched off
    precheck_sent = prepare_sequence(sample_words, word_to_index)
    # One intent label per word; the notebook defines a single intent.
    intent = ['variable declaration'] * len(sample_words)
    precheck_intent = prepare_sequence(intent, intent_to_index)
    print(precheck_intent.shape)
    print(model(precheck_sent, precheck_intent))

tensor([2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 3])
torch.Size([14])
Word Embedding Shape: torch.Size([14, 1, 50])
Intent Embedding Shape: torch.Size([14, 1, 50])
Embedding Shape: torch.Size([14, 1, 100])
LSTM Shape: torch.Size([14, 1, 64])
torch.Size([14, 1, 6])
[[2, 0, 0, 0, 4, 2, 0, 3, 2, 2, 2, 0, 2, 2]]


In [182]:
# training loop
for epoch in range(10):
    for sentence, tags in training_data:
        # Gradients accumulate across backward() calls in PyTorch, so they are
        # reset before every training example.
        model.zero_grad()

        # Encode the example: word indices, gold tag indices, and one intent
        # index per word (every word carries the single known intent).
        sentence_in = prepare_sequence(sentence, word_to_index)
        labels = torch.tensor([tag_to_index[t] for t in tags], dtype=torch.long)
        intent = ['variable declaration'] * len(sentence)
        train_intent = prepare_sequence(intent, intent_to_index)

        # Loss for this example, as returned by the model's
        # neg_log_likelihood method.
        loss = model.neg_log_likelihood(sentence_in, labels, train_intent)

        # Backpropagate, then apply one optimizer update.
        loss.backward()
        optimizer.step()

Word Embedding Shape: torch.Size([5, 1, 50])
Intent Embedding Shape: torch.Size([5, 1, 50])
Embedding Shape: torch.Size([5, 1, 100])
LSTM Shape: torch.Size([5, 1, 64])
torch.Size([5, 1, 6])
Word Embedding Shape: torch.Size([4, 1, 50])
Intent Embedding Shape: torch.Size([4, 1, 50])
Embedding Shape: torch.Size([4, 1, 100])
LSTM Shape: torch.Size([4, 1, 64])
torch.Size([4, 1, 6])
Word Embedding Shape: torch.Size([5, 1, 50])
Intent Embedding Shape: torch.Size([5, 1, 50])
Embedding Shape: torch.Size([5, 1, 100])
LSTM Shape: torch.Size([5, 1, 64])
torch.Size([5, 1, 6])
Word Embedding Shape: torch.Size([7, 1, 50])
Intent Embedding Shape: torch.Size([7, 1, 50])
Embedding Shape: torch.Size([7, 1, 100])
LSTM Shape: torch.Size([7, 1, 64])
torch.Size([7, 1, 6])
Word Embedding Shape: torch.Size([4, 1, 50])
Intent Embedding Shape: torch.Size([4, 1, 50])
Embedding Shape: torch.Size([4, 1, 100])
LSTM Shape: torch.Size([4, 1, 64])
torch.Size([4, 1, 6])
Word Embedding Shape: torch.Size([4, 1, 50])
Inten

In [183]:
# Check predictions after training
# Same sanity check as before training: print the gold tag indices of one
# training example, then the model's output for that sentence.
sample_words, sample_tags = training_data[50]
precheck_tag = prepare_sequence(sample_tags, tag_to_index)
print(precheck_tag)

with torch.no_grad():  # inference only, so gradient tracking is switched off
    precheck_sent = prepare_sequence(sample_words, word_to_index)
    # One intent label per word; the notebook defines a single intent.
    intent = ['variable declaration'] * len(sample_words)
    precheck_intent = prepare_sequence(intent, intent_to_index)
    print(precheck_intent.shape)
    print(model(precheck_sent, precheck_intent))

tensor([2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 3])
torch.Size([14])
Word Embedding Shape: torch.Size([14, 1, 50])
Intent Embedding Shape: torch.Size([14, 1, 50])
Embedding Shape: torch.Size([14, 1, 100])
LSTM Shape: torch.Size([14, 1, 64])
torch.Size([14, 1, 6])
[[2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 3]]


: 