In [552]:
from torch import optim
from utils import *
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
from TorchCRF import CRF
from typing import List, Tuple, Optional
import math


# import evaluation metrics 
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

torch.manual_seed(1)

<torch._C.Generator at 0x15d454dda50>

In [553]:
if torch.cuda.is_available():
    device = torch.device('cuda')

device

device(type='cuda')

In [554]:
INF = -100

In [555]:
class ConstrainedCRF(CRF):
    def __init__(self, num_tags):
        super(ConstrainedCRF, self).__init__(num_tags)

    def decode(self, emissions: torch.Tensor, constraints: List[Tuple[torch.IntTensor, torch.IntTensor]], mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.

        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask, constraints)

    def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor, constraints: List[Tuple[torch.IntTensor, torch.IntTensor]]) -> List[List[int]]:
            print("Viterbi Decode")
            '''
                override the viterbi decode function to include the constraints
            '''
            assert emissions.dim() == 3 and mask.dim() == 2 # (sequence_length, batch_size, num_tags)
            assert emissions.shape[:2] == mask.shape # (sequence_length, batch_size)
            assert emissions.size(2) == self.num_tags
            assert mask[0].all()

            sequence_length, batch_size = mask.shape

            constrained_transitions = self.transitions.clone()
            for constraint in constraints:
                constrained_transitions[constraint[0], constraint[1]] = INF


            # start transition and first emission
            # tensor of size (batch_size, num_tags)
            score = self.start_transitions + emissions[0] 

            backpointers = []

            for i in range(1, sequence_length):
                # Broadcast viterbi score for every possible next tag
                # shape: (batch_size, num_tags, 1)
                broadcast_score = score.unsqueeze(2)
                
                    
                # Broadcast emission score for every possible current tag
                # shape: (batch_size, 1, num_tags)
                broadcast_emission = emissions[i].unsqueeze(1)

                # Compute the score tensor of size (batch_size, num_tags, num_tags) where
                # for each sample, entry at row i and column j stores the score of the best
                # tag sequence so far that ends with transitioning from tag i to tag j and emitting
                # shape: (batch_size, num_tags, num_tags)
                # next_score = broadcast_score + self.transitions + broadcast_emission
                next_score = broadcast_score + constrained_transitions + broadcast_emission

                # print("Next Score Shape:", next_score[0].shape)

                # Apply the constraints
                # for constraint in constraints:
                #     # print("Constraint:", constraint)
                #     next_score[0][constraint[0], constraint[1]] = INF

                
                # Find the maximum score over all possible current tag
                # shape: (batch_size, num_tags)
                next_score, indices = next_score.max(dim=1)

                # Set score to the next score if this timestep is valid (mask == 1)
                # and save the index that produces the next score
                # shape: (batch_size, num_tags)
                score = torch.where(mask[i].unsqueeze(1), next_score, score)
                backpointers.append(indices)
            
            
            # End transition score
            # shape: (batch_size, num_tags)
            score += self.end_transitions


            # Now, compute the best path for each sample

            # shape: (batch_size,)
            seq_ends = mask.long().sum(dim=0) - 1
            best_tags_list = []

            for idx in range(batch_size):
                # Find the tag which maximizes the score at the last timestep; this is our best tag
                # for the last timestep
                _, best_last_tag = score[idx].max(dim=0)
                best_tags = [best_last_tag.item()]

                # We trace back where the best last tag comes from, append that to our best tag
                # sequence, and trace it back again, and so on
                for hist in reversed(backpointers[:seq_ends[idx]]):
                    best_last_tag = hist[idx][best_tags[-1]]
                    best_tags.append(best_last_tag.item())

                # Reverse the order because we start from the last timestep
                best_tags.reverse()
                best_tags_list.append(best_tags)

            return best_tags_list



class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, word_embedding_dim, intent_embedding_dim, hidden_dim, output_dim, number_of_intents, index_to_tag):
        
        super(BiLSTMCRF, self).__init__()
        # hyperparameters
        self.word_embedding_dim = word_embedding_dim
        self.inten_embedding_dim = intent_embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.index_to_tag = index_to_tag

        # model layers
        self.word_embedding = nn.Embedding(vocab_size, word_embedding_dim)
        self.intent_embedding = nn.Embedding(number_of_intents, intent_embedding_dim)
        self.lstm = nn.LSTM(word_embedding_dim + intent_embedding_dim, hidden_dim // 2, bidirectional=True, dropout=0.2)
        self.hidden_to_tag = nn.Linear(hidden_dim, output_dim)
        self.crf = ConstrainedCRF(output_dim)

    def __create_constraints(self, mask, intent):
        constraints = []
        one_indices = torch.where(mask == 1)[0]
        zero_indices = torch.where(mask == 0)[0]
        print(one_indices)
        print(zero_indices)
        for i in one_indices:
            for j in zero_indices:
                constraints.append((i, j))
                constraints.append((j, i))
            
            if self.index_to_tag[i.item()] == 'O':
                for j in one_indices:
                    if self.index_to_tag[j.item()][0] == 'I':
                        constraints.append((i, j))
        
        for i in zero_indices:
            for j in zero_indices:
                constraints.append((i, j))

            
        return constraints

    def __init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2), torch.randn(2, 1, self.hidden_dim // 2)) # initialize hidden state

    def __get_lstm_features(self, sentence, intent):
        self.hidden = self.__init_hidden()
        word_embeddings = self.word_embedding(sentence).view(len(sentence), 1, -1)
        # print("Word Embedding Shape:", word_embeddings.shape) # (len(sentence), batch_size, embedding_dimension)
        intent_embeddings = self.intent_embedding(intent).view(1, 1, -1).repeat(len(sentence), 1, 1)
        # print("Intent Embedding Shape:", intent_embeddings.shape) # (len(sentence), batch_size, embedding_dimension)
        embeddings = torch.cat((word_embeddings, intent_embeddings), dim=2) 
        # print("Embedding Shape:", embeddings.shape) # (len(sentence), batch_size, embedding_dimension * 2)
        lstm_out, self.hidden = self.lstm(embeddings, self.hidden)
        # print("LSTM Out Shape:", lstm_out.shape) # (len(sentence), batch_size, hidden_dimension)
        lstm_features = self.hidden_to_tag(lstm_out)
        # print("LSTM Features Shape:", lstm_features.shape) # (len(sentence), batch_size, output_dimension (number of target entities)) -> number of entities 3andena 30 
        return lstm_features

    def neg_log_likelihood(self, sentence, tags, intent, mask):
        # print("Intent Shape:", intent.shape)
        emissions = self.__get_lstm_features(sentence, intent) # (sentence_length, batch_size, number of tags) # emissions where each word in the sequence coressponds to probability for tag
        # print("NEG LIKE FEATURES:", features.shape)
        # print(features)
        # mask = mask.view(1, -1)
        # print("Mask Shape:", mask.shape)
        # repeated_mask = mask.repeat(features.shape[0], 1, 1).contiguous()
        # print(repeated_mask.shape)
        # print(repeated_mask)
        # features = features * mask
        # print("Intent:", intent[0])
        # print("Mask:", mask)
        # print("Features:", features)
        # print("Features Shape:", features.shape)
        # mask = mask.view(-1, 1)
        # print("Emissions[0] before:", emissions[0])
        # print("Mask:", mask)
        intent_mask = torch.where(mask == 0, torch.tensor([INF]), torch.tensor([0.0]))
        # print("Log Mask:", intent_mask)
        # intent_mask = torch.tensor([1 if i not in torch.where(mask == 0)[0] else INF for i, mask in enumerate(mask)], dtype=torch.long)
        # intent_mask = intent_mask.view(1, 1, -1)
        # print("Intent Mask:", intent_mask)
        emissions = emissions + intent_mask
        # print("Mask Shape:", intent_mask.shape)
        # print("Emissions Shape", emissions.shape)
        # print("Emissions[0] after:", emissions)
        tags = tags.view(-1, 1)
        loss = -self.crf(emissions, tags)
        # print("Loss:", loss)
        return loss

    def forward(self, sentence, intent, mask):
        emissions = self.__get_lstm_features(sentence, intent)
        # print(lstm_features)
        # print("LSTM Features", lstm_features)
        # print("Forward LSTM features:", lstm_features.shape)
        # repeated_mask = mask.repeat(lstm_features.shape[0], 1, 1).contiguous()
        # test_mask = torch.ones(len(sentence), 1, self.output_dim)
        # print("Repeated Mask Shape:", repeated_mask.shape)
        # print("Mask Shape:", mask.shape)
        # lstm_features = lstm_features * mask
        # print("Forward LSTM features:", lstm_features.shape)
        # print(self.crf.transitions)
        print(mask)
        intent_mask = torch.where(mask == 0, torch.tensor([INF]), torch.tensor([0.0]))
        emissions = emissions + intent_mask
        constraints = self.__create_constraints(mask, intent)
        tag_sequence = self.crf.decode(emissions, constraints)
        # print("Tag Sequence:", tag_sequence)
        return tag_sequence
    

### **HELPER FUNCTIONS**

In [556]:
# constants
WORD_EMBEDDING_DIM = 50
INTENT_EMBEDIING_DIM = 50
HIDDEN_DIM = 64

In [557]:
def read_dataset():
    data = pd.read_csv('../ner_dataset/ner_dataset.csv', encoding='latin1')

    # remove white spaces from column names
    data.columns = data.columns.str.strip()

    print(data.columns)
    # Group by 'Sentence #' and aggregate
    grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ''.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x.str.strip()),       # Collect tags into a list
        'Intent': lambda x: list(x.str.strip().str.replace('_', ' '))     # Collect intents into a list
    }).reset_index()  # Reset index to make 'Sentence #' a regular column

    return data, grouped_data


def prepare_data(dataframe):
    dataset = []
    for _, row in dataframe.iterrows():
        sentence = row['Word'][1:]
        tags = row['Tag']
        intents = row['Intent']
        dataset.append((sentence, tags, intents[0]))

    return dataset

In [558]:
# Format data
data, goruped_data = read_dataset()
print(data["Intent"].unique())

goruped_data.head()

Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')
[' array operation' ' assertion' ' assignment operation'
 ' bitwise operation' ' casting' ' class declaration' ' comment'
 ' conditional operation' ' constant declaration' ' file system'
 ' for loop' ' function declaration' ' git operation' ' ide operation'
 ' input' ' interactive commands' ' libraries' ' mathematical operation'
 ' membership operation' ' output' ' variable declaration' ' while loop']


Unnamed: 0,Sentence #,Word,Tag,Intent
0,0,append the value 12345.6789 to the list packe...,"[B-OPERATION, O, O, B-ELEMENT, O, O, I-ARRAY, ...","[array operation, array operation, array opera..."
1,1,push the value false to the list inventories,"[B-OPERATION, O, O, B-ELEMENT, O, O, O, B-ARRAY]","[array operation, array operation, array opera..."
2,2,append the value -456 to the array player scores,"[B-OPERATION, O, O, B-ELEMENT, O, O, O, B-ARRA...","[array operation, array operation, array opera..."
3,3,push the value 3.14159 to the list links,"[B-OPERATION, O, O, B-ELEMENT, O, O, O, B-ARRAY]","[array operation, array operation, array opera..."
4,4,include the value Back End Development to the...,"[B-OPERATION, O, O, B-ELEMENT, I-ELEMENT, I-EL...","[array operation, array operation, array opera..."


In [559]:
# training data = (sentence, tags, intent)
training_data = prepare_data(goruped_data)

print(training_data[0])

('append the value 12345.6789 to the list packet list', ['B-OPERATION', 'O', 'O', 'B-ELEMENT', 'O', 'O', 'I-ARRAY', 'B-ARRAY', 'O'], 'array operation')


### **PREPARE DATA**

In [560]:
# convert intent to index
with open('../ner_dataset/annotations/annotations.json', 'r') as f:
    data = json.load(f)

annotations = data['annotations']

intents = list(annotations.keys())

print(intents)
print(len(intents))

intent_to_index = {intent: index for index, intent in enumerate(intents)}

intent_to_index["<UNK>"] = len(intent_to_index)

['array operation', 'assertion', 'assignment operation', 'bitwise operation', 'casting', 'class declaration', 'comment', 'conditional operation', 'constant declaration', 'file system', 'for loop', 'function declaration', 'git operation', 'ide operation', 'input', 'interactive commands', 'libraries', 'mathematical operation', 'membership operation', 'output', 'variable declaration', 'while loop']
22


In [561]:
# convert data to indices
word_to_index = {}
tag_to_index = {}
for sentence, tags, intents in training_data:
    for word in sentence.split():
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            
    for tag in tags:
        if tag not in tag_to_index:
            tag_to_index[tag] = len(tag_to_index)

# add the 'UNK' token
word_to_index['<UNK>'] = len(word_to_index)

tag_to_index['<UNK>'] = len(tag_to_index)

tag_to_index

{'B-OPERATION': 0,
 'O': 1,
 'B-ELEMENT': 2,
 'I-ARRAY': 3,
 'B-ARRAY': 4,
 'I-ELEMENT': 5,
 'I-OPERATION': 6,
 'B-VAR': 7,
 'B-CONDITION': 8,
 'I-CONDITION': 9,
 'B-VAL': 10,
 'I-VAL': 11,
 'I-VAR': 12,
 'B-LHS': 13,
 'I-LHS': 14,
 'B-RHS': 15,
 'I-RHS': 16,
 'B-OPERATOR': 17,
 'B-OPERAND': 18,
 'I-OPERAND': 19,
 'I-OPERATOR': 20,
 'B-TYPE': 21,
 'B-CLASS': 22,
 'I-CLASS': 23,
 'I-COMMENT': 24,
 'B-COMMENT': 25,
 'B-LOG': 26,
 'B-ACTION': 27,
 'I-ACTION': 28,
 'B-DIR': 29,
 'B-START': 30,
 'B-END': 31,
 'B-STEP': 32,
 'B-COLLECTION': 33,
 'I-COLLECTION': 34,
 'B-FUNC': 35,
 'I-FUNC': 36,
 'B-PARAM': 37,
 'I-PARAM': 38,
 'B-MESSAGE': 39,
 'I-MESSAGE': 40,
 'B-LINE': 41,
 'B-FILE': 42,
 'B-LIB_NAME': 43,
 '<UNK>': 44}

In [562]:
vocab_size = len(word_to_index)

print("Vocabulary Size: ", vocab_size)

number_of_tags = len(tag_to_index)

print("Number of Tags: ", number_of_tags)

number_of_intents = len(intent_to_index)

print("Number of Intents: ", number_of_intents)

Vocabulary Size:  1101
Number of Tags:  45
Number of Intents:  23


In [563]:
tag_to_index
index_to_tag = {index: tag for tag, index in tag_to_index.items()}
index_to_tag

{0: 'B-OPERATION',
 1: 'O',
 2: 'B-ELEMENT',
 3: 'I-ARRAY',
 4: 'B-ARRAY',
 5: 'I-ELEMENT',
 6: 'I-OPERATION',
 7: 'B-VAR',
 8: 'B-CONDITION',
 9: 'I-CONDITION',
 10: 'B-VAL',
 11: 'I-VAL',
 12: 'I-VAR',
 13: 'B-LHS',
 14: 'I-LHS',
 15: 'B-RHS',
 16: 'I-RHS',
 17: 'B-OPERATOR',
 18: 'B-OPERAND',
 19: 'I-OPERAND',
 20: 'I-OPERATOR',
 21: 'B-TYPE',
 22: 'B-CLASS',
 23: 'I-CLASS',
 24: 'I-COMMENT',
 25: 'B-COMMENT',
 26: 'B-LOG',
 27: 'B-ACTION',
 28: 'I-ACTION',
 29: 'B-DIR',
 30: 'B-START',
 31: 'B-END',
 32: 'B-STEP',
 33: 'B-COLLECTION',
 34: 'I-COLLECTION',
 35: 'B-FUNC',
 36: 'I-FUNC',
 37: 'B-PARAM',
 38: 'I-PARAM',
 39: 'B-MESSAGE',
 40: 'I-MESSAGE',
 41: 'B-LINE',
 42: 'B-FILE',
 43: 'B-LIB_NAME',
 44: '<UNK>'}

In [564]:
# Assertions
i = 0
for sentence, tags, intents in training_data:
    if len(sentence.split()) != len(tags):
        print(f"Example {i}: Sentence Length: {len(sentence.split())}, Tags Length: {len(tags)}")
        print("Example:", training_data[i])
    i +=1

for sentence, tags, intents in training_data:
    assert len(sentence.split()) == len(tags)

In [565]:
# get the tags for each intent
with open('../ner_dataset/intent_to_tags.json', 'w') as intent_to_tags_file:
    intent_to_tags = {}

    for file in os.listdir('../ner_dataset/annotations/final_annotations'):
        with open(f'../ner_dataset/annotations/final_annotations/{file}', 'r') as f:
            data = json.load(f)
            # print(data)
            tags = data['classes']
            tags.append('O')
            # print(tags)
            intent = file.split('.')[0].replace('_', ' ')
            intent_to_tags[intent] = tags
            # print(intent_to_tags[intent])

    json.dump(intent_to_tags, intent_to_tags_file, indent=4)


all_tags = list(tag_to_index.keys())

### **INTENTS HANDLING**

In [566]:
# here we want to filter out the tags that are not relevent to the given intent
def create_mask(intent, all_tags, intent_to_tags):
    intent_tags = intent_to_tags[intent]
    final_tags = []


    # create BI tags for the intent
    for tag in intent_tags:
        # print(tag)
        if tag == 'O': 
            final_tags.append(tag)
            continue
        
        final_tags.append('B-' + tag)
        final_tags.append('I-' + tag)

    mask = [tag in final_tags for tag in all_tags]
        
    mask = torch.tensor(mask, dtype=torch.long)
    return mask

mask = create_mask('variable declaration', all_tags, intent_to_tags)
print(mask)
print(len(mask))

tensor([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
45


### **MODEL DEFINITION**

In [567]:
# model = BiLSTMCRF(vocab_size, word_embedding_dim=WORD_EMBEDDING_DIM, intent_embedding_dim=INTENT_EMBEDIING_DIM, hidden_dim=HIDDEN_DIM, output_dim=number_of_tags, number_of_intents=number_of_intents)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [568]:
test_model = BiLSTMCRF(vocab_size, word_embedding_dim=WORD_EMBEDDING_DIM, intent_embedding_dim=INTENT_EMBEDIING_DIM, hidden_dim=HIDDEN_DIM, output_dim=number_of_tags, number_of_intents=number_of_intents, index_to_tag=index_to_tag)
test_optimizer = optim.SGD(test_model.parameters(), lr=0.01, weight_decay=1e-4)
# test_optimizer = optim.Adam(test_model.parameters(), lr=0.01)



In [569]:
# Check predictions before training
precheck_tag = prepare_sequence(training_data[50][1], tag_to_index)
print(precheck_tag)

mask = create_mask(training_data[50][2], all_tags, intent_to_tags)

print(mask)

with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[50][0].split(), word_to_index)
    intent = training_data[50][2]
    # precheck_intent = prepare_sequence(intent[0], intent_to_index)
    precheck_intent = torch.tensor([intent_to_index[intent]], dtype=torch.long)
    print(test_model(precheck_sent, precheck_intent, mask))

tensor([1, 1, 0, 1, 4])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 1, 2, 3, 4, 5, 6])
tensor([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
        43, 44])
Viterbi Decode
[[1, 2, 2, 2, 2]]


In [570]:
final_tags = {}
for intent in intent_to_tags.keys():
    final_tags[intent] = []
    for tag in intent_to_tags[intent]:
        if tag == 'O': 
            final_tags[intent].append(tag)
            continue
        
        final_tags[intent].append('B-' + tag)
        final_tags[intent].append('I-' + tag)   

final_tags

distinct_tags = final_tags.values()

distinct_tags = set([tag for sublist in distinct_tags for tag in sublist])

distinct_tags

print(len(distinct_tags))

tags_to_index = {tag: index for index, tag in enumerate(distinct_tags)}

tags_to_index

53


{'I-VAL': 0,
 'I-LHS': 1,
 'B-CONDITION': 2,
 'B-VAR': 3,
 'I-ACTION': 4,
 'I-END': 5,
 'B-STEP': 6,
 'B-LOG': 7,
 'I-OPERATION': 8,
 'I-ARRAY': 9,
 'B-CLASS': 10,
 'I-TYPE': 11,
 'B-VAL': 12,
 'I-START': 13,
 'B-OPERATOR': 14,
 'I-COMMENT': 15,
 'I-ELEMENT': 16,
 'I-DIR': 17,
 'B-COMMENT': 18,
 'I-LINE': 19,
 'I-RHS': 20,
 'I-PARAM': 21,
 'I-LOG': 22,
 'B-OPERAND': 23,
 'B-FILE': 24,
 'B-FUNC': 25,
 'I-OPERAND': 26,
 'B-COLLECTION': 27,
 'B-MESSAGE': 28,
 'B-RHS': 29,
 'B-LIB_NAME': 30,
 'I-COLLECTION': 31,
 'B-DIR': 32,
 'I-LIB_NAME': 33,
 'I-MESSAGE': 34,
 'B-ACTION': 35,
 'B-OPERATION': 36,
 'I-FILE': 37,
 'I-VAR': 38,
 'B-LINE': 39,
 'B-ARRAY': 40,
 'I-CLASS': 41,
 'I-OPERATOR': 42,
 'I-CONDITION': 43,
 'B-LHS': 44,
 'B-START': 45,
 'B-TYPE': 46,
 'I-STEP': 47,
 'I-FUNC': 48,
 'B-PARAM': 49,
 'O': 50,
 'B-END': 51,
 'B-ELEMENT': 52}

In [571]:
intent_to_index

{'array operation': 0,
 'assertion': 1,
 'assignment operation': 2,
 'bitwise operation': 3,
 'casting': 4,
 'class declaration': 5,
 'comment': 6,
 'conditional operation': 7,
 'constant declaration': 8,
 'file system': 9,
 'for loop': 10,
 'function declaration': 11,
 'git operation': 12,
 'ide operation': 13,
 'input': 14,
 'interactive commands': 15,
 'libraries': 16,
 'mathematical operation': 17,
 'membership operation': 18,
 'output': 19,
 'variable declaration': 20,
 'while loop': 21,
 '<UNK>': 22}

In [572]:
# split the data into training and testing
# shuffle the data
from sklearn.model_selection import train_test_split


# random.shuffle(training_data)

# split the data
train_data, test_data = train_test_split(training_data, test_size=0.2, random_state=42, shuffle=True)

In [573]:
# training loop
def train(model, training_data, epochs=10):
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for sentence, tags, intent in training_data:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            intent_mask = create_mask(intent, all_tags, intent_to_tags)
            # print(intent_mask)
            
            # print(intent)
            # print(intent_to_index[intent])
            # intent = prepare_sequence(intents[0], intent_to_index)
            # print(intent)
            # print(sentence, tags, intent)


            # print(final_mask)

            sentence = prepare_sequence(sentence.split(), word_to_index)
            target_tags = torch.tensor([tag_to_index[t] for t in tags], dtype=torch.long)
            intent = torch.tensor([intent_to_index[intent]], dtype=torch.long)


            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence, target_tags, intent, intent_mask)

            total_loss += loss.item()

            # Step 4. Compute the loss, gradients, and update the parameters by
            loss.backward()
            test_optimizer.step()

        
        print(f"Epoch: {epoch}, Loss: {total_loss / len(training_data)}")


train(test_model, train_data, epochs=75)

  1%|▏         | 1/75 [00:35<44:00, 35.68s/it]

Epoch: 0, Loss: 6.3341906317026595


  3%|▎         | 2/75 [01:23<52:15, 42.95s/it]

Epoch: 1, Loss: 3.004118569409629


  4%|▍         | 3/75 [01:58<46:51, 39.04s/it]

Epoch: 2, Loss: 1.9881200731684046


  5%|▌         | 4/75 [02:24<40:06, 33.89s/it]

Epoch: 3, Loss: 1.4952448156208398


  7%|▋         | 5/75 [02:52<37:11, 31.88s/it]

Epoch: 4, Loss: 1.1984922186176779


  8%|▊         | 6/75 [03:25<37:09, 32.31s/it]

Epoch: 5, Loss: 1.0230564147156014


  9%|▉         | 7/75 [04:06<39:56, 35.24s/it]

Epoch: 6, Loss: 0.8897947263177208


 11%|█         | 8/75 [04:37<37:49, 33.87s/it]

Epoch: 7, Loss: 0.771593408784143


 12%|█▏        | 9/75 [05:03<34:31, 31.39s/it]

Epoch: 8, Loss: 0.6732260093755481


 13%|█▎        | 10/75 [05:40<35:54, 33.15s/it]

Epoch: 9, Loss: 0.5968477868368652


 15%|█▍        | 11/75 [06:19<37:08, 34.82s/it]

Epoch: 10, Loss: 0.5686233247376153


 16%|█▌        | 12/75 [07:00<38:30, 36.68s/it]

Epoch: 11, Loss: 0.5185001982536749


 17%|█▋        | 13/75 [07:26<34:37, 33.50s/it]

Epoch: 12, Loss: 0.4536709365578662


 19%|█▊        | 14/75 [07:52<31:45, 31.23s/it]

Epoch: 13, Loss: 0.4242628956734874


 20%|██        | 15/75 [08:23<31:10, 31.17s/it]

Epoch: 14, Loss: 0.3926664724907048


 21%|██▏       | 16/75 [09:10<35:18, 35.90s/it]

Epoch: 15, Loss: 0.380231970377143


 23%|██▎       | 17/75 [09:42<33:28, 34.63s/it]

Epoch: 16, Loss: 0.3461871124290859


 24%|██▍       | 18/75 [10:09<30:50, 32.46s/it]

Epoch: 17, Loss: 0.3132405641708773


 25%|██▌       | 19/75 [10:31<27:24, 29.36s/it]

Epoch: 18, Loss: 0.3131134020523497


 27%|██▋       | 20/75 [10:54<25:03, 27.34s/it]

Epoch: 19, Loss: 0.2945524542458492


 28%|██▊       | 21/75 [11:18<23:46, 26.42s/it]

Epoch: 20, Loss: 0.2638856390817537


 29%|██▉       | 22/75 [11:43<22:58, 26.02s/it]

Epoch: 21, Loss: 0.2579438218679653


 31%|███       | 23/75 [12:11<22:56, 26.47s/it]

Epoch: 22, Loss: 0.2593978602884121


 32%|███▏      | 24/75 [12:34<21:47, 25.64s/it]

Epoch: 23, Loss: 0.24535050824505408


 33%|███▎      | 25/75 [12:58<20:53, 25.07s/it]

Epoch: 24, Loss: 0.22851260787212235


 35%|███▍      | 26/75 [13:24<20:36, 25.23s/it]

Epoch: 25, Loss: 0.21462344208694897


 36%|███▌      | 27/75 [13:50<20:23, 25.49s/it]

Epoch: 26, Loss: 0.20502755407052345


 37%|███▋      | 28/75 [14:21<21:20, 27.25s/it]

Epoch: 27, Loss: 0.2011572492569555


 39%|███▊      | 29/75 [14:52<21:38, 28.23s/it]

Epoch: 28, Loss: 0.18959703888182236


 40%|████      | 30/75 [15:30<23:27, 31.28s/it]

Epoch: 29, Loss: 0.17999063617993982


 41%|████▏     | 31/75 [16:15<25:56, 35.37s/it]

Epoch: 30, Loss: 0.16760682176275676


 43%|████▎     | 32/75 [16:57<26:47, 37.38s/it]

Epoch: 31, Loss: 0.16780545946402245


 44%|████▍     | 33/75 [17:27<24:34, 35.11s/it]

Epoch: 32, Loss: 0.16391266223754483


 45%|████▌     | 34/75 [17:59<23:22, 34.22s/it]

Epoch: 33, Loss: 0.16655193044501174


 47%|████▋     | 35/75 [18:37<23:37, 35.45s/it]

Epoch: 34, Loss: 0.15002223670014944


 48%|████▊     | 36/75 [19:18<24:02, 36.98s/it]

Epoch: 35, Loss: 0.15327319084506713


 49%|████▉     | 37/75 [19:51<22:36, 35.69s/it]

Epoch: 36, Loss: 0.14542605176425125


 51%|█████     | 38/75 [20:29<22:31, 36.54s/it]

Epoch: 37, Loss: 0.1454188661567003


 52%|█████▏    | 39/75 [20:58<20:31, 34.21s/it]

Epoch: 38, Loss: 0.14517094773007355


 53%|█████▎    | 40/75 [21:28<19:14, 33.00s/it]

Epoch: 39, Loss: 0.13850844550777125


 55%|█████▍    | 41/75 [21:56<17:51, 31.52s/it]

Epoch: 40, Loss: 0.12254098164276549


 56%|█████▌    | 42/75 [22:24<16:47, 30.54s/it]

Epoch: 41, Loss: 0.12705455859432868


 57%|█████▋    | 43/75 [22:56<16:23, 30.75s/it]

Epoch: 42, Loss: 0.12295037689890774


 59%|█████▊    | 44/75 [23:23<15:17, 29.60s/it]

Epoch: 43, Loss: 0.11789119248818394


 60%|██████    | 45/75 [23:52<14:50, 29.67s/it]

Epoch: 44, Loss: 0.11494139244791311


 61%|██████▏   | 46/75 [24:20<14:02, 29.06s/it]

Epoch: 45, Loss: 0.1222613304723525


 63%|██████▎   | 47/75 [24:47<13:20, 28.60s/it]

Epoch: 46, Loss: 0.1076272766592197


 64%|██████▍   | 48/75 [25:20<13:24, 29.81s/it]

Epoch: 47, Loss: 0.11052689566857522


 65%|██████▌   | 49/75 [25:49<12:49, 29.60s/it]

Epoch: 48, Loss: 0.10554512241350222


 67%|██████▋   | 50/75 [26:17<12:06, 29.08s/it]

Epoch: 49, Loss: 0.10242649504903513


 68%|██████▊   | 51/75 [26:41<10:59, 27.48s/it]

Epoch: 50, Loss: 0.0942875186359768


 69%|██████▉   | 52/75 [27:01<09:39, 25.20s/it]

Epoch: 51, Loss: 0.09443941290158236


 71%|███████   | 53/75 [27:19<08:30, 23.20s/it]

Epoch: 52, Loss: 0.0940301610577491


 72%|███████▏  | 54/75 [27:39<07:45, 22.15s/it]

Epoch: 53, Loss: 0.09342562705408311


 73%|███████▎  | 55/75 [27:59<07:10, 21.55s/it]

Epoch: 54, Loss: 0.07853472284784292


 75%|███████▍  | 56/75 [28:20<06:43, 21.22s/it]

Epoch: 55, Loss: 0.08025529037482237


 76%|███████▌  | 57/75 [28:41<06:25, 21.43s/it]

Epoch: 56, Loss: 0.07687732772195245


 77%|███████▋  | 58/75 [29:03<06:06, 21.57s/it]

Epoch: 57, Loss: 0.07167233044724934


 79%|███████▊  | 59/75 [29:23<05:36, 21.05s/it]

Epoch: 58, Loss: 0.08250428139903593


 80%|████████  | 60/75 [29:44<05:12, 20.85s/it]

Epoch: 59, Loss: 0.08572095710501633


 81%|████████▏ | 61/75 [30:04<04:50, 20.76s/it]

Epoch: 60, Loss: 0.08332502519762194


 83%|████████▎ | 62/75 [30:33<05:03, 23.31s/it]

Epoch: 61, Loss: 0.07488954732391039


 84%|████████▍ | 63/75 [31:10<05:26, 27.19s/it]

Epoch: 62, Loss: 0.08156547675261626


 85%|████████▌ | 64/75 [31:33<04:46, 26.08s/it]

Epoch: 63, Loss: 0.07636715800427517


 87%|████████▋ | 65/75 [32:05<04:38, 27.81s/it]

Epoch: 64, Loss: 0.07235964672816975


 88%|████████▊ | 66/75 [32:25<03:49, 25.48s/it]

Epoch: 65, Loss: 0.06746370790310288


 89%|████████▉ | 67/75 [32:44<03:07, 23.42s/it]

Epoch: 66, Loss: 0.0675527855117319


 91%|█████████ | 68/75 [33:04<02:37, 22.53s/it]

Epoch: 67, Loss: 0.08354006981787311


 92%|█████████▏| 69/75 [33:24<02:10, 21.78s/it]

Epoch: 68, Loss: 0.0707464515383387


 93%|█████████▎| 70/75 [34:13<02:28, 29.76s/it]

Epoch: 69, Loss: 0.06973786570242206


 95%|█████████▍| 71/75 [35:09<02:30, 37.65s/it]

Epoch: 70, Loss: 0.064737869743274


 96%|█████████▌| 72/75 [36:21<02:23, 47.97s/it]

Epoch: 71, Loss: 0.06847011118010012


 97%|█████████▋| 73/75 [37:01<01:31, 45.69s/it]

Epoch: 72, Loss: 0.06293501018334599


 99%|█████████▊| 74/75 [37:23<00:38, 38.54s/it]

Epoch: 73, Loss: 0.0689955878070675


100%|██████████| 75/75 [37:44<00:00, 30.19s/it]

Epoch: 74, Loss: 0.06790666031442524





In [574]:
# Check predictions before training
precheck_tag = prepare_sequence(training_data[50][1], tag_to_index)
print(precheck_tag)

mask = create_mask(training_data[50][2], all_tags, intent_to_tags)

with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[50][0].split(), word_to_index)
    intent = training_data[50][2]
    # precheck_intent = prepare_sequence(intent[0], intent_to_index)
    precheck_intent = torch.tensor([intent_to_index[intent]], dtype=torch.long)
    print(test_model(precheck_sent, precheck_intent, mask))

tensor([1, 1, 0, 1, 4])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 1, 2, 3, 4, 5, 6])
tensor([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
        43, 44])
Viterbi Decode
[[1, 1, 0, 1, 4]]


In [575]:
def predict(model, test_data):
    predictions = [] 
    with torch.no_grad():
        for sentence, _, intent in test_data:
            precheck_sent = prepare_sequence(sentence.split(), word_to_index)
            # precheck_intent = prepare_sequence(intent, intent_to_index)
            precheck_intent = torch.tensor([intent_to_index[intent]], dtype=torch.long)
            mask = create_mask(intent, all_tags, intent_to_tags)
            predictions.append(model(precheck_sent, precheck_intent, mask)[0])
    
    return predictions

In [576]:
# validation data
print(len(test_data))
predictions = predict(test_model, test_data)

574
tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 1, 22, 23])
tensor([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
        39, 40, 41, 42, 43, 44])
Viterbi Decode
tensor([0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 1,  2,  5,  8,  9, 33, 34])
tensor([ 0,  3,  4,  6,  7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
        23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 35, 36, 37, 38, 39, 40, 41, 42,
        43, 44])
Viterbi Decode
tensor([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 1,  7, 10, 11, 12, 21])
tensor([ 0,  2,  3,  4,  5,  6,  8,  9, 13, 14, 1

In [577]:
labels = []
predicted_tags = []

for index, prediction in enumerate(predictions):
    print("Test Example:", index)
    mapped_tags = [all_tags[tag] for tag in prediction]
    predicted_tags.append(mapped_tags)
    labels.append(test_data[index][1])
    print("Sentence:", test_data[index][0])
    print("Actual tags:", test_data[index][1])
    print("Predicted tags:", mapped_tags)

Test Example: 0
Sentence: make a class with the name Score
Actual tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-CLASS']
Predicted tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-CLASS']
Test Example: 1
Sentence: check if total revenue is in collection request queue
Actual tags: ['O', 'O', 'B-ELEMENT', 'I-ELEMENT', 'O', 'B-CONDITION', 'O', 'B-COLLECTION', 'I-COLLECTION']
Predicted tags: ['O', 'O', 'B-ELEMENT', 'I-ELEMENT', 'O', 'B-CONDITION', 'O', 'B-COLLECTION', 'I-COLLECTION']
Test Example: 2
Sentence: under the name step make a new constant and initialize it with unsupervised learning
Actual tags: ['O', 'O', 'O', 'B-VAR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VAL', 'I-VAL']
Predicted tags: ['O', 'O', 'O', 'B-VAR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VAL', 'I-VAL']
Test Example: 3
Sentence: whether default config and element are the same and default config and -1.0 are the same
Actual tags: ['O', 'B-LHS', 'I-LHS', 'B-LOG', 'B-RHS', 'O', 'B-CONDITION', 'I-CONDITION', 'O', 'O', 'O', 'O', 

In [578]:
# Check predictions before training
# test_example_1 = ("declare a float variable under the name test and set it to 6.99", ["O", "O", "B-TYPE", "O", "O", "O", "O", "B-VAR", "O", "O", "O", "O", "B-VAL"], "variable declaration")
# test_example_2 = ("cast number to a string", ["O", "B-VAR", "O", "O", "B-CAST_TYPE"], "casting")
# test_example_3 = ("import the library numpy", ["O", "O", "O", "B-LIB_NAME"], "libraries")
# test_example_4 = ('make the comment this variable is a temp variable', ['O', 'O', 'O', 'B-COMMENT', 'I-COMMENT', 'I-COMMENT', 'I-COMMENT', 'I-COMMENT', 'I-COMMENT' ], 'comment')
# test_example_5 = ('declare a new class with the name Car', ['O', 'O', 'O', 'O', 'B-CLASS_NAME'], 'class declaration')
# test_example_6 = ('define a new method named func and takes parameters x, y and z', ['O', 'O', 'O', 'O', 'O', 'B-FUNC', 'O', 'O', 'O', 'B-PARAM', 'B-PARAM', 'O', 'B-PARAM'], 'function declaration')
# test_example_7 = ('create a new function called add with parameters num1 and num2', ['O', 'O', 'O', 'O', 'O', 'B-FUNC', 'O', 'O', 'O', 'B-PARAM', 'O', 'B-PARAM'], 'function declaration')
# test_example_8 = ('create a for loop from 1 to 10 with step 10',["O", "O", "B-LOOP", "O", "O", "B-START", "O", "B-END", "O", "O", "B-STEP"], 'iterative operation')
# test_example_9 = ('write a while loop that runs while x is less than 10', ["O", "O", "B-LOOP", "O", "O", "O", "O", "B-LHS", "O", "B-CONDITION","I-CONDITION", "B-RHS"], 'iterative operation')
# test_example_10 = ('create a new variable called x and set it to 10', ["O", "O", "O", "O", "B-VAR", "O", "O", "O", "B-VAL"], 'variable declaration')
# test_example_11 = ('append the value 5 to the list numbers', ["B-OPERATION", "O", "O", "B-ELEMENT", "O", "O", "O", "B-ARRAY"], 'array operation')

# test_data = [test_example_1, test_example_2, test_example_3, test_example_4, test_example_5, test_example_6, test_example_7, test_example_8, test_example_9, test_example_10, test_example_11]

# predictions = predict(test_model, test_data)
# labels = []
# predicted_tags = []

# for index, prediction in enumerate(predictions):
#     print("Test Example:", index)
#     mapped_tags = [all_tags[tag] for tag in prediction]
#     predicted_tags.append(mapped_tags)
#     labels.append(test_data[index][1])
#     print("Actual tags:", test_data[index][1])
#     print("Predicted tags:", mapped_tags)

In [579]:
# evaluation 
def evaluate(y_true, y_pred):
    mlb = MultiLabelBinarizer()
    y_true_binarized = mlb.fit_transform(y_true)
    y_pred_binarized = mlb.transform(y_pred)
    
    f1 = f1_score(y_true_binarized, y_pred_binarized, average='weighted')
    precision = precision_score(y_true_binarized, y_pred_binarized, average='weighted')
    recall = recall_score(y_true_binarized, y_pred_binarized, average='weighted')
    accuracy = accuracy_score(y_true_binarized, y_pred_binarized)
    
    return f1, precision, recall, accuracy

f1, precision, recall, accuracy = evaluate(labels, predicted_tags)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)

F1 Score: 0.9779741913223128
Precision: 0.9846760287411571
Recall: 0.97262118491921
Accuracy: 0.8745644599303136


In [580]:
torch.save(test_model.state_dict(), '../models/ner_model_2.pth')