In [1]:
import utils
import json
import os
import re
import random
from string import Formatter
import itertools

In [7]:

def augment_intent(file_path, target_intent, number_of_augmentations=10):
    '''
    Expand the templates of one intent into concrete example sentences.

    The JSON file at ``file_path`` is loaded and, for every entry of its
    "intents" list whose "intent" name equals ``target_intent``, each template
    in "patterns" is filled in ``number_of_augmentations`` times. Placeholders
    start from the entry's "default parameters" and are then replaced by a
    randomly chosen synonym from the entry's "synonyms". The generated
    sentences are stored under the entry's "formatted patterns" key and the
    file is rewritten in place.

    A placeholder that has no (or an empty) synonym list keeps its default
    value instead of aborting the whole run.

    Args:
        - file_path (str) : path of the dataset; it is read and then overwritten
        - target_intent (str) : value of the "intent" key of the entry to augment
        - number_of_augmentations (int) : sentences generated per template

    Raises:
        KeyError: if a template uses a placeholder that is missing from the
            entry's "default parameters".
    '''
    # Which pool under synonyms["value"] provides example values for a datatype.
    value_pool_by_datatype = {
        "int": "integer",
        "integer": "integer",
        "float": "float",
        "double": "float",
        "string": "string",
        "bool": "boolean",
        "boolean": "boolean",
    }
    char_datatypes = ("char", "character")

    # Numbered / role-specific placeholders that share a single synonym list.
    shared_synonym_key = {
        "variable_1": "variable",
        "variable_2": "variable",
        "variable_3": "variable",
        "start": "number",
        "end": "number",
        "step": "number",
    }

    # Load everything and close the handle before the same path is rewritten.
    with open(file_path, 'r') as intents_file:
        data = json.load(intents_file)

    found_target = False

    for intent in data["intents"]:
        if intent["intent"] != target_intent:
            continue

        found_target = True
        print(intent["intent"])

        default_parameters = intent["default parameters"]
        templates = intent["patterns"]
        synonyms = intent.get("synonyms", {})
        is_variable_declaration = intent["intent"] == "Variable Declaration"

        formatted_templates = []

        print(len(templates))

        for template in templates:
            # The placeholders of a template do not change between
            # augmentations, so parse the template only once.
            field_names = {
                field_name for _, field_name, _, _ in Formatter().parse(template) if field_name}

            for _ in range(number_of_augmentations):
                final_parameters = {
                    key: default_parameters[key] for key in field_names}

                if is_variable_declaration:
                    if "datatype" in final_parameters:
                        # Pick the datatype first so the value can agree with it.
                        datatype = random.choice(synonyms["datatype"])
                        final_parameters["datatype"] = datatype

                        if datatype in char_datatypes:
                            final_parameters["value"] = chr(
                                random.randint(32, 123))
                        elif datatype in value_pool_by_datatype:
                            final_parameters["value"] = random.choice(
                                synonyms["value"][value_pool_by_datatype[datatype]])
                    else:
                        # No datatype in the template: any example value will do.
                        final_parameters["value"] = random.choice(
                            list(itertools.chain.from_iterable(synonyms["value"].values())))

                for parameter in final_parameters:
                    # Already handled consistently above for declarations.
                    if is_variable_declaration and parameter in ("value", "datatype"):
                        continue

                    if parameter == "value" and isinstance(synonyms.get("value"), dict):
                        # Values grouped by datatype: choose from all groups.
                        final_parameters["value"] = random.choice(
                            list(itertools.chain.from_iterable(synonyms["value"].values())))
                        continue

                    choices = synonyms.get(
                        shared_synonym_key.get(parameter, parameter), [])
                    # random.choice raises IndexError on an empty sequence, so
                    # keep the default value when there is nothing to pick.
                    if choices:
                        final_parameters[parameter] = random.choice(choices)

                formatted_templates.append(template.format(**final_parameters))

        intent["formatted patterns"] = formatted_templates

        print(len(formatted_templates))

    # Only touch the file when the target intent was actually found.
    if found_target:
        with open(file_path, 'w') as file:
            json.dump(data, file, indent=4)


In [8]:
def augment_data(file_path, intents=['all'], number_of_augmentations=10):
    """
    Augment several intents of the dataset stored at ``file_path``.

    Args:
        - file_path (str) : path of the intents JSON file
        - intents (list[str] | str) : names of the intents to augment; the
          default ``['all']`` (or the string ``"all"``) augments every intent
          listed in the file. A single name may be given as a plain string.
        - number_of_augmentations (int) : sentences generated per template
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(intents, str):
        intents = [intents]

    if intents == ['all']:
        with open(file_path, 'r') as intents_file:
            data = json.load(intents_file)

        # augment_intent matches on the intent *name*, so pass the value of
        # the "intent" key rather than the whole entry.
        intent_names = [entry["intent"] for entry in data["intents"]]
    else:
        intent_names = list(intents)

    for intent_name in intent_names:
        augment_intent(file_path, intent_name, number_of_augmentations)

In [9]:
# Regenerate the formatted patterns for the selected intents only.
intents_to_augment = ["IDE Operation"]
augment_data(
    "./intent_detection_dataset/more_intents_pattern.json",
    intents=intents_to_augment,
)

IDE Operation
30
300


In [10]:
# Run the project-local pre-annotation helper on the augmented intents file.
# NOTE(review): what it produces and where it writes is defined in utils, not in this notebook.
utils.ner_dataset_pre_annotations("./intent_detection_dataset/more_intents_pattern.json")

In [11]:
# Pass every entry of the final-annotations folder to the project-local reformat helper.
for file in os.listdir("./ner_dataset/annotations/final_annotations"):
    utils.reformat_json("./ner_dataset/annotations/final_annotations/" + file)

In [12]:
# Add the per-intent annotation files to the combined annotations file.
# The combined file is read and closed first, updated in memory, and written
# back once at the end instead of being rewritten for every input file while
# it is still open for reading.
with open("./ner_dataset/annotations/annotations.json") as annotations_file:
    annotations = json.load(annotations_file)

merged_any = False

for file in os.listdir("./ner_dataset/annotations/final_annotations"):
    # Ignore directory entries that are not JSON annotation files.
    if not file.endswith(".json"):
        continue

    with open(f"./ner_dataset/annotations/final_annotations/{file}") as intent_file:
        data = json.load(intent_file)

    # "for_loop.json" -> "for loop": strip only the extension, then turn
    # underscores into spaces to obtain the intent key.
    intent = file[:-len(".json")].replace("_", " ")

    annotations["annotations"][intent] = data["annotations"]
    merged_any = True

# Leave the combined file untouched when there was nothing to merge.
if merged_any:
    with open("./ner_dataset/annotations/annotations.json", "w") as annotations_file:
        json.dump(annotations, annotations_file)

In [13]:
# Reformat the combined annotations file with the project-local helper.
# NOTE(review): the exact formatting applied is defined in utils, not in this notebook.
utils.reformat_json("./ner_dataset/annotations/annotations.json")

In [3]:
def find_subset_indices(subset, larger_list):
    """
    Look up the position of every item of ``subset`` in ``larger_list``.

    Each item is located independently with ``list.index``, so the position of
    its first occurrence is used; repeated items therefore map to the same
    index.

    Args:
        - subset (list) : items to look up
        - larger_list (list) : list that is searched

    Returns:
        list[int] | None: one index per item, in the order of ``subset``, or
        None as soon as an item is not present in ``larger_list``.
    """
    indices = []
    for item in subset:
        try:
            indices.append(larger_list.index(item))
        except ValueError:
            # The subset is not fully present.
            return None
    return indices


def remove_punctuation(input_string):
    """
    Strip punctuation from ``input_string`` while keeping numeric dots and hyphens.

    Removed:
        - a dot that is neither preceded by a digit nor followed by a digit
          or whitespace (e.g. the dot in "a.b" or a trailing "end.")
        - every character that is not a word character, whitespace, a dot or
          a hyphen
        - underscores

    Args:
        - input_string (str) : text to clean

    Returns:
        str: the cleaned text.
    """
    pattern = r'(?<!\d)\.(?![\d\s])|[^\w\s.\-]|_'

    # Replace every match with the empty string.
    return re.sub(pattern, '', input_string)

def _entity_word_indices(sentence, words, start, end):
    """
    Locate the words covered by the character span ``sentence[start:end]``.

    The position is derived from the character offset (the number of spaces
    before the span equals the index of its first word in
    ``sentence.split(" ")``), so a word that occurs several times in the
    sentence is not confused with its first occurrence. The span is only
    accepted when its punctuation-stripped tokens equal the corresponding
    entries of ``words``.

    Returns:
        list[int] | None: consecutive word indices, or None when the span is
        blank or does not line up with whole words.
    """
    span = sentence[start:end]
    without_leading = span.lstrip(" ")
    # Move the start past leading spaces so it points into the first word.
    start += len(span) - len(without_leading)
    entity_text = without_leading.rstrip(" ")
    if not entity_text:
        return None

    # Normalize the entity tokens exactly like the sentence words.
    entity_tokens = [remove_punctuation(token) for token in entity_text.split(" ")]
    first = sentence.count(" ", 0, start)
    if words[first:first + len(entity_tokens)] != entity_tokens:
        return None
    return list(range(first, first + len(entity_tokens)))


def convert_annotations_to_csv(file_path):
    """
    Convert the annotations JSON into a word-level tagging CSV.

    Reads ``data["annotations"]`` from ``file_path`` (a mapping of intent name
    to a list of ``[sentence, {"entities": [[start, end, tag], ...]}]``
    examples) and writes one row per word to "./ner_dataset/ner_dataset.csv"
    with the columns "Sentence #, Word, Tag, Intent". Words covered by an
    entity are tagged "B-<tag>" (first word) or "I-<tag>" (following words);
    all other words are tagged "O".

    Args:
        - file_path (str) : path of the annotations JSON file
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    annotations = data['annotations']

    intents = annotations.keys()
    print(intents)

    # Mode 'w' truncates an existing file, so no explicit delete is needed.
    with open("./ner_dataset/ner_dataset.csv", 'w') as csv_file:
        csv_file.write("Sentence #, Word, Tag, Intent\n")

        sentence_index = 0

        for intent in intents:
            print(intent)
            print(len(annotations))
            examples = annotations[intent]
            print(len(examples))

            for example in examples:
                if example is None:
                    continue

                sentence = example[0]
                entities = example[1]["entities"]

                # The last character is dropped unconditionally.
                # NOTE(review): presumably a trailing terminator in the
                # annotated text — confirm against the annotation files.
                sentence = sentence[:-1]

                if len(sentence) == 0:
                    continue

                if sentence[-1] == ".":
                    sentence = sentence[:-1]

                # Characters are only removed from the end, so the entity
                # offsets still index into `sentence`.
                words = [remove_punctuation(word) for word in sentence.split(" ")]
                tags = ["O"] * len(words)

                for entity in entities:
                    start = entity[0]
                    end = entity[1]
                    tag = entity[2]

                    indices = _entity_word_indices(sentence, words, start, end)
                    if indices is None:
                        continue

                    for position, index in enumerate(indices):
                        prefix = "B" if position == 0 else "I"
                        tags[index] = f"{prefix}-{tag}"

                for word, tag in zip(words, tags):
                    csv_file.write(f"{sentence_index}, {word}, {tag}, {intent}\n")

                sentence_index += 1

In [5]:
# Build ./ner_dataset/ner_dataset.csv from the combined annotations file.
convert_annotations_to_csv("./ner_dataset/annotations/annotations.json")

dict_keys(['assertion', 'assignment operation', 'bitwise operation', 'casting', 'class declaration', 'comment', 'conditional operation', 'constant declaration', 'file system', 'for loop', 'function declaration', 'git operation', 'ide operation', 'input', 'interactive commands', 'libraries', 'mathematical operation', 'membership operation', 'output', 'variable declaration', 'while loop'])
assertion
21
49
assignment operation
21
80
bitwise operation
21
97
casting
21
78
class declaration
21
38
comment
21
50
conditional operation
21
160
constant declaration
21
200
file system
21
150
for loop
21
120
function declaration
21
112
git operation
21
80
ide operation
21
290
input
21
50
interactive commands
21
220
libraries
21
51
mathematical operation
21
370
membership operation
21
114
output
21
120
variable declaration
21
240
while loop
21
60


In [16]:
# Create the final intents dataset: intent name -> generated sentences.
# The source file is read completely before the destination is opened for
# writing, so a failure while reading (or an intent without
# "formatted patterns") cannot leave a truncated output file behind.
with open("./intent_detection_dataset/more_intents_pattern.json", "r") as f_intents:
    intents = json.load(f_intents)['intents']

final_data = {}

for intent in intents:
    intent_name = intent["intent"]
    final_patterns = intent["formatted patterns"]
    final_data[intent_name] = final_patterns

with open("./intent_detection_dataset/final_intents_dataset.json", "w") as f_final:
    json.dump(final_data, f_final, indent=4)

In [17]:
# Scratch check of a pattern for unsigned decimals with a mandatory fractional part.
positive_float_pattern = r'^\d*\.\d+$'
for candidate in ('123.456', '-123.456'):  # expected: a match, then None
    print(re.match(positive_float_pattern, candidate))


<re.Match object; span=(0, 7), match='123.456'>
None


In [18]:
# Scratch check: zip over three one-element lists yields a single 3-tuple.
print(list(zip([1], [2], [3])))

[(1, 2, 3)]


In [19]:
import nltk
# Fetch the WordNet corpus that the synonym lookup in this cell relies on.
nltk.download('wordnet')

from nltk.corpus import wordnet

def get_synonyms(word):
    """
    Collect the lemma names of every WordNet synset found for ``word``.

    Args:
        - word (str) : word to look up

    Returns:
        set: the distinct lemma names (empty when no synset is found).
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# Try the lookup on a sample word and show the resulting set.
word = "push"
synonyms = get_synonyms(word)
print(f"Synonyms of {word}: {synonyms}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yazmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Synonyms of push: {'pushing', 'crowd', 'press', 'energy', 'promote', 'push', 'bear_on', 'push_button', 'drive', 'advertize', 'advertise', 'agitate', 'button', 'get-up-and-go', 'thrust', 'fight', 'tug', 'labour', 'crusade', 'campaign', 'labor', 'force'}


In [20]:
# Scratch check: int() rejects a string with trailing non-numeric text.
try:
    number = int('100 in')
except ValueError:
    valid = False
else:
    valid = True

valid

False