In [31]:
import json
from nltk.corpus import stopwords
from collections import Counter
import string
import tensorflow as tf
import numpy as np

In [32]:
stop = set(stopwords.words('english'))

In [33]:
def load_data(file_name):
  file_name = file_name + ".json"
  with open("Datasets/"+file_name) as f:
    data = json.load(f)
  return data

In [34]:
def remove_punct(text):
    table = str.maketrans("","",string.punctuation)
    return text.translate(table)

In [35]:
def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word not in stop]
    return " ".join(text)

In [36]:
def counter_word(text):
    count = Counter()
    for i in text:
        for word in i.split():
            count[word] += 1
    return count

In [37]:
def gen_dictionaries(file_name):
    try:
        pickle.load(f)
    except:
        patterns = []
        classes = []
        data = load_data(file_name)
        for intent in data['intents']:
            for pattern in intent['patterns']:
                text = remove_punct(pattern)
                processed_text = remove_stopwords(text)
                patterns.append(text)
            classes.append(intent['tag'])
        patterns_counter = counter_word(patterns)
        return patterns_counter, patterns, classes

In [38]:
patterns_counter, patterns, classes = gen_dictionaries("CS")

classes = list(sorted(set(classes)))

In [39]:
def bag_of_words(input, vocab):
    vocab_dictionary = dict((k,i) for i, k in enumerate(vocab))
    input_words  = [input]
    if set(vocab_dictionary).intersection(set(input_words)):
        intersection = set(vocab_dictionary).intersection(set(input_words))
        indices = [vocab_dictionary[x] for x in intersection]
        vocab_np = np.asarray(vocab).reshape(len(vocab), 1)
        nn_outputs = np.zeros(vocab_np.shape)
        for x in indices:
            nn_outputs[x] = 1
        return nn_outputs

In [40]:
def generate_labels(classes, file_name):
    data = load_data(file_name)
    labels = []
    for intent in data['intents']:
        for pattern in intent['patterns']:
            labels.append(bag_of_words(intent['tag'], classes))
    labels = np.squeeze(np.asarray(labels))
    #np.save('Labels/labels'+ file_name, labels)
    return labels
    

In [41]:
labels = generate_labels(classes, "CS")

num_words = len(patterns_counter)

max_words = 20

In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(patterns)

In [43]:
word_index = tokenizer.word_index

word_index

{'what': 1,
 'is': 2,
 'define': 3,
 'a': 4,
 'how': 5,
 'describe': 6,
 'array': 7,
 'an': 8,
 'the': 9,
 'function': 10,
 'of': 11,
 'to': 12,
 'error': 13,
 'does': 14,
 'in': 15,
 'loop': 16,
 'class': 17,
 'operator': 18,
 'string': 19,
 'do': 20,
 'statement': 21,
 'are': 22,
 'structure': 23,
 'data': 24,
 'file': 25,
 'work': 26,
 'for': 27,
 'while': 28,
 'c': 29,
 'classes': 30,
 'by': 31,
 'size': 32,
 'overflow': 33,
 'inheritance': 34,
 'explain': 35,
 'we': 36,
 'character': 37,
 'and': 38,
 'if': 39,
 'storage': 40,
 'meant': 41,
 'can': 42,
 'sort': 43,
 'search': 44,
 'cstyle': 45,
 'short': 46,
 'header': 47,
 'value': 48,
 'comment': 49,
 'directive': 50,
 'long': 51,
 'double': 52,
 'syntax': 53,
 'controlled': 54,
 'modularity': 55,
 'parameter': 56,
 'state': 57,
 'protected': 58,
 'selection': 59,
 'declaration': 60,
 'elements': 61,
 'library': 62,
 'strcat': 63,
 'logical': 64,
 'break': 65,
 'loops': 66,
 'member': 67,
 'this': 68,
 'pointer': 69,
 'compiler':

In [44]:

len(classes)
labels.shape

(449, 136)

In [45]:
pattern_sequences = tokenizer.texts_to_sequences(patterns)

In [46]:
pattern_sequences

[[208],
 [5, 22, 95],
 [2, 209, 162],
 [210],
 [163, 164],
 [211, 212],
 [1, 2, 4, 58, 67],
 [1, 14, 96, 4, 58, 67, 11, 4, 17, 97],
 [1, 14, 96, 4, 58, 67, 11, 4, 17, 213],
 [1, 14, 96, 4, 58, 67, 10, 97],
 [3, 58, 214],
 [1, 2, 9, 68, 69],
 [1, 14, 9, 68, 69, 20],
 [1, 2, 9, 10, 11, 9, 68, 69],
 [1, 2, 4, 68, 69],
 [1, 2, 8, 33, 13],
 [3, 33, 13],
 [1, 14, 8, 33, 13, 97],
 [1, 22, 33, 215],
 [216],
 [217, 95, 218],
 [219],
 [220, 221, 222],
 [223, 4, 163, 164],
 [1, 2, 4, 29, 70],
 [1, 2, 224, 225],
 [1, 2, 4, 70],
 [70],
 [3, 70],
 [1, 2, 98],
 [98],
 [3, 98],
 [1, 2, 4, 17],
 [1, 22, 30],
 [3, 29, 30],
 [3, 30, 15, 29],
 [3, 30],
 [1, 2, 4, 10, 15, 29],
 [1, 2, 4, 29, 10],
 [3, 29, 10],
 [3, 71],
 [3, 71, 15, 29],
 [1, 2, 34],
 [1, 2, 9, 165, 11, 34],
 [6, 34, 11, 30],
 [3, 34],
 [34, 15, 29],
 [1, 2, 4, 47, 25],
 [1, 2, 226, 25],
 [3, 47, 25, 11, 30],
 [3, 47, 227],
 [1, 22, 24, 99],
 [5, 166, 24, 99, 22, 162],
 [5, 2, 24, 167, 15, 29],
 [3, 24, 99],
 [1, 2, 4, 168, 169, 170],
 [35

In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

pattern_sequences_padded = pad_sequences(
    pattern_sequences, maxlen = max_words, padding="post", truncating="post"
)

In [48]:
pattern_sequences_padded

array([[208,   0,   0, ...,   0,   0,   0],
       [  5,  22,  95, ...,   0,   0,   0],
       [  2, 209, 162, ...,   0,   0,   0],
       ...,
       [ 35, 160, 161, ...,   0,   0,   0],
       [  3, 160, 161, ...,   0,   0,   0],
       [  1,   2, 160, ...,   0,   0,   0]])

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

model = Sequential()

model.add(Embedding(num_words, 32, input_length=max_words))
model.add(LSTM(64, dropout=0.3))
model.add(Dense(labels.shape[1], activation = "softmax"))

optimizer = Adam(learning_rate = 0.001)

model.compile(loss="categorical_crossentropy", optimizer = optimizer, metrics =["accuracy"])

In [50]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            9344      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 136)               8840      
Total params: 43,016
Trainable params: 43,016
Non-trainable params: 0
_________________________________________________________________


In [51]:
model.fit(pattern_sequences_padded, labels, epochs = 600, batch_size = 32)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

Epoch 83/600
Epoch 84/600
Epoch 85/600
Epoch 86/600
Epoch 87/600
Epoch 88/600
Epoch 89/600
Epoch 90/600
Epoch 91/600
Epoch 92/600
Epoch 93/600
Epoch 94/600
Epoch 95/600
Epoch 96/600
Epoch 97/600
Epoch 98/600
Epoch 99/600
Epoch 100/600
Epoch 101/600
Epoch 102/600
Epoch 103/600
Epoch 104/600
Epoch 105/600
Epoch 106/600
Epoch 107/600
Epoch 108/600
Epoch 109/600
Epoch 110/600
Epoch 111/600
Epoch 112/600
Epoch 113/600
Epoch 114/600
Epoch 115/600
Epoch 116/600
Epoch 117/600
Epoch 118/600
Epoch 119/600
Epoch 120/600
Epoch 121/600
Epoch 122/600
Epoch 123/600
Epoch 124/600
Epoch 125/600
Epoch 126/600
Epoch 127/600
Epoch 128/600
Epoch 129/600
Epoch 130/600
Epoch 131/600
Epoch 132/600
Epoch 133/600
Epoch 134/600
Epoch 135/600
Epoch 136/600
Epoch 137/600
Epoch 138/600
Epoch 139/600
Epoch 140/600
Epoch 141/600
Epoch 142/600
Epoch 143/600
Epoch 144/600
Epoch 145/600
Epoch 146/600
Epoch 147/600
Epoch 148/600
Epoch 149/600
Epoch 150/600
Epoch 151/600
Epoch 152/600
Epoch 153/600
Epoch 154/600
Epoch 155

Epoch 164/600
Epoch 165/600
Epoch 166/600
Epoch 167/600
Epoch 168/600
Epoch 169/600
Epoch 170/600
Epoch 171/600
Epoch 172/600
Epoch 173/600
Epoch 174/600
Epoch 175/600
Epoch 176/600
Epoch 177/600
Epoch 178/600
Epoch 179/600
Epoch 180/600
Epoch 181/600
Epoch 182/600
Epoch 183/600
Epoch 184/600
Epoch 185/600
Epoch 186/600
Epoch 187/600
Epoch 188/600
Epoch 189/600
Epoch 190/600
Epoch 191/600
Epoch 192/600
Epoch 193/600
Epoch 194/600
Epoch 195/600
Epoch 196/600
Epoch 197/600
Epoch 198/600
Epoch 199/600
Epoch 200/600
Epoch 201/600
Epoch 202/600
Epoch 203/600
Epoch 204/600
Epoch 205/600
Epoch 206/600
Epoch 207/600
Epoch 208/600
Epoch 209/600
Epoch 210/600
Epoch 211/600
Epoch 212/600
Epoch 213/600
Epoch 214/600
Epoch 215/600
Epoch 216/600
Epoch 217/600
Epoch 218/600
Epoch 219/600
Epoch 220/600
Epoch 221/600
Epoch 222/600
Epoch 223/600
Epoch 224/600
Epoch 225/600
Epoch 226/600
Epoch 227/600
Epoch 228/600
Epoch 229/600
Epoch 230/600
Epoch 231/600
Epoch 232/600
Epoch 233/600
Epoch 234/600
Epoch 

Epoch 244/600
Epoch 245/600
Epoch 246/600
Epoch 247/600
Epoch 248/600
Epoch 249/600
Epoch 250/600
Epoch 251/600
Epoch 252/600
Epoch 253/600
Epoch 254/600
Epoch 255/600
Epoch 256/600
Epoch 257/600
Epoch 258/600
Epoch 259/600
Epoch 260/600
Epoch 261/600
Epoch 262/600
Epoch 263/600
Epoch 264/600
Epoch 265/600
Epoch 266/600
Epoch 267/600
Epoch 268/600
Epoch 269/600
Epoch 270/600
Epoch 271/600
Epoch 272/600
Epoch 273/600
Epoch 274/600
Epoch 275/600
Epoch 276/600
Epoch 277/600
Epoch 278/600
Epoch 279/600
Epoch 280/600
Epoch 281/600
Epoch 282/600
Epoch 283/600
Epoch 284/600
Epoch 285/600
Epoch 286/600
Epoch 287/600
Epoch 288/600
Epoch 289/600
Epoch 290/600
Epoch 291/600
Epoch 292/600
Epoch 293/600
Epoch 294/600
Epoch 295/600
Epoch 296/600
Epoch 297/600
Epoch 298/600
Epoch 299/600
Epoch 300/600
Epoch 301/600
Epoch 302/600
Epoch 303/600
Epoch 304/600
Epoch 305/600
Epoch 306/600
Epoch 307/600
Epoch 308/600
Epoch 309/600
Epoch 310/600
Epoch 311/600
Epoch 312/600
Epoch 313/600
Epoch 314/600
Epoch 

Epoch 324/600
Epoch 325/600
Epoch 326/600
Epoch 327/600
Epoch 328/600
Epoch 329/600
Epoch 330/600
Epoch 331/600
Epoch 332/600
Epoch 333/600
Epoch 334/600
Epoch 335/600
Epoch 336/600
Epoch 337/600
Epoch 338/600
Epoch 339/600
Epoch 340/600
Epoch 341/600
Epoch 342/600
Epoch 343/600
Epoch 344/600
Epoch 345/600
Epoch 346/600
Epoch 347/600
Epoch 348/600
Epoch 349/600
Epoch 350/600
Epoch 351/600
Epoch 352/600
Epoch 353/600
Epoch 354/600
Epoch 355/600
Epoch 356/600
Epoch 357/600
Epoch 358/600
Epoch 359/600
Epoch 360/600
Epoch 361/600
Epoch 362/600
Epoch 363/600
Epoch 364/600
Epoch 365/600
Epoch 366/600
Epoch 367/600
Epoch 368/600
Epoch 369/600
Epoch 370/600
Epoch 371/600
Epoch 372/600
Epoch 373/600
Epoch 374/600
Epoch 375/600
Epoch 376/600
Epoch 377/600
Epoch 378/600
Epoch 379/600
Epoch 380/600
Epoch 381/600
Epoch 382/600
Epoch 383/600
Epoch 384/600
Epoch 385/600
Epoch 386/600
Epoch 387/600
Epoch 388/600
Epoch 389/600
Epoch 390/600
Epoch 391/600
Epoch 392/600
Epoch 393/600
Epoch 394/600
Epoch 

Epoch 404/600
Epoch 405/600
Epoch 406/600
Epoch 407/600
Epoch 408/600
Epoch 409/600
Epoch 410/600
Epoch 411/600
Epoch 412/600
Epoch 413/600
Epoch 414/600
Epoch 415/600
Epoch 416/600
Epoch 417/600
Epoch 418/600
Epoch 419/600
Epoch 420/600
Epoch 421/600
Epoch 422/600
Epoch 423/600
Epoch 424/600
Epoch 425/600
Epoch 426/600
Epoch 427/600
Epoch 428/600
Epoch 429/600
Epoch 430/600
Epoch 431/600
Epoch 432/600
Epoch 433/600
Epoch 434/600
Epoch 435/600
Epoch 436/600
Epoch 437/600
Epoch 438/600
Epoch 439/600
Epoch 440/600
Epoch 441/600
Epoch 442/600
Epoch 443/600
Epoch 444/600
Epoch 445/600
Epoch 446/600
Epoch 447/600
Epoch 448/600
Epoch 449/600
Epoch 450/600
Epoch 451/600
Epoch 452/600
Epoch 453/600
Epoch 454/600
Epoch 455/600
Epoch 456/600
Epoch 457/600
Epoch 458/600
Epoch 459/600
Epoch 460/600
Epoch 461/600
Epoch 462/600
Epoch 463/600
Epoch 464/600
Epoch 465/600
Epoch 466/600
Epoch 467/600
Epoch 468/600
Epoch 469/600
Epoch 470/600
Epoch 471/600
Epoch 472/600
Epoch 473/600
Epoch 474/600
Epoch 

Epoch 484/600
Epoch 485/600
Epoch 486/600
Epoch 487/600
Epoch 488/600
Epoch 489/600
Epoch 490/600
Epoch 491/600
Epoch 492/600
Epoch 493/600
Epoch 494/600
Epoch 495/600
Epoch 496/600
Epoch 497/600
Epoch 498/600
Epoch 499/600
Epoch 500/600
Epoch 501/600
Epoch 502/600
Epoch 503/600
Epoch 504/600
Epoch 505/600
Epoch 506/600
Epoch 507/600
Epoch 508/600
Epoch 509/600
Epoch 510/600
Epoch 511/600
Epoch 512/600
Epoch 513/600
Epoch 514/600
Epoch 515/600
Epoch 516/600
Epoch 517/600
Epoch 518/600
Epoch 519/600
Epoch 520/600
Epoch 521/600
Epoch 522/600
Epoch 523/600
Epoch 524/600
Epoch 525/600
Epoch 526/600
Epoch 527/600
Epoch 528/600
Epoch 529/600
Epoch 530/600
Epoch 531/600
Epoch 532/600
Epoch 533/600
Epoch 534/600
Epoch 535/600
Epoch 536/600
Epoch 537/600
Epoch 538/600
Epoch 539/600
Epoch 540/600
Epoch 541/600
Epoch 542/600
Epoch 543/600
Epoch 544/600
Epoch 545/600
Epoch 546/600
Epoch 547/600
Epoch 548/600
Epoch 549/600
Epoch 550/600
Epoch 551/600
Epoch 552/600
Epoch 553/600
Epoch 554/600
Epoch 

Epoch 564/600
Epoch 565/600
Epoch 566/600
Epoch 567/600
Epoch 568/600
Epoch 569/600
Epoch 570/600
Epoch 571/600
Epoch 572/600
Epoch 573/600
Epoch 574/600
Epoch 575/600
Epoch 576/600
Epoch 577/600
Epoch 578/600
Epoch 579/600
Epoch 580/600
Epoch 581/600
Epoch 582/600
Epoch 583/600
Epoch 584/600
Epoch 585/600
Epoch 586/600
Epoch 587/600
Epoch 588/600
Epoch 589/600
Epoch 590/600
Epoch 591/600
Epoch 592/600
Epoch 593/600
Epoch 594/600
Epoch 595/600
Epoch 596/600
Epoch 597/600
Epoch 598/600
Epoch 599/600
Epoch 600/600


<tensorflow.python.keras.callbacks.History at 0x2ac53cea088>

In [70]:
x = ['what happens when we exceed the offset of an array']
seq = tokenizer.texts_to_sequences(x)
print(seq)
padded = pad_sequences(
    seq, maxlen = max_words, padding="post", truncating="post"
)
print(padded)
prediction = model.predict(padded)
print(classes[np.squeeze(np.argmax(prediction))])

[[1, 36, 180, 9, 103, 11, 8, 7]]
[[  1  36 180   9 103  11   8   7   0   0   0   0   0   0   0   0   0   0
    0   0]]
offset_exceed
