## **Import and load the data file**

In [1]:
import nltk

# Fetch the Punkt tokenizer models used by nltk.word_tokenize in the preprocessing cells.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/mangg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; the vocabulary and training cells both call
# lemmatizer.lemmatize() on lower-cased tokens.
lemmatizer = WordNetLemmatizer()
import json
import pickle
import warnings
# Suppress all Python warnings for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random

2023-11-15 12:08:08.939719: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-15 12:08:08.941655: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-15 12:08:08.975305: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-15 12:08:08.975338: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-15 12:08:08.975359: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

# **Preprocessing**

In [4]:
# Accumulators populated while tokenizing the intent patterns.
words = []                  # every token seen across all patterns
classes = []                # distinct intent tags
documents = []              # (token list, tag) pairs
ignore_words = ['?', '!']   # punctuation tokens excluded from the vocabulary

# Read the intents definition. The context manager closes the file handle
# deterministically (a bare open(...).read() leaves that to the garbage collector),
# and the explicit encoding avoids depending on the platform default.
with open('intents.json', encoding='utf-8') as intents_file:
    data_file = intents_file.read()  # raw JSON text
intents = json.loads(data_file)      # parsed JSON structure

In [5]:
# Walk every intent and each of its example patterns, collecting the tokens,
# the (tokens, tag) pairs and the distinct tags.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # split the pattern into word tokens
        tokens = nltk.word_tokenize(pattern)
        # grow the flat list of all tokens
        words.extend(tokens)
        tag = intent['tag']
        # remember which tag this tokenized pattern belongs to
        documents.append((tokens, tag))
        # record the tag the first time it is seen
        if tag in classes:
            continue
        classes.append(tag)

In [6]:
# Download the WordNet corpus, a lexical database for the English language.
# NOTE(review): the patterns printed later in this notebook are mostly Indonesian, which an
# English lexical database presumably does not cover — confirm lemmatization has the intended effect.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/mangg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Download the 'omw-1.4' NLTK data package (Open Multilingual WordNet).
# NOTE(review): presumably needed by the WordNet corpus reader in the installed NLTK version — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/mangg/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Now we will lemmatize each word and remove duplicate words from the list.
- Lemmatizing is the process of converting a word into its lemma (dictionary base) form. After lemmatizing, we create pickle files to store the Python objects (the vocabulary and the list of classes) that we will use while predicting.

In [8]:
# Normalize the vocabulary: drop ignored punctuation tokens, lower-case and lemmatize
# each remaining token, then de-duplicate and sort.
words = sorted({lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words})
# De-duplicate and sort the intent tags.
classes = sorted(set(classes))
# documents = combination between patterns and intents
print(len(documents), "documents\n", documents, "\n")
# classes = intents[tag]
print(len(classes), "classes\n", classes, "\n")
# words = all words, vocabulary
print(len(words), "unique lemmatized words\n", words, "\n")
# Persist the vocabulary and the class list. The context managers guarantee the
# files are flushed and closed; passing a bare open(...) to pickle.dump never closes it.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

79 documents
 [(['Hai'], 'ucapan'), (['Apa', 'kabar', '?'], 'ucapan'), (['Is', 'anyone', 'there', '?'], 'ucapan'), (['Halo'], 'ucapan'), (['Halo', '?'], 'ucapan'), (['Ada', 'Apa', '?'], 'ucapan'), (['Apa', 'Kabar', 'mu', '?'], 'ucapan'), (['heyy'], 'ucapan'), (['Apa', '?'], 'ucapan'), (['?', '?', '?', '?', '?', '?', '?', '?'], 'ucapan'), (['cya'], 'selamat_tinggal'), (['See', 'you'], 'selamat_tinggal'), (['bye', 'bye'], 'selamat_tinggal'), (['Senang', 'ketemu', 'kembali'], 'selamat_tinggal'), (['Goodbye'], 'selamat_tinggal'), (['Saya', 'pergi'], 'selamat_tinggal'), (['Bye'], 'selamat_tinggal'), (['Semoga', 'hari', 'indah'], 'selamat_tinggal'), (['Bicara', 'denganmu', 'nanti'], 'selamat_tinggal'), (['ttyl'], 'selamat_tinggal'), (['Saya', 'akan', 'pergi'], 'selamat_tinggal'), (['gtg'], 'selamat_tinggal'), (['apa', 'nama', 'pengembang', 'anda', '?'], 'pembuat'), (['apa', 'nama', 'kreator', 'anda', '?'], 'pembuat'), (['siapa', 'nama', 'pengembangnya'], 'pembuat'), (['siapa', 'nama', 'kreat

# **Training Model**

In [9]:
# Build the training set: one [bag-of-words vector, one-hot tag vector] pair per document.
training = []
# Template for the one-hot output row: one zero per intent tag.
output_empty = [0] * len(classes)
for pattern_tokens, tag in documents:
    # Normalize the pattern tokens the same way the vocabulary was normalized
    # (lower-case, then lemmatize). A set keeps the membership test below O(1).
    pattern_words = {lemmatizer.lemmatize(token.lower()) for token in pattern_tokens}
    # Bag of words: 1 where the vocabulary word occurs in this pattern, otherwise 0.
    bag = [1 if w in pattern_words else 0 for w in words]

    # One-hot output row: 1 at the index of this document's tag, 0 elsewhere.
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle so the training order does not follow the order of intents in the JSON file.
random.shuffle(training)
# create train and test. X - patterns(words), Y - intents(tags)
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]
# Each row pairs two lists of different lengths (len(words) vs len(classes)), i.e. the
# nested list is ragged. NumPy >= 1.24 raises ValueError for ragged input unless
# dtype=object is given explicitly, so request it here.
training = np.array(training, dtype=object)
print("Training data created")

Training data created


In [10]:
# Clear TensorFlow's default graph stack and reset the global default graph before
# building the model. This calls the public tf.compat.v1 endpoint rather than importing
# `ops` from the private tensorflow.python package, whose layout is not a stable API.
tf.compat.v1.reset_default_graph()

# **Build the model** 

In [11]:
# Feed-forward classifier with three Dense layers: 128 and 64 ReLU units, each
# followed by 50% dropout, then a softmax output layer with one unit per entry
# in a training label row (i.e. one per intent).
n_features = len(train_x[0])
n_intents = len(train_y[0])
model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_intents, activation='softmax'),
])
# Show the freshly initialized kernel of the first Dense layer.
print("First layer:",model.layers[0].get_weights()[0])

First layer: [[ 0.15011743  0.0970507  -0.07103215 ... -0.10040861 -0.13289784
   0.13589278]
 [ 0.0366894   0.03165027  0.12249193 ... -0.09169336  0.11976612
  -0.02522385]
 [-0.09291448  0.12480813  0.04549734 ... -0.13745677 -0.05613062
  -0.05679071]
 ...
 [-0.03109999 -0.0753238  -0.09726171 ... -0.04403056  0.02472316
   0.0910314 ]
 [-0.0497993   0.02873389 -0.08725252 ...  0.15938249 -0.14352742
   0.07635112]
 [-0.15737627 -0.09886788 -0.07162287 ... -0.05536497 -0.12644018
   0.12198389]]


In [12]:
# Compile the model for multi-class classification: categorical cross-entropy loss,
# the Adam optimizer (selected by name) and accuracy as the reported metric.
# NOTE(review): an earlier SGD-with-Nesterov-momentum configuration is kept below, commented out and unused:
# sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train on the full training set; `hist` keeps the History object returned by fit().
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
# Persist the trained model. save() only needs the target path: the History object was
# previously passed as a second positional argument, where it landed in the `overwrite`
# slot and only worked because it is truthy.
model.save('chatbot_model.h5')

print("model created")


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78