# Problem statement

Goal: develop a simple chatbot that can answer basic questions about a specific topic (here, Canadian citizenship test questions).

reference: https://handsonai.medium.com/build-a-chat-bot-from-scratch-using-python-and-tensorflow-fd189bcfae45

# Setup environment

In [6]:
# import libraries
import nltk
from nltk.stem.lancaster import LancasterStemmer

import numpy as np
import json
import random
import pickle
import requests

import tensorflow as tf

# Load and Preprocess Data

Data source reference (citizenship test answer keys): https://www.yourlibrary.ca/citizenship-test-answer-keys/

In [7]:
# load data
# URL of the intents JSON file (tags, example patterns and canned responses)
url = "https://raw.githubusercontent.com/wpjerrykwok/LLM-chatBot/main/intents.json"

# make a GET request to the URL; the timeout keeps the cell from hanging
# indefinitely on a stalled connection
response = requests.get(url, timeout=30)

# Stop here on failure: every later cell needs `raw_data`, so merely printing
# the error would only postpone the crash to a confusing NameError downstream.
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code {response.status_code}")

# parse the response as JSON
raw_data = response.json()
# print the raw data
print(raw_data)

{'intents': [{'tag': 'greeting', 'patterns': ['Hi there', 'How are you', 'Is anyone there?', 'Hey', 'Hola', 'Hello', 'Good day'], 'responses': ['Hello, thanks for asking', 'Good to see you again', 'Hi there, how can I help?'], 'context': ['']}, {'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye', 'Nice chatting to you, bye', 'Till next time'], 'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'], 'context': ['']}, {'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', "That's helpful", 'Awesome, thanks', 'Thanks for helping me'], 'responses': ['Happy to help!', 'Any time!', 'My pleasure'], 'context': ['']}, {'tag': 'noanswer', 'patterns': [], 'responses': ["Sorry, can't understand you", 'Please give me more info', 'Not sure I understand'], 'context': ['']}, {'tag': 'question_1', 'patterns': ['A Member of Parliament from Montreal announces that she will spend her weekend in her electoral district. This means she would be:'], 'responses': ['In the part 

In [8]:
# Single Lancaster stemmer instance, reused both when building the vocabulary
# from the training patterns and when encoding user input in bag_of_words().
stemmer = LancasterStemmer()

In [9]:
# Reuse the cached vocabulary / training matrices when they exist; otherwise
# build them from raw_data and cache them for the next run.
# NOTE(review): pickle.load can execute arbitrary code, so data.pickle must only
# ever be a file produced locally by this notebook. The cache is also not
# invalidated when the intents file changes -- delete data.pickle to rebuild.
try:
    with open('data.pickle', 'rb') as data_file:
        words, labels, training, output = pickle.load(data_file)
except (OSError, EOFError, pickle.UnpicklingError, ValueError):
    # Cache is missing, unreadable, or not the expected 4-tuple: rebuild it.
    # (Anything else, e.g. KeyboardInterrupt, is deliberately left to propagate.)
    words = []
    labels = []
    docs_x = []  # tokenized pattern per training example
    docs_y = []  # intent tag per training example (parallel to docs_x)

    for intent in raw_data['intents']:
        for pattern in intent['patterns']:
            tokenized_words = nltk.word_tokenize(pattern)
            words.extend(tokenized_words)
            docs_x.append(tokenized_words)
            docs_y.append(intent['tag'])

        # Tags without patterns (e.g. a fallback intent) still get a label slot.
        if intent['tag'] not in labels:
            labels.append(intent['tag'])

    # Vocabulary: unique, sorted, lower-cased stems; bare question marks are dropped.
    words = sorted({stemmer.stem(w.lower()) for w in words if w != '?'})
    labels = sorted(labels)

    # Position of each tag in the one-hot output vector.
    label_index = {label: position for position, label in enumerate(labels)}

    # create training and output data
    training = []
    output = []

    for doc, tag in zip(docs_x, docs_y):
        # A set makes the per-vocabulary-word membership test O(1).
        stemmed_words = {stemmer.stem(w.lower()) for w in doc}

        # Bag of words: 1 where the vocabulary word occurs in this pattern.
        training.append([1 if w in stemmed_words else 0 for w in words])

        # One-hot encoding of the pattern's intent tag.
        output_row = [0] * len(labels)
        output_row[label_index[tag]] = 1
        output.append(output_row)

    # convert to numpy arrays
    training = np.array(training)
    output = np.array(output)

    # save data
    with open('data.pickle', 'wb') as data_file:
        pickle.dump((words, labels, training, output), data_file)

# Train the model

In [10]:
# Small feed-forward classifier: bag-of-words vector in, one softmax
# probability per intent tag out.
# NOTE(review): the two hidden Dense layers have no activation, so together they
# are a single linear map. Consider activation='relu'; that would require
# deleting model.keras and retraining, so it is not changed here.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(8, input_shape=[len(training[0])]))
model.add(tf.keras.layers.Dense(8))
model.add(tf.keras.layers.Dense(len(output[0]), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Reuse previously trained weights when possible, otherwise train and save.
# `Exception` (rather than a bare except) is caught because the error raised for
# a missing or incompatible file differs between Keras versions, while
# KeyboardInterrupt / SystemExit must not silently start a 1000-epoch training run.
try:
    model.load_weights('model.keras')
except Exception as load_error:
    print(f"Could not load cached weights ({load_error!r}); training from scratch.")
    model.fit(training, output, epochs=1000, batch_size=8)
    model.save('model.keras')

In [11]:
#
def bag_of_words(s, words):
    """Encode a sentence as a binary bag-of-words vector over `words`.

    The sentence is tokenized with nltk.word_tokenize, and each token is
    lower-cased and stemmed with the module-level `stemmer`.

    Args:
        s: The input sentence (string).
        words: The vocabulary of stems; position i of the result corresponds
            to words[i].

    Returns:
        A numpy array of length len(words) holding 1 where the vocabulary
        stem occurs in the sentence and 0 elsewhere.
    """
    # A set gives O(1) membership tests instead of rescanning the whole
    # vocabulary once per token.
    sentence_stems = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}

    return np.array([1 if word in sentence_stems else 0 for word in words])

In [12]:
def chat(min_confidence=0.0):
    """Run an interactive chat loop on stdin/stdout until the user types 'quit'.

    Each input line is encoded with bag_of_words(), classified by the
    module-level `model`, and answered with a random response of the
    predicted intent from `raw_data`.

    Args:
        min_confidence: Minimum predicted probability required to answer with
            the predicted intent. Below it, the fallback responses are used.
            The default 0.0 always answers with the top-scoring intent.
    """
    # Build the tag -> responses lookup once instead of rescanning all intents
    # on every turn. On duplicate tags the last one wins, as in a full scan.
    responses_by_tag = {
        intent['tag']: intent.get('responses', []) for intent in raw_data['intents']
    }
    # Fallback for low-confidence predictions, or tags with no usable responses
    # (e.g. when the cached labels no longer match the intents file).
    fallback_responses = responses_by_tag.get('noanswer') or ["Sorry, can't understand you"]

    print('Start talking with the bot! (type quit to stop)')
    while True:
        inp = input('You: ')
        if inp.strip().lower() == 'quit':
            break

        # verbose=0 keeps the Keras progress bar out of the conversation.
        results = model.predict(np.array([bag_of_words(inp, words)]), verbose=0)
        probabilities = results[0]  # single-row batch: one probability per label
        results_index = int(np.argmax(probabilities))
        tag = labels[results_index]

        responses = responses_by_tag.get(tag)
        if not responses or probabilities[results_index] < min_confidence:
            responses = fallback_responses

        print(random.choice(responses))

In [15]:
# Launch the interactive chat loop; enter "quit" at the prompt to leave it.
chat()

Start talking with the bot! (type quit to stop)
Good to see you again
Good to see you again
Good to see you again
Hi there, how can I help?
Hi there, how can I help?
Hi there, how can I help?
