In [2]:
import zipfile
import os
import json

# Unzip the archive.zip file
zip_path = '/content/archive.zip'  # Path to your zip file
unzip_dir = '/content/dataset'  # Directory where to extract

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)

# Load the intents JSON file
json_path = os.path.join(unzip_dir, 'intents.json')
with open(json_path, 'r') as file:
    intents = json.load(file)

print(intents)

{'intents': [{'tag': 'greeting', 'patterns': ['Hi', 'How are you?', 'Is anyone there?', 'Hello', 'Good day', "What's up", 'how are ya', 'heyy', 'whatsup', '??? ??? ??'], 'responses': ['Hello!', 'Good to see you again!', 'Hi there, how can I help?'], 'context_set': ''}, {'tag': 'goodbye', 'patterns': ['cya', 'see you', 'bye bye', 'See you later', 'Goodbye', 'I am Leaving', 'Bye', 'Have a Good day', 'talk to you later', 'ttyl', 'i got to go', 'gtg'], 'responses': ['Sad to see you go :(', 'Talk to you later', 'Goodbye!', 'Come back soon'], 'context_set': ''}, {'tag': 'creator', 'patterns': ['what is the name of your developers', 'what is the name of your creators', 'what is the name of the developers', 'what is the name of the creators', 'who created you', 'your developers', 'your creators', 'who are your developers', 'developers', 'you are made by', 'you are made by whom', 'who created you', 'who create you', 'creators', 'who made you', 'who designed you'], 'responses': ['College student

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [14]:
# Extracting the data from the intents file
training_sentences = []
training_labels = []

for intent in intents['intents']:
    for pattern in intent['patterns']:
        training_sentences.append(pattern)
        training_labels.append(intent['tag'])

# Encode the labels
lbl_encoder = LabelEncoder()
training_labels = lbl_encoder.fit_transform(training_labels)

# Tokenize the sentences
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

In [15]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(lbl_encoder.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()



In [16]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, training_labels, test_size=0.2)

# Train the model
epochs = 500
history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val))

Epoch 1/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - accuracy: 0.0412 - loss: 3.6330 - val_accuracy: 0.0494 - val_loss: 3.6235
Epoch 2/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0664 - loss: 3.6114 - val_accuracy: 0.0494 - val_loss: 3.6056
Epoch 3/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0791 - loss: 3.5839 - val_accuracy: 0.0494 - val_loss: 3.5851
Epoch 4/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0758 - loss: 3.5372 - val_accuracy: 0.0494 - val_loss: 3.5950
Epoch 5/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0814 - loss: 3.4707 - val_accuracy: 0.0494 - val_loss: 3.5835
Epoch 6/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0556 - loss: 3.4890 - val_accuracy: 0.0494 - val_loss: 3.5750
Epoch 7/500
[1m11/11[0m [

In [11]:
# Save the model and the label encoder
model.save("chatbot_model.h5")

import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder.pickle', 'wb') as enc_file:
    pickle.dump(lbl_encoder, enc_file, protocol=pickle.HIGHEST_PROTOCOL)



In [18]:
# Load the model and tokenizer
model = tf.keras.models.load_model('chatbot_model.h5')

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pickle', 'rb') as enc_file:
    lbl_encoder = pickle.load(enc_file)

# Chat function
def chat():
    print("Start talking with the bot (type 'quit' to stop)!")
    while True:
        sentence = input("You: ")
        if sentence.lower() == "quit":
            break

        # Preprocess the input sentence
        sequence = tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequence, truncating='post', maxlen=max_len)

        # Predict the intent
        predictions = model.predict(padded_sequence)
        tag = lbl_encoder.inverse_transform([np.argmax(predictions)])

        # Print response based on the predicted intent
        for intent in intents['intents']:
            if intent['tag'] == tag:
                print(f"Bot: {np.random.choice(intent['responses'])}")

chat()



Start talking with the bot (type 'quit' to stop)!
You: Hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349ms/step
Bot: Good to see you again!
You: What are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Bot: You can call me Mind Reader.
You: How can you help?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Bot: You can ask me questions regarding college, and i will try to answer them
You: quit
