In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import tensorflow as tf

# Location of the DailyDialog `dialogues_text.txt` file read by load_data().
# NOTE(review): this is a hard-coded absolute Windows path, and unlike the GloVe
# path further down it has no `Student` directory after `C:/Users/` -- confirm
# which of the two is correct for the target machine.
dailydialog_path = 'C:/Users/Desktop/ViraajM/chatbotProject/ijcnlp_dailydialog/ijcnlp_dailydialog/dialogues_text.txt'

# Function to load and preprocess data
def load_data(file_path, max_samples=50000):
    """Read dialogues and return aligned question/answer utterances.

    Every line of the file is treated as one dialogue whose turns are
    separated by the ``__eou__`` marker. Each pair of consecutive turns in a
    dialogue yields one sample: the earlier turn is the question and the turn
    that follows it is the answer.

    Args:
        file_path: Path to a UTF-8 text file with one dialogue per line.
        max_samples: Maximum number of (question, answer) pairs to return.

    Returns:
        A tuple ``(questions, answers)`` of two equally long lists of strings
        in which ``answers[i]`` is the reply to ``questions[i]``.
    """
    questions = []
    answers = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Splitting can leave empty pieces (for example after a trailing
            # marker); drop them so they never become blank utterances.
            turns = [turn.strip() for turn in line.split('__eou__')]
            turns = [turn for turn in turns if turn]
            # Flat strings (not per-dialogue lists) are required so that the
            # tokenizer splits utterances into words instead of treating a
            # whole utterance as a single token.
            for question, answer in zip(turns, turns[1:]):
                if len(questions) >= max_samples:
                    return questions, answers
                questions.append(question)
                answers.append(answer)
    return questions, answers

# Fixed number of time steps fed to the encoder (tune for the dataset)
max_seq_length = 30

# Read the question/answer samples from disk
questions, answers = load_data(dailydialog_path)

# One tokenizer shared by both sides, fitted on questions followed by answers
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*questions, *answers])

# word_index ids start at 1, so one extra slot accounts for index 0 (padding)
vocab_size = 1 + len(tokenizer.word_index)

# Integer-encode, then pad/truncate to a fixed length with zeros at the end.
# Answers get one extra step because they are later sliced into a decoder
# input ([:, :-1]) and a target shifted by one position ([:, 1:]).
question_sequences = pad_sequences(
    tokenizer.texts_to_sequences(questions),
    maxlen=max_seq_length,
    padding='post',
)
answer_sequences = pad_sequences(
    tokenizer.texts_to_sequences(answers),
    maxlen=max_seq_length + 1,
    padding='post',
)

# Pre-trained GloVe vectors; point this at your local copy of the file
glove_path = 'C:/Users/Student/Desktop/ViraajM/chatbotProject/glove.6B/glove.6B.100d.txt'
embedding_dim = 100  # must equal the vector length stored in the GloVe file

# Parse the file line by line: the first whitespace-separated field is the
# word, the remaining fields are its float32 vector components.
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Row i holds the vector of the word whose tokenizer id is i; words without a
# GloVe entry (and row 0) keep their all-zero initialisation.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, token_id in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[token_id] = embeddings_index[token]

# Define the model architecture
# A single Embedding layer, initialised from embedding_matrix, is shared by the
# encoder and the decoder. mask_zero=True makes index 0 (the padding value) a
# masked time step for the layers downstream.
embedding_layer = Embedding(vocab_size, embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_seq_length,
                            mask_zero=True,
                            trainable=False)  # Frozen: the pre-trained vectors are not updated during training

# Encoder: embeds the question and keeps only the LSTM's final hidden/cell state
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = embedding_layer(encoder_inputs)
encoder_embedding = BatchNormalization()(encoder_embedding)
encoder_dropout = Dropout(0.5)(encoder_embedding)  # Drops half of the embedding activations while training
encoder_lstm = LSTM(512, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_dropout)
encoder_states = [state_h, state_c]

# Decoder: consumes the answer tokens, starting from the encoder's final state.
# The decoder layers are kept in named variables because the inference cell
# below calls the same layer objects again.
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = embedding_layer(decoder_inputs)
decoder_dropout = Dropout(0.5)(decoder_embedding)  # Drops half of the embedding activations while training
decoder_lstm = LSTM(512, return_sequences=True, return_state=True,
                    kernel_regularizer=l2(0.02),
                    recurrent_regularizer=l2(0.02),
                    bias_regularizer=l2(0.002))
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_dropout, initial_state=encoder_states)

# Intermediate projection applied to every decoder time step
decoder_dense1 = Dense(256, activation='relu')
decoder_dense_outputs = decoder_dense1(decoder_lstm_outputs)

# Output layer: a softmax over the whole vocabulary for every time step
decoder_dense2 = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense2(decoder_dense_outputs)

# Training model: (question ids, decoder input ids) -> per-step word distribution
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
# The sparse loss takes integer word ids as targets (no one-hot encoding);
# clipvalue is passed to Adam to clip gradients.
model.compile(optimizer=Adam(clipvalue=1.0), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# The padded answers are one step longer than the decoder input: dropping the
# last step gives what the decoder reads, dropping the first step gives the
# word it has to predict at each position.
target_data = answer_sequences[:, 1:]
input_data = [question_sequences, answer_sequences[:, :-1]]

# Shapes, printed for debugging
print(f"Input data shape: {input_data[0].shape}, {input_data[1].shape}")
print(f"Target data shape: {target_data.shape}")

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights. With epochs=1 below the run ends after a single epoch, so the
# patience window cannot come into play until epochs is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training settings; the last 20% of the samples are held out for validation
fit_options = dict(
    batch_size=64,
    epochs=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)
history = model.fit(input_data, target_data, **fit_options)


KeyboardInterrupt: 

In [None]:

# Inference encoder: question ids -> final [hidden, cell] state of the encoder LSTM
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholders through which the previous decoder state is fed back in at
# every generation step (512 matches the LSTM size used above)
decoder_states_inputs = [Input(shape=(512,)), Input(shape=(512,))]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

# Re-apply the trained decoder layers, this time driven by the fed-in state
# rather than by the encoder output
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense2(decoder_dense1(decoder_outputs))

# Inference decoder: (token ids, h, c) -> (word distribution, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)
def _sample_token(probabilities, temperature):
    """Draw one token index from a probability vector after temperature scaling."""
    # float64 plus an explicit renormalisation: float32 softmax output does
    # not always sum to 1 closely enough for np.random.choice.
    probs = np.asarray(probabilities, dtype=np.float64)
    if temperature <= 0:
        # Non-positive temperature means deterministic (greedy) decoding
        return int(np.argmax(probs))
    if temperature != 1.0:
        logits = np.log(np.clip(probs, 1e-12, None)) / temperature
        probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    return int(np.random.choice(len(probs), p=probs))


def generate_text(input_question, max_length=30, temperature=1.0):
    """Sample a reply to ``input_question`` with the inference encoder/decoder.

    The question is encoded once; the decoder is then run one token at a time,
    feeding each sampled token and the returned LSTM state back in.

    Args:
        input_question: Raw question text.
        max_length: Maximum number of words in the generated reply.
        temperature: Sampling temperature. 1.0 samples from the model's
            distribution as is, smaller values make the choice more peaked,
            larger values flatter; a value <= 0 always picks the most likely
            token.

    Returns:
        The generated words joined by single spaces, cut off after the fifth
        sentence-ending character ('.', '?' or '!') if there are that many.
    """
    input_seq = tokenizer.texts_to_sequences([input_question])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_length, padding='post')
    # verbose=0: without it every predict() call prints its own progress bar
    states_value = encoder_model.predict(input_seq, verbose=0)

    # NOTE(review): decoding is seeded with index 0, which the shared embedding
    # masks (mask_zero=True), and decoder_inputs was declared with
    # max_seq_length steps while a single step is fed here -- no start token is
    # visible in this notebook; confirm this is intended.
    target_seq = np.zeros((1, 1))
    words = []

    # `<` rather than the previous `>` check, which let max_length + 1 words through
    while len(words) < max_length:
        output_tokens, h, c = decoder_model.predict([target_seq, *states_value], verbose=0)
        sampled_token_index = _sample_token(output_tokens[0, -1, :], temperature)
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # Index 0 is the padding value that follows the last word of every
        # post-padded training answer, so sampling it marks the end of the reply.
        if sampled_token_index == 0 or sampled_word == '.':
            break
        words.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    # Keep at most five sentences of the generated text
    sentences = 0
    decoded = ""
    for character in ' '.join(words):
        decoded += character
        if character in ".?!":
            sentences += 1
            if sentences == 5:
                break
    return decoded.strip()


# Smoke test: ask one question and show the sampled reply beneath it
input_question = "How are you?"
decoded_answer = generate_text(input_question)
print(f"Question: {input_question}", f"Answer: {decoded_answer}", sep="\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load pretrained model and tokenizer
# NOTE: `tokenizer` and `model` rebind the Keras objects created in the cells
# above; generate_text() reads the global `tokenizer`, so it must not be called
# after this cell has run.
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-text token for padding when the tokenizer defines none.
# Registering a brand-new '<pad>' token gave it an id beyond the tokenizer's
# base vocabulary (50257 with vocab_size=50257 in the saved output) while the
# model's embeddings were never resized to cover it.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Verify tokenizer details
print(f"Tokenizer: {tokenizer}")

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.pad_token_id)

# Function to generate response
def generate_response(prompt, max_new_tokens=100):
    """Generate a single-turn reply to ``prompt`` with the loaded causal LM.

    Args:
        prompt: The user's message. The tokenizer's end-of-sequence token is
            appended to it before encoding.
        max_new_tokens: Upper bound on the number of tokens generated for the
            reply. Unlike a total-length limit, it does not shrink as the
            prompt gets longer.

    Returns:
        The decoded reply text, without the prompt and without special tokens.
    """
    input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt")
    # Keep the inputs on whatever device the model lives on
    input_ids = input_ids.to(model.device)
    # A single unpadded sequence: every position is a real token
    attention_mask = torch.ones_like(input_ids)
    response_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        # Stated explicitly so generate() does not have to fall back to the
        # EOS id on its own and warn about it on every call
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + reply; keep only the newly generated part
    reply_ids = response_ids[0, input_ids.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# Test the chatbot: read user turns until an exit word is typed
print("Chatbot: Hi! How can I help you today?")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupt at the prompt: leave the loop
        # cleanly instead of ending the cell with a traceback
        print("Chatbot: Goodbye!")
        break
    if not user_input:
        # Nothing typed; ask again rather than sending an empty prompt to the model
        continue
    # Compared after stripping so that e.g. "exit " is recognised as well
    if user_input.lower() in ['exit', 'quit', 'bye']:
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print(f"Chatbot: {response}")

Tokenizer: GPT2TokenizerFast(name_or_path='microsoft/DialoGPT-medium', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
Chatbot: Hi! How can I help you today?


You:  Hi


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: Hi! :D


You:  What color is the sky


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: It's a blue sky
