In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example path to the DailyDialog `dialogues_text.txt` file (replace with your actual path).
# load_data() reads this file line by line and splits every line on the `__eou__` marker.
dailydialog_path = 'C:/Users/Student/Desktop/ViraajM/chatbotProject/dailydialog/ijcnlp_dailydialog/dialogues_text.txt'

# Function to load and preprocess data
def load_data(file_path, max_samples=50000):
    """Load (question, answer) utterance pairs from a DailyDialog text file.

    Each line of the file is treated as one dialogue whose turns are
    separated by the ``__eou__`` marker. Every pair of consecutive turns
    inside a dialogue yields one sample: the earlier turn is the question
    and the following turn is the answer.

    Args:
        file_path: Path to the UTF-8 encoded dialogue file.
        max_samples: Maximum number of pairs to return. ``None`` disables
            the limit.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of
        strings, where ``answers[i]`` is the reply to ``questions[i]``.
    """
    questions = []
    answers = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A dialogue line ends with '__eou__', so splitting leaves an
            # empty trailing piece; drop it (and any other blank turn) so
            # it never becomes an "answer".
            turns = [turn.strip() for turn in line.split('__eou__')]
            turns = [turn for turn in turns if turn]
            # Pair each turn with the one that follows it. Dialogues with
            # fewer than two turns produce no pairs.
            for question, answer in zip(turns, turns[1:]):
                if max_samples is not None and len(questions) >= max_samples:
                    return questions, answers
                questions.append(question)
                answers.append(answer)
    return questions, answers

# Load and preprocess the data
questions, answers = load_data(dailydialog_path)

# Adjust max_seq_length based on your dataset
max_seq_length = 30

# Tokenization: one shared vocabulary for questions and answers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
tokenizer.fit_on_texts(answers)

# Register the decoder's start/end markers as extra vocabulary entries.
# They are added by id rather than by editing the text because the default
# Tokenizer filters strip '<' and '>', so a literal '<start>' in the text
# would never survive as a token. generate_response() looks these up via
# tokenizer.word_index / tokenizer.index_word.
for marker in ('<start>', '<end>'):
    if marker not in tokenizer.word_index:
        marker_id = len(tokenizer.word_index) + 1
        tokenizer.word_index[marker] = marker_id
        tokenizer.index_word[marker_id] = marker
start_token_id = tokenizer.word_index['<start>']
end_token_id = tokenizer.word_index['<end>']

# Vocabulary size (+1 because id 0 is reserved for padding)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
question_sequences = tokenizer.texts_to_sequences(questions)

# Wrap every answer as <start> ... <end>. The words are cut to
# max_seq_length - 1 so that the wrapped sequence never exceeds
# max_seq_length + 1 and the <end> marker is never truncated away.
answer_sequences = [
    [start_token_id] + seq[:max_seq_length - 1] + [end_token_id]
    for seq in tokenizer.texts_to_sequences(answers)
]

# Padding sequences. Answers get one extra step because training shifts them
# by one position to form the decoder input and the decoder target.
question_sequences = pad_sequences(question_sequences, maxlen=max_seq_length, padding='post')
answer_sequences = pad_sequences(answer_sequences, maxlen=max_seq_length+1, padding='post')

# Define the model architecture
embedding_dim = 256
lstm_units = 512

# Encoder: embeds the question and keeps only the final LSTM states
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and predicts a token at every step.
# The embedding layer is kept in its own variable so the inference decoder
# below can reuse the trained weights.
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Inference models used by generate_response(). They share their layers (and
# therefore their weights) with `model`, so they can be used once it is trained.
# encoder_model: padded question -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# decoder_model: (one token, state_h, state_c) -> (token probabilities, new state_h, new state_c)
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_step_inputs = Input(shape=(1,))
decoder_step_embedding = decoder_embedding_layer(decoder_step_inputs)
decoder_step_outputs, decoder_step_h, decoder_step_c = decoder_lstm(
    decoder_step_embedding, initial_state=decoder_states_inputs
)
decoder_step_outputs = decoder_dense(decoder_step_outputs)
decoder_model = Model(
    [decoder_step_inputs] + decoder_states_inputs,
    [decoder_step_outputs, decoder_step_h, decoder_step_c],
)

# Print model summary
model.summary()

# Build the training tensors. The decoder is fed the answer without its last
# position and is asked to predict the answer without its first position,
# i.e. the same sequence shifted by one step.
target_data = answer_sequences[:, 1:]
input_data = [question_sequences, answer_sequences[:, :-1]]

# Shapes of the encoder input, decoder input and decoder target, for debugging
print(f"Input data shape: {input_data[0].shape}, {input_data[1].shape}")
print(f"Target data shape: {target_data.shape}")

# Fit the model, holding out 20% of the samples for validation
model.fit(
    x=input_data,
    y=target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Example usage of generating response (assuming the model is trained)
def generate_response(input_text):
    """Greedily decode a reply to ``input_text`` with the trained seq2seq model.

    Relies on the module-level ``tokenizer``, ``max_seq_length``,
    ``encoder_model`` and ``decoder_model``. The tokenizer must contain the
    ``'<start>'`` and ``'<end>'`` entries.

    Args:
        input_text: The user's message as a plain string.

    Returns:
        The generated reply: the decoded tokens joined by single spaces, with
        no trailing space. It is an empty string if the decoder stops
        immediately.

    Raises:
        KeyError: If ``'<start>'`` is missing from ``tokenizer.word_index``.
    """
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_length, padding='post')

    # Encode the input sequence into the decoder's initial [h, c] states.
    # verbose=0 keeps predict() from printing a progress bar on every call.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Target sequence of length 1 holding the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    generated_tokens = []
    # Exit condition: either hit max length or find a stop token
    while len(generated_tokens) < max_seq_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: take the most probable token
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Id 0 is padding and has no entry in index_word; treat it (and any
        # other unknown id) as end of reply instead of raising KeyError.
        sampled_token = tokenizer.index_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == '<end>':
            break
        generated_tokens.append(sampled_token)

        # Feed the chosen token and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(generated_tokens)

# Example usage: send one prompt through generate_response() and print the reply.
# generate_response() relies on the module-level `tokenizer`, `encoder_model` and
# `decoder_model`, so these must exist (and the model be trained) before this runs.
input_text = "Can you help me with my computer?"
response = generate_response(input_text)
print(f"Bot: {response}")


Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_35 (InputLayer)       [(None, 30)]                 0         []                            
                                                                                                  
 input_36 (InputLayer)       [(None, 30)]                 0         []                            
                                                                                                  
 embedding_34 (Embedding)    (None, 30, 256)              2102092   ['input_35[0][0]']            
                                                          8                                       
                                                                                                  
 embedding_35 (Embedding)    (None, 30, 256)              2102092   ['input_36[0][0]']     