In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import tensorflow as tf

# Check if GPU is available. The device list is queried once and reused for
# every line of the report below.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpu_devices))
print("GPU Devices:", gpu_devices)

# Check TensorFlow version and GPU info.
print("TensorFlow Version:", tf.__version__)
# tf.test.is_gpu_available() is deprecated; a non-empty physical device list
# is the supported way to answer the same question.
print("Is GPU available?", bool(gpu_devices))

Unnamed: 0,English,Tamil,Category
0,I slept.,நான் தூங்கினேன்.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
1,Calm down.,அமைதியாக இருங்கள்,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2,I'll walk.,நான் நடப்பேன்.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Who is he?,அவன் யார்?,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
4,Who knows?,யாருக்குத் தெரியும்?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [None]:
# Load dataset: read the tab-separated dialogue file into a DataFrame and
# preview its first rows.
# NOTE(review): later cells read the 'input_text' and 'output_text' columns,
# so the file is presumably expected to carry that header row - confirm.
import pandas as pd

df = pd.read_csv(
    '/kaggle/input/dialogues-tab-separated-txt/dialog_tab.txt',
    delimiter='\t',
)
df.head()

In [8]:
# Count the missing values in each column of the loaded frame.
df.isnull().sum()

English    0
Tamil      0
dtype: int64

In [None]:
# Preprocess data: source sentences are used as-is, target sentences are
# wrapped in start/end markers for the decoder.
# NOTE(review): the decoding cell looks these markers up as "start" / "end",
# i.e. it relies on the tokenizer dropping the angle brackets - confirm this
# holds for the Tokenizer settings in use.
input_texts = df['input_text'].values
target_texts = [
    " ".join(("<start>", sentence, "<end>"))
    for sentence in df['output_text'].values
]

# Source side: fit a tokenizer and turn every sentence into a list of ids.
tokenizer_in = Tokenizer()
tokenizer_in.fit_on_texts(input_texts)
input_seqs = tokenizer_in.texts_to_sequences(input_texts)

# Target side: same steps with a separate vocabulary.
tokenizer_out = Tokenizer()
tokenizer_out.fit_on_texts(target_texts)
target_seqs = tokenizer_out.texts_to_sequences(target_texts)

# Longest sequence on each side determines the padded width.
max_encoder_len = max(map(len, input_seqs))
max_decoder_len = max(map(len, target_seqs))

# Prepare encoder and decoder input data (zero-padded at the end of each row).
encoder_input_data = pad_sequences(input_seqs, maxlen=max_encoder_len, padding='post')
decoder_input_data = pad_sequences(target_seqs, maxlen=max_decoder_len, padding='post')

# Vocabulary sizes: one extra slot because index 0 is used for padding and
# word_index starts at 1.
vocab_in = 1 + len(tokenizer_in.word_index)
vocab_out = 1 + len(tokenizer_out.word_index)

In [None]:
# Training hyper-parameters: mini-batch size, number of passes over the
# data, and the width used for both the embeddings and the LSTM state.
batch_size, epochs, latent_dim = 64, 200, 256

In [None]:
# Prepare decoder target data: the decoder input shifted one step to the
# left, so position t holds the token the decoder should emit after reading
# input token t. The freed last column is filled with 0 (the padding id).
decoder_target_data = np.pad(decoder_input_data[:, 1:], ((0, 0), (0, 1)))

In [None]:
from tensorflow.keras.layers import Embedding

latent_dim = 256

# Encoder: token ids -> embeddings -> LSTM. Only the final hidden and cell
# states are kept; they seed the decoder.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_in, output_dim=latent_dim)
enc_emb = encoder_embedding(encoder_inputs)

encoder_lstm = LSTM(units=latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: token ids -> embeddings -> LSTM started from the encoder states,
# returning the full output sequence so every step can be scored.
# NOTE(review): neither Embedding sets mask_zero, so padded positions appear
# to count towards loss and accuracy - confirm this is intended.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_out, output_dim=latent_dim)
dec_emb = decoder_embedding(decoder_inputs)

decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Per-step probability distribution over the target vocabulary.
decoder_dense = Dense(units=vocab_out, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
from tensorflow.keras.metrics import SparseCategoricalAccuracy

# Training model: takes the encoder and decoder token ids and predicts the
# next-token distribution at every decoder step. Targets are integer ids,
# hence the sparse loss and metric.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[SparseCategoricalAccuracy()],
)
model.summary()

In [None]:
# Train under an explicit '/GPU:0' device scope, holding out 20% of the
# samples for validation.
with tf.device('/GPU:0'):
    model.fit(
        x=[encoder_input_data, decoder_input_data],
        y=decoder_target_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
    )

In [None]:
# Encoder inference model: source token ids -> [hidden state, cell state].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder inference model: reuses the trained embedding, LSTM and dense
# layers, but takes the LSTM states as explicit inputs and returns the
# updated states so decoding can proceed one step at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(None,))
dec_emb2 = decoder_embedding(decoder_inputs_single)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs,
)
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    inputs=[decoder_inputs_single, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs2, state_h2, state_c2],
)


In [None]:
# model.save('seq2seq_chatbot_model.keras')

In [None]:
import os

# Restore previously trained weights, if a saved model file is present.
# Keras models have no `load` method, so the original call raised
# AttributeError. Loading the weights into the existing `model` (rather than
# building a new model object from the file) keeps the encoder/decoder
# inference models connected to the restored weights, because they share
# their layers with `model`.
_saved_model_path = 'seq2seq_chatbot_model.keras'
if os.path.exists(_saved_model_path):
    model.load_weights(_saved_model_path)
else:
    # The cell that writes this file is commented out, so the file may not
    # exist on a fresh run; keep whatever weights the model currently has.
    print(f"No saved model found at {_saved_model_path!r}; keeping the current in-memory weights.")

In [None]:
# Reverse lookup for the target vocabulary: token id -> word.
reverse_out_index = {
    token_id: word for word, token_id in tokenizer_out.word_index.items()
}

# Function to decode sequence
def decode_sequence(input_sentence):
    """Greedily generate a reply to ``input_sentence`` with the inference models.

    The sentence is tokenized with ``tokenizer_in`` and post-padded to
    ``max_encoder_len``, then encoded into the initial decoder states. The
    decoder is run one token at a time, always taking the highest-scoring
    token, until the "end" marker is produced or ``max_decoder_len`` steps
    have been taken.

    Args:
        input_sentence: Raw source text to respond to.

    Returns:
        The generated words joined by single spaces, or "" if the decoder
        produced no words before stopping.

    Raises:
        KeyError: If "start" is not in ``tokenizer_out.word_index``.
    """
    seq = tokenizer_in.texts_to_sequences([input_sentence])
    seq = pad_sequences(seq, maxlen=max_encoder_len, padding='post')

    # verbose=0 keeps Keras from printing a progress bar for every call;
    # list() makes the states safe to concatenate with [target_seq] below.
    states = list(encoder_model.predict(seq, verbose=0))

    # Decoding starts from a single-token sequence holding the start marker.
    target_seq = np.array([[tokenizer_out.word_index["start"]]])

    words = []
    for _ in range(max_decoder_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)

        # Greedy choice: most probable token at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_out_index.get(sampled_token_index, "")

        # NOTE(review): the stop marker is the plain word "end", so a reply
        # that legitimately contains "end" would be cut short here.
        if sampled_word == "end":
            break

        # Ids with no vocabulary entry (e.g. padding id 0) map to "" and are
        # skipped so they do not leave stray spaces in the reply.
        if sampled_word:
            words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]])
        states = [h, c]

    return " ".join(words)


In [None]:
# Generate and print the model's reply to a sample greeting.
print(decode_sequence("hi, how are you doing?"))

In [None]:
# Generate and print the model's reply to a sample follow-up message.
print(decode_sequence("i'm fine. what do you do"))