In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, Dense, LSTM, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [2]:

# 1. Data Preparation
# Load the CSV file and process conversations
df = pd.read_csv('/kaggle/input/chatdata3/chat_data.csv',nrows=10000)

In [3]:

pattern = r"'value':\s\"(.*?)\""

final_data = {'Column 1': [], 'Column 2': []}
for input_string in df['conversations']:
    messages = re.findall(pattern, input_string)
    temp_data = {'Column 1': [], 'Column 2': []}
    for i, message in enumerate(messages):
        if i % 2 == 0:
            temp_data['Column 1'].append(message)
            temp_data['Column 2'].append('')
        else:
            temp_data['Column 2'][-1] = message
    final_data['Column 1'].extend(temp_data['Column 1'])
    final_data['Column 2'].extend(temp_data['Column 2'])

final_data = pd.DataFrame(final_data)

In [4]:
final_data

Unnamed: 0,Column 1,Column 2
0,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,"I recently got a promotion at work, which I th...",I can understand how it can be overwhelming wh...
2,"Well, the workload has increased significantly...",It sounds like you're dealing with a lot of pr...
3,I've been trying to prioritize my tasks and de...,It's great to hear that you're already impleme...
4,You're right. I haven't really opened up about...,"It's completely normal to feel that way, but r..."
...,...,...
70274,Great! Let's start by finding a quiet and comf...,(Takes a deep breath) I'm starting to feel a b...
70275,"That's wonderful, Charlie. Remember, it's norm...",I'll keep that in mind. This feels like someth...
70276,"You're very welcome, Charlie. I'm glad I could...","I truly appreciate your support, Alex. You've ..."
70277,"I'm so glad to hear that, Charlie. Remember, t...","Thank you, Alex. Your warmth and guidance mean..."


In [5]:
# Combine both columns into a single text chunk
combined_full_text = " ".join(final_data["Column 1"].fillna("") + " " + final_data["Column 2"].fillna(""))


In [6]:
len(combined_full_text)

33859914

In [7]:
# !python -m pip install contractions


In [8]:
combined_text = combined_full_text[:300000]

In [9]:
len(combined_text)

300000

In [10]:
import pandas as pd

# Assuming 'combined_text' is already created
words = combined_text.split()  # Split the text into individual words

# Create input-output pairs
sequence_length = 10
input_output_pairs = []

for i in range(len(words) - sequence_length):
    input_sequence = " ".join(words[i:i + sequence_length])  # 10 words as input
    output_word = words[i + sequence_length]  # 11th word as output
    input_output_pairs.append((input_sequence, output_word))

# Convert to DataFrame for better visualization and handling
input_output_df = pd.DataFrame(input_output_pairs, columns=["Input", "Output"])



In [11]:

input_output_df["Output"][0]


'become'

In [12]:
input_output_df["Input"][0]


"I've been feeling so sad and overwhelmed lately. Work has"

In [13]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_output_df["Input"]) 

# Filter out rows with empty output sequences
filtered_pairs = [
    (input_seq, output_word)
    for input_seq, output_word in zip(input_output_df["Input"], input_output_df["Output"])
    if tokenizer.texts_to_sequences([output_word])[0]  # Check if the output word is in the vocabulary
]

# Extract the filtered input and output
filtered_inputs, filtered_outputs = zip(*filtered_pairs)

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(filtered_inputs)
output_words = tokenizer.texts_to_sequences(filtered_outputs)

# Pad input sequences
max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding="post")

# Convert output to categorical
vocab_size = len(tokenizer.word_index) + 1
output_words = np.array([seq[0] for seq in output_words])  # Convert output to 1D array
output_words = to_categorical(output_words, num_classes=vocab_size)

print(f"Vocabulary size: {vocab_size}, Input shape: {input_sequences.shape}, Output shape: {output_words.shape}")


Vocabulary size: 2881, Input shape: (50232, 13), Output shape: (50232, 2881)


In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(input_sequences, output_words, test_size=0.2, random_state=42)


In [15]:
# Define the Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.att.key_dim,
            "num_heads": self.att.num_heads,
            "ff_dim": self.ffn.layers[0].units,
            "rate": self.dropout1.rate,
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

# Positional encoding
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_len, embed_dim, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.positional_encoding = self.get_positional_encoding(max_len, embed_dim)

    def get_positional_encoding(self, max_len, embed_dim):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(embed_dim)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(embed_dim))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # Apply sin to even indices
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # Apply cos to odd indices
        return tf.constant(angle_rads, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.positional_encoding[: tf.shape(inputs)[1], :]


In [16]:
from tensorflow.keras.layers import Input, Embedding, Dropout, Dense, GlobalAveragePooling1D, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


embed_dim = 64  # Embedding size
num_heads = 4   # Number of attention heads
ff_dim = 128    # Feed-forward network dimension
dropout_rate = 0.5  # Dropout rate

inputs = Input(shape=(max_length,))
x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
x = PositionalEncoding(max_len=max_length, embed_dim=embed_dim)(x)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, rate=dropout_rate)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation="relu")(x)
x = Dropout(dropout_rate)(x)

outputs = Dense(vocab_size, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
print(model.summary())


None


In [17]:
# Train the model
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_test, y_test)
)

Epoch 1/100


I0000 00:00:1733250925.504912    2671 service.cc:145] XLA service 0x7f94f0006e60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733250925.504965    2671 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1733250925.504971    2671 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5




[1m  55/1256[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 3ms/step - accuracy: 0.0283 - loss: 7.6625 

I0000 00:00:1733250937.509122    2671 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1238/1256[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.0408 - loss: 6.4862





[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0408 - loss: 6.4833




[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 18ms/step - accuracy: 0.0408 - loss: 6.4832 - val_accuracy: 0.0419 - val_loss: 6.0769
Epoch 2/100
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0443 - loss: 6.0180 - val_accuracy: 0.0449 - val_loss: 5.9773
Epoch 3/100
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0561 - loss: 5.8460 - val_accuracy: 0.0633 - val_loss: 5.9703
Epoch 4/100
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0656 - loss: 5.7351 - val_accuracy: 0.0678 - val_loss: 5.9245
Epoch 5/100
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0782 - loss: 5.5786 - val_accuracy: 0.0775 - val_loss: 5.9252
Epoch 6/100
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0908 - loss: 5.4470 - val_accuracy: 0.0902 - val_loss: 5.7978
Epoch 7/100
[1m1256

In [29]:
def predict_sequence(initial_text, model, tokenizer, max_length, num_predictions):

    current_text = initial_text
    
    for _ in range(num_predictions):
        # Preprocess the current text
        input_sequence = tokenizer.texts_to_sequences([current_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding="post")
        
        # Predict the next word
        predicted_probs = model.predict(input_sequence, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]
        predicted_word = tokenizer.index_word.get(predicted_index, "")
        
        if not predicted_word:  # Stop if no valid prediction
            break
        
        # Update the input text
        current_text += f" {predicted_word}"  # Append the predicted word
        current_text = " ".join(current_text.split()[-max_length:])  # Keep only the last max_length words

    return current_text

# Example usage
initial_text = "I've been feeling so sad and"
generated_sequence = predict_sequence(initial_text, model, tokenizer, max_length, num_predictions=20)
print(f"Generated sequence: {generated_sequence}")


Generated sequence: our with to i've how excited that remember i i well well a


In [19]:
model.save("transformer_model.h5")

In [27]:
from tensorflow.keras.models import load_model

# Load the model
loaded_model = load_model("transformer_model.h5", custom_objects={
    "TransformerBlock": TransformerBlock,
    "PositionalEncoding": PositionalEncoding
})

In [30]:
from tensorflow.keras.layers import Input, Embedding, Dropout, Dense, GlobalAveragePooling1D, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


# Build the model
embed_dim = 128  # Embedding size
num_heads = 4   # Number of attention heads
ff_dim = 252    # Feed-forward network dimension
dropout_rate = 0.5  # Dropout rate

inputs = Input(shape=(max_length,))
x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
x = PositionalEncoding(max_len=max_length, embed_dim=embed_dim)(x)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, rate=dropout_rate)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation="relu")(x)
x = Dropout(dropout_rate)(x)

outputs = Dense(vocab_size, activation="softmax")(x)

model2 = Model(inputs=inputs, outputs=outputs)
model2.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
print(model2.summary())


None


In [32]:
# Train the model
history = model2.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_test, y_test)
)

Epoch 1/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1189 - loss: 5.1228 - val_accuracy: 0.1190 - val_loss: 5.6952
Epoch 2/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1256 - loss: 5.0777 - val_accuracy: 0.1192 - val_loss: 5.8469
Epoch 3/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1315 - loss: 5.0384 - val_accuracy: 0.1287 - val_loss: 5.7776
Epoch 4/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1318 - loss: 4.9635 - val_accuracy: 0.1277 - val_loss: 5.7194
Epoch 5/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1362 - loss: 4.9229 - val_accuracy: 0.1303 - val_loss: 5.9414
Epoch 6/100
[1m1258/1258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1419 - loss: 4.8591 - val_accuracy: 0.1334 - val_loss: 5.7801
Epoch 7/10

In [34]:

initial_text = "I've been feeling so sad and"
generated_sequence = predict_sequence(initial_text, model, tokenizer, max_length, num_predictions=20)
print(f"Generated sequence: {generated_sequence}")


Generated sequence: our with to i've how excited that remember i i well well a


In [35]:

initial_text = "I've been feeling so sad and"
generated_sequence = predict_sequence(initial_text, model2, tokenizer, max_length, num_predictions=20)
print(f"Generated sequence: {generated_sequence}")


Generated sequence: have feel can been be feeling a sense sense of of sense failure
