In [7]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import os
import pickle

# --- NLTK resources -----------------------------------------------------------
# All NLTK data is kept under one directory. Register it on NLTK's search path
# (only once, so re-running this cell does not keep appending duplicates).
nltk_data_path = 'C:/nltk_data'
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download every required package into that same directory.
for nltk_package in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package, download_dir=nltk_data_path)

from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize.punkt import PunktTrainer

# `tokenizer` is a Punkt *sentence* tokenizer (it splits text into sentences,
# not words). Prefer the pretrained English model from the NLTK data directory.
try:
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
    # Best-effort fallback when the pretrained model cannot be located: train a
    # minimal Punkt model so that `tokenizer` is still defined.
    # The fallback is deliberately kept in memory only. Pickling it to
    # <nltk_data>/tokenizers/punkt/english.pickle would place a model trained on
    # a single sentence where the real pretrained model belongs, and every
    # later run would silently load it instead.
    trainer = PunktTrainer()
    trainer.train("This is a sample sentence. You can add more sentences to improve the tokenizer.")
    tokenizer = PunktSentenceTokenizer(trainer.get_params())

# Text cleaning function
def clean_text(text):
    """Normalize a question and split it into lowercase word tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, and the remainder is split on runs of whitespace.

    A sentence tokenizer must not be used here: once punctuation has been
    stripped there are no sentence boundaries left, so it would return the
    whole question as one "token" and Word2Vec would learn vectors for entire
    sentences instead of for words.

    Args:
        text (str): Raw question text.

    Returns:
        list[str]: Word tokens in order of appearance; an empty list when the
        text contains no letters.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # str.split() with no argument splits on any whitespace run and never
    # yields empty strings, so no extra strip/filter pass is needed.
    return text.split()

# --- Data loading and feature preparation ---------------------------------------
# Tunable settings for this stage, named once instead of scattered as literals.
DATA_PATH = "C:/Users/DELL/Downloads/train_data_chatbot.csv"  # update path as needed
EMBEDDING_DIM = 100   # width of the Word2Vec vectors
TEST_FRACTION = 0.2   # share of rows held out from training
SEED = 42             # random_state for the train/test split

df = pd.read_csv(DATA_PATH)

# Cast the question column to str once and reuse it everywhere below. Without
# the cast, a non-string cell (e.g. NaN from an empty CSV field) would be
# cleaned as text for Word2Vec but handed to the Keras tokenizer as a float.
questions = df['short_question'].astype(str)

# Word-level tokens used to train the embeddings.
df['tokens'] = questions.apply(clean_text)

# Train Word2Vec on the tokenized questions and persist it.
# min_count=1 keeps every token, including ones that occur only once.
word2vec_model = Word2Vec(
    sentences=df['tokens'].tolist(),
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    workers=4,
)
word2vec_model.save("word2vec_model.bin")

# Integer-encode the questions for the network. The Keras tokenizer is fitted
# on the (string-cast) original question text, so it applies its own
# lowercasing and punctuation filtering.
question_texts = questions.tolist()
tokenizer_keras = Tokenizer()
tokenizer_keras.fit_on_texts(question_texts)
X_sequences = tokenizer_keras.texts_to_sequences(question_texts)

# Right-pad every sequence with zeros up to the longest question.
max_seq_length = max(len(seq) for seq in X_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_seq_length, padding='post')

# One-hot encode the integer class labels.
df['label'] = df['label'].astype(int)
y = to_categorical(df['label'])

# Hold out part of the data; a fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=TEST_FRACTION, random_state=SEED
)



     

[nltk_data] Downloading package punkt to C:/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
# --- Embedding matrix -----------------------------------------------------------
# Row i holds the Word2Vec vector of the word whose Keras index is i. Row 0 is
# the padding index and stays all-zero, as do rows of words Word2Vec never saw.
word_index = tokenizer_keras.word_index
word_vectors = word2vec_model.wv
vocab_size = len(word_index) + 1             # +1 for the padding index 0
embedding_dim = word_vectors.vector_size     # read from the model, not re-typed

embedding_matrix = np.zeros((vocab_size, embedding_dim))
known_words = 0
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
        known_words += 1

# The embedding layer is frozen, so a word without a vector stays a zero vector
# forever; report the coverage so an (almost) empty matrix does not go unnoticed.
print(f"Word2Vec coverage: {known_words}/{len(word_index)} vocabulary words")

# --- Model ------------------------------------------------------------------------
# mask_zero=True: the inputs are right-padded with index 0; masking makes the
# LSTMs skip those steps instead of running over the padding before emitting
# their final state.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
    trainable=False,
)

model = Sequential([
    # Explicit input spec instead of the legacy `input_length=` argument; it
    # also builds the model immediately so the weights can be loaded below.
    tf.keras.Input(shape=(max_seq_length,)),
    embedding_layer,
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(y.shape[1], activation='softmax'),  # one output unit per class
])

# Load the pretrained vectors into the built layer (replaces the legacy
# `weights=[...]` constructor argument).
embedding_layer.set_weights([embedding_matrix])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the held-out split doubles as validation data here, so the
# reported validation accuracy is not an untouched test estimate.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Save the trained model
model.save("chatbot_lstm_model.h5")

# Save the fitted Keras tokenizer; inference must encode text with the same
# word index that was used for training.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer_keras, f)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
