# Market Research Chatbot - Neural Network from Scratch

In [2]:
# Install required packages
!pip install nltk pandas numpy tensorflow scikit-learn

import pandas as pd
import numpy as np
import nltk
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Input, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pickle
import warnings
warnings.filterwarnings('ignore')

# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print("All dependencies installed successfully!")

All dependencies installed successfully!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Load your CSV data
df = pd.read_csv('/content/sample_data/market_research_chatbot_dataset.csv')

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nSample data:")
print(df.head())

Dataset loaded successfully!
Dataset shape: (750, 2)

Sample data:
                                            Question  \
0               Name some leading startups in AI/ML.   
1               List major players active in Edtech.   
2           Who are major AI/ML market participants?   
3                 What occupations use Fintech most?   
4  What growth is forecasted for the Fintech sector?   

                                              Answer  
0  Big players include OpenAI, Google DeepMind, a...  
1  Top edtech players include Coursera, Udemy, an...  
2  Big players include OpenAI, Google DeepMind, a...  
3  Fintech adoption is highest among millennials ...  
4      The fintech sector may grow to $460B by 2025.  


In [4]:
# ===== TEXT PREPROCESSING =====

class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Clean and preprocess text"""
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_text(self, text):
        """Tokenize text using NLTK"""
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                 if token not in self.stop_words and len(token) > 1]

        return tokens

    def preprocess(self, text):
        """Complete preprocessing pipeline"""
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize_text(cleaned_text)
        return tokens

# Preprocessor
preprocessor = TextPreprocessor()

# Preprocess questions and answers
print("Preprocessing text data...")
questions_processed = []
answers_processed = []

for idx, row in df.iterrows():
    if pd.notna(row['Question']) and pd.notna(row['Answer']):
        q_tokens = preprocessor.preprocess(row['Question'])
        a_tokens = preprocessor.preprocess(row['Answer'])

        questions_processed.append(q_tokens)
        answers_processed.append(a_tokens)

print(f"Text preprocessing completed! Processed {len(questions_processed)} pairs")

Preprocessing text data...
Text preprocessing completed! Processed 750 pairs


In [5]:
# ===== VOCABULARY BUILDING =====

def build_vocabulary(tokenized_texts, min_freq=2):
    """Build vocabulary from tokenized texts"""
    all_words = []
    for tokens in tokenized_texts:
        all_words.extend(tokens)

    vocab_counter = Counter(all_words)
    vocab = ['<PAD>', '<UNK>', '<START>', '<END>'] + [
        word for word, count in vocab_counter.most_common() if count >= min_freq
    ]

    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for idx, word in enumerate(vocab)}

    return vocab, word_to_idx, idx_to_word

# Build vocabulary from questions and answers
all_texts = questions_processed + answers_processed
vocab, word_to_idx, idx_to_word = build_vocabulary(all_texts, min_freq=2)

print(f"Vocabulary size: {len(vocab)}")
print(f"Sample vocabulary: {vocab[:20]}")

Vocabulary size: 258
Sample vocabulary: ['<PAD>', '<UNK>', '<START>', '<END>', 'edtech', 'aiml', 'fintech', 'include', 'model', 'ecommerce', 'healthtech', 'trend', '2025', 'user', 'firm', 'market', 'player', 'use', 'key', 'whats']


In [6]:
# ===== GLOVE EMBEDDINGS =====

def download_glove():
    """Download GloVe embeddings"""
    import urllib.request
    import zipfile
    import os

    if not os.path.exists('glove.6B.100d.txt'):
        print("Downloading GloVe embeddings...")
        try:
            url = 'http://nlp.stanford.edu/data/glove.6B.zip'
            urllib.request.urlretrieve(url, 'glove.6B.zip')

            with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
                zip_ref.extract('glove.6B.100d.txt')

            os.remove('glove.6B.zip')
            print("GloVe embeddings downloaded!")
        except Exception as e:
            print(f"Error downloading GloVe: {e}")
            print("Will use random embeddings instead.")
            return False
    return True

def load_glove_embeddings(vocab, embedding_dim=100):
    """Load GloVe embeddings"""
    glove_available = download_glove()

    embeddings_index = {}

    if glove_available:
        with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    values = line.split()
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
                except:
                    continue

        print(f"Found {len(embeddings_index)} word vectors in GloVe.")

    # Embedding matrix
    embedding_matrix = np.zeros((len(vocab), embedding_dim))

    for word, idx in word_to_idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            # Random initialization for unknown words
            embedding_matrix[idx] = np.random.normal(0, 0.1, embedding_dim)

    return embedding_matrix

# Load GloVe embeddings
EMBEDDING_DIM = 100
embedding_matrix = load_glove_embeddings(vocab, EMBEDDING_DIM)
print("Embeddings loaded successfully!")

Downloading GloVe embeddings...
GloVe embeddings downloaded!
Found 400000 word vectors in GloVe.
Embeddings loaded successfully!


In [7]:
# ===== DATA ENCODING =====

def encode_sequences(tokenized_texts, word_to_idx, max_length=None):
    """Convert tokenized texts to sequences of indices"""
    sequences = []
    for tokens in tokenized_texts:
        sequence = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in tokens]
        sequences.append(sequence)

    if max_length is None:
        max_length = max(len(seq) for seq in sequences) if sequences else 10

    # Pad sequences
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences, max_length

# Sequence lengths based on data distribution
question_lengths = [len(q) for q in questions_processed]
answer_lengths = [len(a) for a in answers_processed]

# 95th percentile to determine max lengths
MAX_QUESTION_LENGTH = min(30, int(np.percentile(question_lengths, 95))) if question_lengths else 20
MAX_ANSWER_LENGTH = min(60, int(np.percentile(answer_lengths, 95))) if answer_lengths else 40

print(f"Max question length: {MAX_QUESTION_LENGTH}")
print(f"Max answer length: {MAX_ANSWER_LENGTH}")

# Encode questions and answers
questions_encoded, _ = encode_sequences(questions_processed, word_to_idx, MAX_QUESTION_LENGTH)
answers_encoded, _ = encode_sequences(answers_processed, word_to_idx, MAX_ANSWER_LENGTH)

print(f"Questions encoded shape: {questions_encoded.shape}")
print(f"Answers encoded shape: {answers_encoded.shape}")

Max question length: 5
Max answer length: 7
Questions encoded shape: (750, 5)
Answers encoded shape: (750, 7)


In [8]:
# ===== NEURAL NETWORK ARCHITECTURE =====

class MarketResearchChatbot:
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, max_question_length, max_answer_length):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding_matrix = embedding_matrix
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length
        self.model = None
        self.encoder_model = None
        self.decoder_model = None

    def build_model(self):
        """
        Build LSTM-based seq2seq model

        NEURAL NETWORK LAYERS ARCHITECTURE:

        1. EMBEDDING LAYER:
           - Input: Integer sequences (word indices)
           - Output: Dense vectors (GloVe embeddings)
           - Parameters: vocab_size x embedding_dim

        2. ENCODER LAYERS:
           - Bidirectional LSTM (512 units total: 256 forward + 256 backward)
           - Dropout: 0.3 (prevents overfitting)
           - Recurrent Dropout: 0.3
           - Returns: sequences, forward states, backward states

        3. DECODER LAYERS:
           - LSTM (512 units)
           - Dropout: 0.3
           - Recurrent Dropout: 0.3
           - Returns: sequences and states

        4. ATTENTION MECHANISM (Simplified):
           - Dense layer for attention weights
           - Context vector computation

        5. OUTPUT LAYER:
           - Dense layer with softmax activation
           - Size: vocab_size (probability distribution over vocabulary)
        """

        # ENCODER ARCHITECTURE
        encoder_inputs = Input(shape=(self.max_question_length,), name='encoder_input')

        # Layer 1: Embedding Layer
        embedding_layer = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            weights=[self.embedding_matrix],
            trainable=False,
            mask_zero=True,
            name='embedding_layer'
        )

        encoder_embedded = embedding_layer(encoder_inputs)

        # Layer 2: Bidirectional LSTM Encoder
        encoder_lstm = Bidirectional(
            LSTM(256, return_sequences=True, return_state=True,
                 dropout=0.3, recurrent_dropout=0.3),
            name='encoder_bilstm'
        )

        encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedded)

        # Combine forward and backward states
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        encoder_states = [state_h, state_c]

        # DECODER ARCHITECTURE
        decoder_inputs = Input(shape=(None,), name='decoder_input')
        decoder_embedded = embedding_layer(decoder_inputs)

        # Layer 3: LSTM Decoder
        decoder_lstm = LSTM(
            512,
            return_sequences=True,
            return_state=True,
            dropout=0.3,
            recurrent_dropout=0.3,
            name='decoder_lstm'
        )

        decoder_outputs, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)

        # Layer 4: Attention Mechanism
        attention = Dense(512, activation='tanh', name='attention_layer')(decoder_outputs)

        # Layer 5: Output Dense Layer
        decoder_dense = Dense(self.vocab_size, activation='softmax', name='output_layer')
        decoder_outputs = decoder_dense(attention)

        # Define the training model
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='seq2seq_chatbot')

        # Compile with improved optimizer settings
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        self.model = model

        # Inference models
        # Encoder model for inference
        self.encoder_model = Model(encoder_inputs, encoder_states, name='encoder_inference')

        # Decoder model for inference
        decoder_state_input_h = Input(shape=(512,), name='decoder_state_h')
        decoder_state_input_c = Input(shape=(512,), name='decoder_state_c')
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

        decoder_outputs, state_h, state_c = decoder_lstm(
            decoder_embedded, initial_state=decoder_states_inputs)

        # Apply attention and output layers
        attention_inf = Dense(512, activation='tanh', name='attention_inference')(decoder_outputs)
        decoder_outputs = decoder_dense(attention_inf)

        decoder_states = [state_h, state_c]

        self.decoder_model = Model(
            [decoder_inputs] + decoder_states_inputs,
            [decoder_outputs] + decoder_states,
            name='decoder_inference'
        )

        return model

# Initialize chatbot
chatbot = MarketResearchChatbot(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=embedding_matrix,
    max_question_length=MAX_QUESTION_LENGTH,
    max_answer_length=MAX_ANSWER_LENGTH
)

model = chatbot.build_model()
print("Model architecture created!")
print("\n" + "="*60)
print("NEURAL NETWORK LAYERS SUMMARY:")
print("="*60)
print("1. Embedding Layer: Converts word indices to dense vectors")
print("2. Bidirectional LSTM Encoder: Processes input questions")
print("3. LSTM Decoder: Generates output answers")
print("4. Attention Layer: Focuses on relevant parts of input")
print("5. Dense Output Layer: Produces probability distribution over vocabulary")
print("="*60)
print(f"Total parameters: {model.count_params():,}")
print("="*60)

model.summary()

Model architecture created!

NEURAL NETWORK LAYERS SUMMARY:
1. Embedding Layer: Converts word indices to dense vectors
2. Bidirectional LSTM Encoder: Processes input questions
3. LSTM Decoder: Generates output answers
4. Attention Layer: Focuses on relevant parts of input
5. Dense Output Layer: Produces probability distribution over vocabulary
Total parameters: 2,407,370


In [9]:
# ===== TRAINING DATA PREPARATION =====

def prepare_training_data(questions_encoded, answers_encoded, word_to_idx):
    """Prepare training data for seq2seq model"""
    encoder_input_data = questions_encoded

    # Decoder input and target data preparation
    decoder_input_data = np.zeros((len(answers_encoded), MAX_ANSWER_LENGTH), dtype='int32')
    decoder_target_data = np.zeros((len(answers_encoded), MAX_ANSWER_LENGTH), dtype='int32')

    for i, answer_seq in enumerate(answers_encoded):
        # Find actual length
        actual_length = np.count_nonzero(answer_seq)

        if actual_length > 0:
            # Decoder input: START token + answer
            decoder_input_data[i, 0] = word_to_idx['<START>']
            if actual_length > 1:
                decoder_input_data[i, 1:min(actual_length, MAX_ANSWER_LENGTH-1)] = answer_seq[:min(actual_length-1, MAX_ANSWER_LENGTH-2)]

            # Decoder target: answer + END token
            decoder_target_data[i, :min(actual_length, MAX_ANSWER_LENGTH-1)] = answer_seq[:min(actual_length, MAX_ANSWER_LENGTH-1)]
            if actual_length < MAX_ANSWER_LENGTH:
                decoder_target_data[i, min(actual_length, MAX_ANSWER_LENGTH-1)] = word_to_idx['<END>']

    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_data, decoder_input_data, decoder_target_data = prepare_training_data(
    questions_encoded, answers_encoded, word_to_idx)

print("Training data prepared!")
print(f"Training samples: {len(encoder_input_data)}")
print(f"Encoder input shape: {encoder_input_data.shape}")
print(f"Decoder input shape: {decoder_input_data.shape}")
print(f"Decoder target shape: {decoder_target_data.shape}")

Training data prepared!
Training samples: 750
Encoder input shape: (750, 5)
Decoder input shape: (750, 7)
Decoder target shape: (750, 7)


In [10]:
# ===== MODEL TRAINING =====

print("Starting model training...")

# Callbacks for better training
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=0.0001,
        verbose=1
    )
]

# Train the model
batch_size = min(32, len(encoder_input_data) // 10)
epochs = 150 if len(encoder_input_data) > 100 else 50

print(f"Training with batch size: {batch_size}, epochs: {epochs}")

history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

print("Model training completed!")

Starting model training...
Training with batch size: 32, epochs: 150
Epoch 1/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 198ms/step - accuracy: 0.0830 - loss: 4.8611 - val_accuracy: 0.2771 - val_loss: 2.6821 - learning_rate: 0.0010
Epoch 2/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 81ms/step - accuracy: 0.3843 - loss: 2.3305 - val_accuracy: 0.6676 - val_loss: 1.1037 - learning_rate: 0.0010
Epoch 3/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - accuracy: 0.6610 - loss: 0.9883 - val_accuracy: 0.6886 - val_loss: 0.5612 - learning_rate: 0.0010
Epoch 4/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 74ms/step - accuracy: 0.7076 - loss: 0.5444 - val_accuracy: 0.7543 - val_loss: 0.3061 - learning_rate: 0.0010
Epoch 5/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 103ms/step - accuracy: 0.7445 - loss: 0.3437 - val_accuracy: 0.7676 - val_loss: 0.2286 - learning_rate: 0.001

In [11]:
# ===== INFERENCE FUNCTIONS =====

def similarity_search(query_tokens, questions_processed, threshold=0.2):
    """Find most similar question using token overlap"""
    best_match_idx = -1
    best_score = 0

    query_set = set(query_tokens)

    for i, q_tokens in enumerate(questions_processed):
        q_set = set(q_tokens)

        intersection = len(query_set.intersection(q_set))
        union = len(query_set.union(q_set))

        if union > 0:
            score = intersection / union
            if score > best_score and score >= threshold:
                best_score = score
                best_match_idx = i

    return best_match_idx, best_score

def decode_sequence(input_seq, encoder_model, decoder_model, word_to_idx, idx_to_word, max_decoder_seq_length):
    """Decode a sequence using the trained model"""
    try:
        states_value = encoder_model.predict(input_seq, verbose=0)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_to_idx['<START>']

        stop_condition = False
        decoded_sentence = []
        max_iterations = max_decoder_seq_length

        for _ in range(max_iterations):
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = idx_to_word.get(sampled_token_index, '<UNK>')

            if sampled_word in ['<END>', '<PAD>'] or len(decoded_sentence) >= max_decoder_seq_length:
                break

            if sampled_word != '<UNK>':
                decoded_sentence.append(sampled_word)

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            states_value = [h, c]

        return ' '.join(decoded_sentence)

    except Exception as e:
        print(f"Error in decode_sequence: {e}")
        return ""

def get_chatbot_response(user_input):
    """Get response from the chatbot"""
    try:
        # Preprocess user input
        user_tokens = preprocessor.preprocess(user_input)

        if not user_tokens:
            return "I'm sorry, I couldn't understand your question. Could you please rephrase it?"

        # First, Similarity search
        match_idx, similarity_score = similarity_search(user_tokens, questions_processed, threshold=0.2)

        if match_idx != -1 and similarity_score > 0.4:
            return df.iloc[match_idx]['Answer']

        # No good match found then neural network prediction
        user_sequence = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in user_tokens]
        user_padded = pad_sequences([user_sequence], maxlen=MAX_QUESTION_LENGTH, padding='post')

        response = decode_sequence(
            user_padded,
            chatbot.encoder_model,
            chatbot.decoder_model,
            word_to_idx,
            idx_to_word,
            MAX_ANSWER_LENGTH
        )

        # If response is too short or empty, fallback
        if len(response.strip()) < 5:
            if match_idx != -1:
                return df.iloc[match_idx]['Answer']
            else:
                return "I'm sorry, I couldn't find relevant information for your query. Could you try rephrasing your question?"

        return response

    except Exception as e:
        print(f"Error in get_chatbot_response: {e}")
        return "I'm sorry, I encountered an error processing your request. Please try again."

In [12]:
# ===== TESTING THE CHATBOT =====

print("CHATBOT TESTING WITH Q&A PAIRS")

# Test
test_queries = [
    "What is the market size of edtech?",
    "How big is the fintech market?",
    "Tell me about healthtech trends",
    "What are startup funding trends?",
    "How to validate a business idea?",
    "What are the key metrics for SaaS startups?"
]

for query in test_queries:
    print(f"\nUser: {query}")
    response = get_chatbot_response(query)
    print(f"Bot: {response}")


CHATBOT TESTING WITH Q&A PAIRS

User: What is the market size of edtech?
Bot: The edtech market is expected to reach $404B by 2025.

User: How big is the fintech market?
Bot: Notable players are PayPal, Stripe, and Robinhood.

User: Tell me about healthtech trends
Bot: Trends include telemedicine and wearable health devices.

User: What are startup funding trends?
Bot: directtoconsumer top major 460b expected fast competitor

User: How to validate a business idea?
Bot: health major open aidriven aidriven student gamification

User: What are the key metrics for SaaS startups?
Bot: alibaba alibaba follow clinic clinic healthcare gamification


In [13]:
# ===== INTERACTIVE CHATBOT =====

def interactive_chatbot():
    """Interactive chatbot session"""
    print("\n" + "="*60)
    print("MARKET RESEARCH CHATBOT - 751 Q&A PAIRS")
    print("="*60)
    print("Ask me anything about market research, startups, and business!")
    print("Type 'quit' to exit.")
    print("-"*60)

    while True:
        user_input = input("\nYou: ").strip()

        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("Bot: Thank you for using the Market Research Chatbot. Goodbye!")
            break

        if user_input:
            response = get_chatbot_response(user_input)
            print(f"Bot: {response}")
        else:
            print("Bot: Please enter a valid question.")

print("\nChatbot is ready! Starting interactive session...")
interactive_chatbot()


Chatbot is ready! Starting interactive session...

MARKET RESEARCH CHATBOT - 751 Q&A PAIRS
Ask me anything about market research, startups, and business!
Type 'quit' to exit.
------------------------------------------------------------

You: What is the market size of the AI/ML
Bot: AI/ML is estimated to hit $500B by 2025.

You: Tell me about AI/ML
Bot: AI/ML is estimated to hit $500B by 2025.

You: which match is today in IPL
Bot: top top goto aidriven millennials estimated estimated

You: who is virat kohli
Bot: projected model blockchain expected future forecasted forecasted

You: What's the go-to business model in AI/ML
Bot: AI/ML firms use licensing, APIs, and consulting models.

You: How much is the E-commerce sector worth
Bot: E-commerce could reach $6.3T globally by 2025.

You: Which sectors consume Fintech solutions the most?
Bot: Fintech adoption is highest among millennials and Gen Z.

You: tell me something about music
Bot: popular top 460b 460b aidriven market player

You

In [15]:
# ===== SAVE MODEL AND COMPONENTS =====

print("\nSaving model and components...")

# Save the complete chatbot models using modern Keras format
print("Saving training model...")
model.save('market_research_chatbot_751_model.keras')

print("Saving inference models...")
# Save encoder and decoder models for inference
chatbot.encoder_model.save('market_research_encoder_model.keras')
chatbot.decoder_model.save('market_research_decoder_model.keras')

# Save preprocessor as a separate component
preprocessor_data = {
    'stop_words': list(preprocessor.stop_words),
    'lemmatizer_class': type(preprocessor.lemmatizer).__name__
}

# Save all components including preprocessor
components_to_save = {
    'vocab': vocab,
    'word_to_idx': word_to_idx,
    'idx_to_word': idx_to_word,
    'questions_processed': questions_processed,
    'answers_processed': answers_processed,
    'df': df,
    'max_question_length': MAX_QUESTION_LENGTH,
    'max_answer_length': MAX_ANSWER_LENGTH,
    'embedding_dim': EMBEDDING_DIM,
    'vocab_size': len(vocab),
    'embedding_matrix': embedding_matrix,
    'preprocessor_data': preprocessor_data
}

# Save components with protocol 4 for better compatibility
with open('chatbot_751_components.pkl', 'wb') as f:
    pickle.dump(components_to_save, f, protocol=4)

# Save model architecture as JSON for reconstruction
model_config = model.to_json()
with open('model_architecture.json', 'w') as f:
    f.write(model_config)

print("Model and components saved successfully!")
print("\nFiles saved:")
print("- market_research_chatbot_751_model.keras (Complete training model)")
print("- market_research_encoder_model.keras (Encoder for inference)")
print("- market_research_decoder_model.keras (Decoder for inference)")
print("- chatbot_751_components.pkl (All components and data)")
print("- model_architecture.json (Model architecture)")

print(f"\nModel Statistics:")
print(f"- Total training samples: {len(encoder_input_data)}")
print(f"- Vocabulary size: {len(vocab)}")
print(f"- Total model parameters: {model.count_params():,}")

print("\n" + "="*60)
print("SETUP COMPLETED SUCCESSFULLY!")
print("Your custom neural network chatbot is fully saved!")
print("All models and components are ready for future use.")
print("="*60)


Saving model and components...
Saving training model...
Saving inference models...
Model and components saved successfully!

Files saved:
- market_research_chatbot_751_model.keras (Complete training model)
- market_research_encoder_model.keras (Encoder for inference)
- market_research_decoder_model.keras (Decoder for inference)
- chatbot_751_components.pkl (All components and data)
- model_architecture.json (Model architecture)

Model Statistics:
- Total training samples: 750
- Vocabulary size: 258
- Total model parameters: 2,407,370

SETUP COMPLETED SUCCESSFULLY!
Your custom neural network chatbot is fully saved!
All models and components are ready for future use.


In [None]:
# ===== LOADING FUNCTION FOR FUTURE USE =====

def load_saved_chatbot():
    """
    Complete function to load the saved chatbot for future use.
    Use this code when you want to load your saved model in a new session.
    """
    loading_code = '''
# Complete code to load your saved custom neural network chatbot

import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download required NLTK data (run once)
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
except:
    pass

print("Loading chatbot components...")

# Load all saved components
with open('chatbot_751_components.pkl', 'rb') as f:
    components = pickle.load(f)

# Extract all components
vocab = components['vocab']
word_to_idx = components['word_to_idx']
idx_to_word = components['idx_to_word']
questions_processed = components['questions_processed']
answers_processed = components['answers_processed']
df = components['df']
MAX_QUESTION_LENGTH = components['max_question_length']
MAX_ANSWER_LENGTH = components['max_answer_length']
EMBEDDING_DIM = components['embedding_dim']
embedding_matrix = components['embedding_matrix']

print("Loading trained models...")

# Load all three models (using .keras format)
main_model = load_model('market_research_chatbot_751_model.keras')
encoder_model = load_model('market_research_encoder_model.keras')
decoder_model = load_model('market_research_decoder_model.keras')

print("Recreating preprocessor...")

# Recreate the TextPreprocessor class
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Clean and preprocess text"""
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)
        text = re.sub(r'\\s+', ' ', text).strip()
        return text

    def tokenize_text(self, text):
        """Tokenize text using NLTK"""
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                 if token not in self.stop_words and len(token) > 1]
        return tokens

    def preprocess(self, text):
        """Complete preprocessing pipeline"""
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize_text(cleaned_text)
        return tokens

# Initialize preprocessor
preprocessor = TextPreprocessor()

print("Setting up inference functions...")

# Similarity search function
def similarity_search(query_tokens, questions_processed, threshold=0.2):
    """Find most similar question using token overlap"""
    best_match_idx = -1
    best_score = 0

    query_set = set(query_tokens)

    for i, q_tokens in enumerate(questions_processed):
        q_set = set(q_tokens)

        intersection = len(query_set.intersection(q_set))
        union = len(query_set.union(q_set))

        if union > 0:
            score = intersection / union
            if score > best_score and score >= threshold:
                best_score = score
                best_match_idx = i

    return best_match_idx, best_score

# Sequence decoding function
def decode_sequence(input_seq, encoder_model, decoder_model, word_to_idx, idx_to_word, max_decoder_seq_length):
    """Decode a sequence using the trained model"""
    try:
        states_value = encoder_model.predict(input_seq, verbose=0)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_to_idx['<START>']

        decoded_sentence = []
        max_iterations = max_decoder_seq_length

        for _ in range(max_iterations):
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = idx_to_word.get(sampled_token_index, '<UNK>')

            if sampled_word in ['<END>', '<PAD>'] or len(decoded_sentence) >= max_decoder_seq_length:
                break

            if sampled_word != '<UNK>':
                decoded_sentence.append(sampled_word)

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

        return ' '.join(decoded_sentence)

    except Exception as e:
        print(f"Error in decode_sequence: {e}")
        return ""

# Main chatbot response function
def get_chatbot_response(user_input):
    """Get response from the chatbot"""
    try:
        user_tokens = preprocessor.preprocess(user_input)

        if not user_tokens:
            return "I'm sorry, I couldn't understand your question. Could you please rephrase it?"

        # Try similarity search first
        match_idx, similarity_score = similarity_search(user_tokens, questions_processed, threshold=0.2)

        if match_idx != -1 and similarity_score > 0.4:
            return df.iloc[match_idx]['Answer']

        # Use neural network prediction
        user_sequence = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in user_tokens]
        user_padded = pad_sequences([user_sequence], maxlen=MAX_QUESTION_LENGTH, padding='post')

        response = decode_sequence(
            user_padded,
            encoder_model,
            decoder_model,
            word_to_idx,
            idx_to_word,
            MAX_ANSWER_LENGTH
        )

        if len(response.strip()) < 5:
            if match_idx != -1:
                return df.iloc[match_idx]['Answer']
            else:
                return "I'm sorry, I couldn't find relevant information for your query. Could you try rephrasing your question?"

        return response

    except Exception as e:
        print(f"Error in get_chatbot_response: {e}")
        return "I'm sorry, I encountered an error processing your request. Please try again."

# Interactive chatbot function
def start_chatbot():
    """Start interactive chatbot session"""
    print("\\n" + "="*60)
    print("MARKET RESEARCH CHATBOT - LOADED FROM SAVED MODEL")
    print("="*60)
    print("Ask me anything about market research, startups, and business!")
    print("Type 'quit' to exit.")
    print("-"*60)

    while True:
        user_input = input("\\nYou: ").strip()

        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("Bot: Thank you for using the Market Research Chatbot. Goodbye!")
            break

        if user_input:
            response = get_chatbot_response(user_input)
            print(f"Bot: {response}")
        else:
            print("Bot: Please enter a valid question.")

print("\\n" + "="*60)
print("CHATBOT LOADED SUCCESSFULLY!")
print("="*60)
print(f"Vocabulary size: {len(vocab)}")
print(f"Training samples: {len(questions_processed)}")
print(f"Model parameters: {main_model.count_params():,}")
print("="*60)

# Test the loaded chatbot
print("\\nTesting loaded chatbot...")
test_query = "What is market research?"
print(f"Test Query: {test_query}")
print(f"Response: {get_chatbot_response(test_query)}")

print("\\nYour chatbot is ready! Use start_chatbot() to begin interactive session.")
print("Or use get_chatbot_response('your question') for single queries.")
    '''
    return loading_code

# Save the complete loading code to a file
loading_instructions = load_saved_chatbot()
with open('load_chatbot_keras.py', 'wb') as f:
    f.write(loading_instructions.encode('utf-8'))

print("Complete loading function saved to: load_chatbot_keras.py")
print("\\nTo use your saved chatbot in future:")
print("1. Run: exec(open('load_chatbot_keras.py').read())")
print("2. Or copy-paste the code from load_chatbot_keras.py")
print("3. Then use: get_chatbot_response('your question')")
print("4. Or use: start_chatbot() for interactive mode")