<a href="https://colab.research.google.com/github/vassa33/alu-machine_learning/blob/main/chatbot/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [2]:
import json
import re
import tensorflow as tf
from transformers import BertTokenizer

# **Pre-Process Data**

In [3]:
# Load the question/answer pairs from disk.
# JSON text is UTF-8, so the encoding is pinned explicitly rather than left to
# the platform's locale default, which is not UTF-8 on every system.
with open('dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Clean the text data
def clean_text(text):
    """Normalise a string before it is tokenized.

    The text is lower-cased, every non-word character (anything that is not a
    letter, digit or underscore) is replaced by a space, runs of whitespace
    are collapsed to a single space, and surrounding whitespace is stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Lower-case first, then blank out every non-word character.
    spaced = re.sub(r'\W', ' ', text.lower())
    # Squeeze the resulting runs of spaces and trim both ends.
    return re.sub(r'\s+', ' ', spaced).strip()

# Build two parallel lists: entry i of each list comes from the same pair in
# `data`, and both sides go through the same clean_text normalisation.
questions = list(map(clean_text, (pair['question'] for pair in data)))
answers = list(map(clean_text, (pair['answer'] for pair in data)))

# **Tokenize Text**

In [None]:
# Load the tokenizer from the same checkpoint that is fine-tuned below
# ('bert-large-uncased') so the tokenizer and the model are guaranteed to
# agree on vocabulary and casing instead of coming from two different checkpoints.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def tokenize(texts, is_answer=False):
    """Encode a batch of strings with the module-level ``tokenizer``.

    Every sequence gets the tokenizer's special tokens, is truncated or padded
    to exactly 128 tokens, and is returned as TensorFlow tensors.

    Args:
        texts: The strings to encode.
        is_answer: When true, the token-type IDs and the attention mask are
            left out of the result. Special tokens are still added.

    Returns:
        The encoding produced by ``tokenizer.batch_encode_plus`` for the
        options above.
    """
    # Answers only need their input IDs; questions keep the extra encoder inputs.
    include_encoder_inputs = not is_answer
    encode_options = {
        'add_special_tokens': True,
        'max_length': 128,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
        'return_token_type_ids': include_encoder_inputs,
        'return_attention_mask': include_encoder_inputs,
    }
    return tokenizer.batch_encode_plus(texts, **encode_options)

# Questions keep every encoder input; answers are encoded with is_answer=True,
# which omits the token-type IDs and the attention mask.
tokenized_questions = tokenize(questions, is_answer=False)
tokenized_answers = tokenize(answers, True)

# **Prepare Input Tensors for BERT**

In [None]:
# Encode every answer as an integer class ID in [0, number of unique answers).
# The model fine-tuned on this dataset is a sequence classifier configured with
# num_labels=len(set(answers)) and trained with SparseCategoricalCrossentropy,
# which expects exactly one class index per example -- a matrix of answer
# token IDs is not a valid target for it.
# Sorting the unique answers keeps the answer -> ID mapping reproducible.
unique_answers = sorted(set(answers))
answer_to_label = {answer: label for label, answer in enumerate(unique_answers)}
answer_labels = tf.constant(
    [answer_to_label[answer] for answer in answers],
    dtype=tf.int32,
)

# Pair each encoded question with its class ID, shuffle across the whole
# dataset, and serve it in batches of 8.
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': tokenized_questions['input_ids'],
        'attention_mask': tokenized_questions['attention_mask']
    },
    answer_labels
)).shuffle(len(questions)).batch(8)

# **Fine-tune the Model**

In [None]:
from transformers import TFBertForSequenceClassification, BertConfig

# One output class per distinct cleaned answer.
config = BertConfig.from_pretrained(
    'bert-large-uncased',
    num_labels=len(set(answers)),
)
model = TFBertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    config=config,
)

# The "sparse" loss and accuracy take integer class IDs as targets, and the
# loss is told it receives raw logits rather than probabilities.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Three passes over the shuffled, batched training set.
model.fit(train_dataset, epochs=3)

# **Build an Interface for the Model**

In [None]:
def get_answer(question):
    """Return the answer the fine-tuned classifier predicts for a question.

    ``model`` is a sequence classifier, so its output carries one logit per
    answer class in ``outputs.logits``; it has no ``start_logits`` or
    ``end_logits`` span heads. The highest-scoring class index is mapped back
    to an answer string.

    Args:
        question: Raw user question; it is normalised with ``clean_text``
            before being encoded.

    Returns:
        The cleaned answer string belonging to the predicted class.
    """
    cleaned_question = clean_text(question)
    # Truncate to the same 128-token limit used when the training questions
    # were encoded, so over-long input cannot exceed what the model was fed.
    inputs = tokenizer.encode_plus(
        cleaned_question,
        truncation=True,
        max_length=128,
        return_tensors='tf',
    )

    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    predicted_label = int(tf.argmax(outputs.logits, axis=1).numpy()[0])

    # NOTE(review): assumes class i was trained as the i-th entry of
    # sorted(set(answers)) -- confirm against the label encoding used to
    # build the training dataset.
    answer_choices = sorted(set(answers))
    return answer_choices[predicted_label]

def chatbot_interface():
    """Run an interactive question/answer loop on standard input and output.

    Prints a welcome banner, then repeatedly prompts with ``Q: `` and prints
    the result of ``get_answer`` prefixed with ``A: ``. The loop ends when the
    user types "exit", "quit" or "bye" (case-insensitive, surrounding
    whitespace ignored), or when the input stream ends or is interrupted.
    Blank input is skipped without querying the model.
    """
    print("Welcome to the Agriculture Chatbot. Ask a question related to farming and technology!")
    while True:
        try:
            # Strip so that " exit " is still recognised as an exit command.
            question = input("Q: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the user interrupted: leave the loop
            # cleanly instead of surfacing a traceback.
            print("\nA: Goodbye!")
            break
        if not question:
            # Nothing to answer; prompt again.
            continue
        if question.lower() in {"exit", "quit", "bye"}:
            print("A: Goodbye!")
            break
        print(f"A: {get_answer(question)}")

# Start the interactive chat session. This call blocks waiting for user input
# and returns once the user types "exit", "quit" or "bye".
chatbot_interface()
