In [63]:
# Import the necessary libaries
import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from datasets import load_dataset


In [64]:

ds = load_dataset("KisanVaani/agriculture-qa-english-only")
print(ds["train"][0])

{'question': 'why is crop rotation important in farming?', 'answers': 'This helps to prevent soil erosion and depletion, and can also help to control pests and diseases'}


In [66]:
# Extract questions and answers

questions = [item['question'] for item in ds['train']]
answers = [item['answers'] for item in ds['train']]


# Tokenize the data to convert a text in a model friendly format.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    inputs = tokenizer(examples['question'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['answers'], padding="max_length", truncation=True, max_length=128)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

ds = ds.map(preprocess_data, batched=True)

In [67]:
# Fine tune a transformer model

from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
train_dataset = ds["train"].shuffle().select(range(10000))  # Reduce size if needed
train_dataset = train_dataset.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator
)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

# Train the model
model.fit(train_dataset, epochs=3)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [62]:
from sklearn.preprocessing import LabelEncoder

answers = [item['answers'] for item in ds['train']]

# Convert answers to labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(answers)

# Print the mapping of labels
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))



In [59]:
# Predict categories / a label

def predict_category(question):
    inputs = tokenizer(question, return_tensors="tf")
    output = model.generate(**inputs)
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Convert chatbot's response into a category
    if response_text in label_encoder.classes_:
        predicted_label = label_encoder.transform([response_text])[0]
    else:
        predicted_label = -1  # Unknown response
    return predicted_label


In [61]:
from sklearn.metrics import f1_score

# Generate predictions
true_labels = labels[:100]  # Use a subset for evaluation
predicted_labels = [predict_category(q) for q in ds["train"][:100]["question"]]

# Remove unknown (-1) predictions before scoring
filtered_true_labels = []
filtered_predicted_labels = []
for t, p in zip(true_labels, predicted_labels):
    if p != -1:
        filtered_true_labels.append(t)
        filtered_predicted_labels.append(p)

# Calculate F1-score
f1 = f1_score(filtered_true_labels, filtered_predicted_labels, average='weighted')
print("F1 Score:", f1)


ValueError: The following `model_kwargs` are not used by the model: ['token_type_ids'] (note: typos in the generate arguments will also show up in this list)

In [None]:
# Evaluate the model with BLEW

from datasets import load_metric

metric = load_metric("bleu")

# Example evaluation function
def evaluate_model(question):
    inputs = tokenizer(question, return_tensors="tf")
    output = model.generate(**inputs)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

sample_question = "What are the best fertilizers for wheat?"
print(evaluate_model(sample_question))


ImportError: cannot import name 'load_metric' from 'datasets' (c:\Users\yiish\OneDrive\Desktop\domain-chatbot\new-venv\Lib\site-packages\datasets\__init__.py)

In [51]:
import gradio as gr

def chatbot_response(question):
    return evaluate_model(question)

iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text", title="Agriculture Chatbot")
iface.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\yiish\OneDrive\Desktop\domain-chatbot\new-venv\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yiish\OneDrive\Desktop\domain-chatbot\new-venv\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yiish\OneDrive\Desktop\domain-chatbot\new-venv\Lib\site-packages\gradio\blocks.py", line 2096, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yiish\OneDrive\Desktop\domain-chatbot\new-venv\Lib\site-packages\gradio\blocks.py", line 1643, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yiish\OneDrive\De