In [63]:
# Import the necessary libaries
import pandas as pd
import numpy as np 
import tensorflow as tf
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict


In [57]:
# Load and check the structure of a dataset
ds = load_dataset("KisanVaani/agriculture-qa-english-only")

print(ds["train"][0])

{'question': 'why is crop rotation important in farming?', 'answers': 'This helps to prevent soil erosion and depletion, and can also help to control pests and diseases'}


In [58]:
# Convert the dataset into panda dataframe for easy preprocessing
df = pd.DataFrame(ds["train"])

In [59]:
# Check for the missing values
print(df.isnull().sum())

question    0
answers     0
dtype: int64


In [None]:
# Double check to drop the missing values even though there are not present
df.dropna(inplace=True)



In [None]:
# Remove special characters, extraspaces and lowercase all letters
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9?.!,']", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answers'].apply(clean_text)


In [67]:
# Tokenize the model using wordPiece tokenization for BERT

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["question"], examples["answers"], padding="max_length", truncation=True)

tokenized_ds = ds.map(tokenize_function, batched=True)

Map: 100%|██████████| 22615/22615 [00:06<00:00, 3326.48 examples/s]


In [70]:
# Get the tokenized samples

print(tokenized_ds)
print(tokenized_ds['train'][0])

DatasetDict({
    train: Dataset({
        features: ['question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22615
    })
})
{'question': 'why is crop rotation important in farming?', 'answers': 'This helps to prevent soil erosion and depletion, and can also help to control pests and diseases', 'input_ids': [101, 2339, 2003, 10416, 9963, 2590, 1999, 7876, 1029, 102, 2023, 7126, 2000, 4652, 5800, 14173, 1998, 2139, 10814, 3508, 1010, 1998, 2064, 2036, 2393, 2000, 2491, 20739, 2015, 1998, 7870, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [71]:
# Prepare data for training by importing TFAutoModelForSequenceClassification for question answering

from transformers import TFAutoModelForSequenceClassification

# Load the model (BERT-base)
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# Convert dataset to TensorFlow format

train_features = {key: tf.constant(tokenized_ds["train"][key]) for key in tokenizer.model_input_names}
train_labels = tf.constant([0] * len(tokenized_ds["train"]["question"]))  # Dummy labels for now

# Create TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
train_dataset = train_dataset.batch(8).shuffle(1000)


In [74]:
from transformers import AdamWeightDecay

# Set optimizer and loss
optimizer = AdamWeightDecay(learning_rate=5e-5, weight_decay_rate=0.01)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])


# Training the model with few epochs
model.fit(train_dataset, epochs=3)



Epoch 1/3



In [55]:
# Extract questions and answers

questions = [item['question'] for item in ds['train']]
answers = [item['answers'] for item in ds['train']]

In [56]:
print(questions)

['why is crop rotation important in farming?', 'What farming practice helps prevent soil erosion?', 'what is crop rotation', 'what are the different methods of irrigation?', 'why is soil health vital?', 'what are the causes of soil degradation?', 'what is organic farming?', 'what is the importane of organic farming?', 'Which crop is the major source of food in africa?', 'what is the major cassava disease?', 'what are different cassava diseases?', 'what are the signs and symptoms of CBB disease?', 'what are the sustainable approaches to cassava disease management?', 'which diseases can be spread by whiteflies?', 'what is irrigation?', 'what are organic fertilizers?', 'what are fertilizers?', 'why are fertilizers used in farming?', 'what type of fertilizers are made from chemical?', 'what is pest management?', 'what are the different methods of pest management?', 'in which way is climate change affecting agriculture?', 'what are the effects of climate change', 'what is crop protection?',

In [50]:
# Fine tune a transformer model



In [51]:
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from tqdm import tqdm

class ChatbotEvaluator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        
    def calculate_f1_metrics(self, reference, candidate):
        """
        Calculate F1, precision, and recall scores for a single response
        """
        # Tokenize both reference and candidate
        reference_tokens = set(word_tokenize(reference.lower()))
        candidate_tokens = set(word_tokenize(candidate.lower()))
        
        # Calculate true positives, false positives, and false negatives
        true_positives = len(reference_tokens.intersection(candidate_tokens))
        false_positives = len(candidate_tokens - reference_tokens)
        false_negatives = len(reference_tokens - candidate_tokens)
        
        # Calculate precision
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        
        # Calculate recall
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        
        # Calculate F1 score
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        return {
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
    
    def calculate_perplexity(self, text):
        """
        Calculate perplexity using GPT-2 as a language model
        """
        encodings = self.gpt2_tokenizer(text, return_tensors='pt')
        max_length = 1024
        
        with torch.no_grad():
            outputs = self.gpt2_model(**encodings)
            loss = outputs.loss
            
        return torch.exp(loss).item()
    
    def semantic_similarity(self, text1, text2):
        """
        Calculate semantic similarity between two texts using cosine similarity
        """
        encoding1 = self.tokenizer(text1, return_tensors='tf', padding=True, truncation=True)
        encoding2 = self.tokenizer(text2, return_tensors='tf', padding=True, truncation=True)
        
        embeddings1 = self.model.encode(encoding1)
        embeddings2 = self.model.encode(encoding2)
        
        similarity = np.dot(embeddings1, embeddings2) / (np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2))
        return similarity
    
    def evaluate_responses(self, true_responses, predicted_responses):
        """
        Comprehensive evaluation of model responses
        """
        results = {
            'f1_scores': [],
            'precision_scores': [],
            'recall_scores': [],
            'perplexities': [],
            'semantic_similarities': [],
            'response_lengths': []
        }
        
        for true_resp, pred_resp in tqdm(zip(true_responses, predicted_responses)):
            # Calculate F1, precision, and recall
            metrics = self.calculate_f1_metrics(true_resp, pred_resp)
            results['f1_scores'].append(metrics['f1'])
            results['precision_scores'].append(metrics['precision'])
            results['recall_scores'].append(metrics['recall'])
            
            # Calculate perplexity
            perplexity = self.calculate_perplexity(pred_resp)
            results['perplexities'].append(perplexity)
            
            # Calculate semantic similarity
            similarity = self.semantic_similarity(true_resp, pred_resp)
            results['semantic_similarities'].append(similarity)
            
            # Response length analysis
            results['response_lengths'].append(len(word_tokenize(pred_resp)))
        
        return results
    
    def generate_evaluation_report(self, results):
        """
        Generate comprehensive evaluation report with visualizations
        """
        report = {
            'metrics': {
                'avg_f1': np.mean(results['f1_scores']),
                'avg_precision': np.mean(results['precision_scores']),
                'avg_recall': np.mean(results['recall_scores']),
                'avg_perplexity': np.mean(results['perplexities']),
                'avg_similarity': np.mean(results['semantic_similarities']),
                'avg_length': np.mean(results['response_lengths'])
            },
            'distributions': results
        }
        
        # Create visualizations
        plt.figure(figsize=(15, 10))
        
        # F1 score distribution
        plt.subplot(2, 3, 1)
        sns.histplot(results['f1_scores'])
        plt.title('F1 Score Distribution')
        
        # Precision distribution
        plt.subplot(2, 3, 2)
        sns.histplot(results['precision_scores'])
        plt.title('Precision Distribution')
        
        # Recall distribution
        plt.subplot(2, 3, 3)
        sns.histplot(results['recall_scores'])
        plt.title('Recall Distribution')
        
        # Perplexity distribution
        plt.subplot(2, 3, 4)
        sns.histplot(results['perplexities'])
        plt.title('Perplexity Distribution')
        
        # Semantic similarity distribution
        plt.subplot(2, 3, 5)
        sns.histplot(results['semantic_similarities'])
        plt.title('Semantic Similarity Distribution')
        
        plt.tight_layout()
        
        return report



In [52]:
def qualitative_analysis(predicted_responses, true_responses, n_samples=5):
    """
    Perform qualitative analysis on random samples
    """
    indices = np.random.choice(len(predicted_responses), n_samples, replace=False)
    analysis = []
    
    evaluator = ChatbotEvaluator(None, None)  # Temporary instance for F1 calculation
    
    for idx in indices:
        metrics = evaluator.calculate_f1_metrics(true_responses[idx], predicted_responses[idx])
        analysis.append({
            'true_response': true_responses[idx],
            'predicted_response': predicted_responses[idx],
            'f1_score': metrics['f1'],
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'response_length': len(word_tokenize(predicted_responses[idx])),
            'notes': ''  # For manual analysis notes
        })
    
    return analysis