In [37]:
# Import the necessary libraries
import pandas as pd
import numpy as np 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict


In [38]:
# Load the KisanVaani English-only agriculture Q&A dataset via `datasets.load_dataset`.
ds = load_dataset("KisanVaani/agriculture-qa-english-only")
# Show the first training record to confirm the field names
# (the printed sample has 'question' and 'answers' keys).
print(ds["train"][0])

{'question': 'why is crop rotation important in farming?', 'answers': 'This helps to prevent soil erosion and depletion, and can also help to control pests and diseases'}


In [39]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_dataset(dataset):
    """
    Main preprocessing function that coordinates all preprocessing steps.

    Builds a DataFrame of question/answer pairs, cleans both columns with
    `clean_text`, and tokenizes them with the 'bert-base-uncased' tokenizer.

    Args:
        dataset: Iterable of mapping-like records. Each record must have a
            'question' key and either an 'answers' key (the column name used
            by the KisanVaani dataset loaded in this notebook) or an 'answer'
            key.

    Returns:
        Tuple `(encoded_data, tokenizer)` where `encoded_data` is the dict
        produced by `tokenize_data` and `tokenizer` is the tokenizer used.

    Raises:
        KeyError: If a record has no 'question', or neither 'answers' nor
            'answer'.
    """
    def _answer_text(item):
        # The dataset's column is 'answers' (plural); 'answer' is accepted as
        # a fallback so records using the singular key still work.
        return item['answers'] if 'answers' in item else item['answer']

    # Materialize once so one-shot iterables are not exhausted by the first pass.
    records = list(dataset)

    # Convert dataset to pandas DataFrame for easier manipulation
    df = pd.DataFrame({
        'question': [item['question'] for item in records],
        'answer': [_answer_text(item) for item in records]
    })

    # clean_text calls .lower() on each value, so rows with a missing question
    # or answer are dropped instead of crashing the whole pipeline.
    df = df.dropna(subset=['question', 'answer']).reset_index(drop=True)

    # Clean text
    df['question_cleaned'] = df['question'].apply(clean_text)
    df['answer_cleaned'] = df['answer'].apply(clean_text)

    # Initialize tokenizer (using BERT base model)
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the cleaned text
    encoded_data = tokenize_data(df, tokenizer)

    return encoded_data, tokenizer

def clean_text(text):
    """
    Normalize a raw string into lowercase, ASCII, letters-only text.

    Steps, in order: lowercase, strip HTML markup, fold unicode to ASCII
    (characters without an ASCII form are dropped), delete every character
    that is not a letter or whitespace, and collapse whitespace runs into
    single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()

    # Keep only the text content of any HTML present
    without_markup = BeautifulSoup(lowered, "html.parser").get_text()

    # NFKD decomposition separates base letters from accents; the accents
    # are then discarded by the ASCII encode with errors ignored
    decomposed = unicodedata.normalize('NFKD', without_markup)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('utf-8')

    # Digits and punctuation are deleted outright (not replaced by a space)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', ascii_text)

    # split()/join trims the ends and squeezes inner whitespace
    words = letters_only.split()
    return ' '.join(words)

def tokenize_data(df, tokenizer, max_length=128):
    """
    Encode the cleaned question and answer columns with the given tokenizer.

    Both columns are truncated and padded to `max_length` and requested as
    TensorFlow tensors. Questions are tokenized first, then answers.

    Args:
        df: DataFrame with 'question_cleaned' and 'answer_cleaned' columns.
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask' entries.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Dict with the question 'input_ids' and 'attention_mask', plus
        'labels' holding the answer 'input_ids'.
    """
    # Identical settings for both columns
    shared_kwargs = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )

    question_batch = tokenizer(df['question_cleaned'].tolist(), **shared_kwargs)
    answer_batch = tokenizer(df['answer_cleaned'].tolist(), **shared_kwargs)

    encoded = {field: question_batch[field] for field in ('input_ids', 'attention_mask')}
    encoded['labels'] = answer_batch['input_ids']
    return encoded

def create_train_val_test_split(encoded_data, train_size=0.7, val_size=0.15):
    """
    Split the encoded data into train, validation, and test sets.

    Row indices are split once (with a fixed random_state of 42) and the same
    indices are applied to every key, so the entries of each returned dict
    stay aligned row by row. Splitting indices rather than the values
    themselves also makes this work for TensorFlow tensors, which cannot be
    indexed with the integer arrays that `train_test_split` uses internally.

    Args:
        encoded_data: Dict mapping names to equally long row containers
            (NumPy arrays, pandas objects, lists/tuples, or tensors).
        train_size: Fraction of all rows assigned to the training set.
        val_size: Fraction of all rows assigned to the validation set; the
            remainder after train and validation becomes the test set.

    Returns:
        Tuple `(train_data, val_data, test_data)` of dicts with the same keys
        as `encoded_data`.

    Raises:
        ValueError: If the values of `encoded_data` differ in length, or if
            `train_test_split` rejects the requested fractions.
    """
    def _take_rows(values, row_ids):
        # Select rows while keeping the container type where possible.
        if isinstance(values, np.ndarray):
            return values[row_ids]
        if hasattr(values, 'iloc'):
            return values.iloc[row_ids]
        if isinstance(values, (list, tuple)):
            return [values[i] for i in row_ids]
        # Tensors reject integer-array indexing; tf.gather is the equivalent.
        return tf.gather(values, row_ids)

    if not encoded_data:
        return {}, {}, {}

    row_counts = {len(values) for values in encoded_data.values()}
    if len(row_counts) != 1:
        raise ValueError(
            "All entries of encoded_data must have the same number of rows, "
            f"got lengths {sorted(row_counts)}"
        )
    all_rows = np.arange(row_counts.pop())

    # First split into train and temp
    train_rows, temp_rows = train_test_split(
        all_rows,
        train_size=train_size,
        random_state=42
    )

    # Split temp into validation and test; val_size is expressed relative to
    # the full data, so rescale it to the size of the temp portion.
    val_ratio = val_size / (1 - train_size)
    val_rows, test_rows = train_test_split(
        temp_rows,
        train_size=val_ratio,
        random_state=42
    )

    train_data = {key: _take_rows(values, train_rows) for key, values in encoded_data.items()}
    val_data = {key: _take_rows(values, val_rows) for key, values in encoded_data.items()}
    test_data = {key: _take_rows(values, test_rows) for key, values in encoded_data.items()}

    return train_data, val_data, test_data

In [40]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import itertools
from datetime import datetime

class HyperparameterTuner:
    """
    Grid search over fine-tuning hyperparameters for a 't5-small' model.

    Each combination builds a fresh model, trains it on `train_data`,
    evaluates it on `val_data`, and appends one row to `self.results`.

    NOTE(review): the preprocessing cell of this notebook builds a
    'bert-base-uncased' tokenizer while this class trains 't5-small'. Token
    ids from one vocabulary are not meaningful to the other model, so confirm
    that the tokenizer used to encode the data matches the model.
    """

    # Keys every parameter combination must provide.
    _REQUIRED_PARAMS = ('learning_rate', 'batch_size', 'dropout_rate', 'epochs')

    def __init__(self, train_data, val_data, tokenizer):
        """
        Args:
            train_data: Dict of aligned arrays/tensors; passed to
                `tf.data.Dataset.from_tensor_slices` for training.
            val_data: Same structure as `train_data`; must contain 'labels'.
            tokenizer: Tokenizer providing `batch_decode`, used to turn token
                ids back into text for the metrics.
        """
        self.train_data = train_data
        self.val_data = val_data
        self.tokenizer = tokenizer
        self.results = []

    def create_model(self, learning_rate, dropout_rate):
        """
        Create model with specified hyperparameters

        Args:
            learning_rate: Learning rate for the Adam optimizer.
            dropout_rate: Dropout rate applied to the model configuration.

        Returns:
            A compiled 't5-small' sequence-to-sequence model.
        """
        # Drop Keras state left over from the previous grid-search run so
        # memory does not grow with every combination.
        tf.keras.backend.clear_session()

        # The dropout rate must be part of the config when the layers are
        # built; assigning model.config.dropout_rate after loading does not
        # change the already-constructed dropout layers.
        model = TFAutoModelForSeq2SeqLM.from_pretrained(
            't5-small',
            dropout_rate=dropout_rate
        )

        # Compile without an explicit loss: the model then uses its built-in
        # loss on the 'labels' entry of each batch. The string loss
        # 'sparse_categorical_crossentropy' expects probabilities, whereas
        # the model outputs raw logits.
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer)

        return model

    @staticmethod
    def _extract_logits(predictions):
        """Return the logits from a predict() result; plain arrays pass through."""
        if isinstance(predictions, dict):
            return predictions['logits']
        return getattr(predictions, 'logits', predictions)

    def train_and_evaluate(self, model, batch_size, epochs):
        """
        Train model and evaluate performance

        Args:
            model: Compiled model, as returned by `create_model`.
            batch_size: Batch size for both training and validation.
            epochs: Number of training epochs.

        Returns:
            Dict with 'val_loss' (lowest validation loss over all epochs),
            'accuracy' and 'f1_score'. The last two compare whole decoded
            strings, so a prediction only counts when it matches the
            reference text exactly.
        """
        # Convert data to tf.data.Dataset
        train_dataset = tf.data.Dataset.from_tensor_slices(self.train_data).batch(batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices(self.val_data).batch(batch_size)

        # Train model
        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=epochs,
            verbose=1
        )

        # Generate predictions on validation set. predict() returns a
        # structured output, so the logits have to be pulled out before argmax.
        val_predictions = model.predict(val_dataset)
        predicted_ids = tf.argmax(self._extract_logits(val_predictions), axis=-1)

        # Convert predictions to text
        predicted_text = self.tokenizer.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )
        actual_text = self.tokenizer.batch_decode(
            self.val_data['labels'],
            skip_special_tokens=True
        )

        # Calculate metrics
        metrics = {
            'val_loss': min(history.history['val_loss']),
            'accuracy': accuracy_score(actual_text, predicted_text),
            'f1_score': f1_score(actual_text, predicted_text, average='weighted')
        }

        return metrics

    def run_grid_search(self, param_grid):
        """
        Perform grid search over hyperparameter combinations

        Args:
            param_grid: Dict mapping 'learning_rate', 'batch_size',
                'dropout_rate' and 'epochs' to lists of candidate values.
                Additional keys are carried into the result rows.

        Raises:
            KeyError: If a required key is missing from `param_grid`.
        """
        # Fail before any model is built rather than midway through a run.
        missing = [name for name in self._REQUIRED_PARAMS if name not in param_grid]
        if missing:
            raise KeyError(f"param_grid is missing required keys: {missing}")

        # Generate all combinations of hyperparameters
        param_combinations = [dict(zip(param_grid.keys(), v))
                            for v in itertools.product(*param_grid.values())]

        for params in param_combinations:
            # Create and train model
            model = self.create_model(
                learning_rate=params['learning_rate'],
                dropout_rate=params['dropout_rate']
            )

            metrics = self.train_and_evaluate(
                model,
                batch_size=params['batch_size'],
                epochs=params['epochs']
            )

            # Store results
            self.results.append({
                'timestamp': datetime.now(),
                **params,
                **metrics
            })

    def get_results_df(self):
        """
        Return results as pandas DataFrame

        Returns:
            DataFrame with one row per evaluated combination, holding the
            timestamp, the hyperparameters and the metrics.
        """
        return pd.DataFrame(self.results)

# Hyperparameter search space: candidate values for each tunable setting
param_grid = dict(
    learning_rate=[1e-4, 3e-4, 1e-3],
    batch_size=[16, 32, 64],
    dropout_rate=[0.1, 0.2, 0.3],
    epochs=[3, 5],
)

In [41]:
# Extract questions and answers

# Collect the question texts and the answer texts of the training split into
# two parallel lists (same order, one entry per record).
questions = [item['question'] for item in ds['train']]
# Note: the dataset's answer field is named 'answers' (plural).
answers = [item['answers'] for item in ds['train']]

In [42]:
# Fine-tune a transformer model



In [43]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from tqdm import tqdm

class ChatbotEvaluator:
    """
    Scores chatbot responses against reference answers.

    Provides token-overlap precision/recall/F1, GPT-2 perplexity of the
    generated text, cosine similarity of model embeddings, and a summary
    report with distribution plots.
    """

    def __init__(self, model, tokenizer):
        """
        Args:
            model: Object exposing `encode(...)`, used for semantic similarity.
            tokenizer: Callable tokenizer paired with `model`.
        """
        self.model = model
        self.tokenizer = tokenizer
        # GPT-2 is only needed for perplexity, so it is loaded on first use
        # rather than on every construction.
        self._gpt2_model = None
        self._gpt2_tokenizer = None

    @property
    def gpt2_model(self):
        """GPT-2 language model used for perplexity; loaded on first access."""
        if getattr(self, '_gpt2_model', None) is None:
            self._gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
        return self._gpt2_model

    @gpt2_model.setter
    def gpt2_model(self, value):
        self._gpt2_model = value

    @property
    def gpt2_tokenizer(self):
        """Tokenizer matching `gpt2_model`; loaded on first access."""
        if getattr(self, '_gpt2_tokenizer', None) is None:
            self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        return self._gpt2_tokenizer

    @gpt2_tokenizer.setter
    def gpt2_tokenizer(self, value):
        self._gpt2_tokenizer = value

    def calculate_f1_metrics(self, reference, candidate):
        """
        Calculate F1, precision, and recall scores for a single response

        Both texts are lowercased and reduced to their sets of distinct
        tokens, so repeated words count once.

        Args:
            reference: The expected response text.
            candidate: The generated response text.

        Returns:
            Dict with 'f1', 'precision' and 'recall'; each is 0 when its
            denominator would be zero.
        """
        # Tokenize both reference and candidate
        reference_tokens = set(word_tokenize(reference.lower()))
        candidate_tokens = set(word_tokenize(candidate.lower()))

        # Calculate true positives, false positives, and false negatives
        true_positives = len(reference_tokens.intersection(candidate_tokens))
        false_positives = len(candidate_tokens - reference_tokens)
        false_negatives = len(reference_tokens - candidate_tokens)

        # Calculate precision
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

        # Calculate recall
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        # Calculate F1 score
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def calculate_perplexity(self, text):
        """
        Calculate perplexity using GPT-2 as a language model

        Args:
            text: The text to score.

        Returns:
            exp(mean token loss) as a float, or NaN when the text encodes to
            fewer than two tokens (no next-token prediction to score).
        """
        # Inputs are truncated to this many tokens (GPT-2's context window).
        max_length = 1024
        encodings = self.gpt2_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length
        )
        input_ids = encodings['input_ids']

        if input_ids.shape[-1] < 2:
            return float('nan')

        with torch.no_grad():
            # The model only computes a loss when labels are supplied; for
            # language modeling the labels are the input ids themselves.
            outputs = self.gpt2_model(**encodings, labels=input_ids)
            loss = outputs.loss

        return torch.exp(loss).item()

    def semantic_similarity(self, text1, text2):
        """
        Calculate semantic similarity between two texts using cosine similarity

        NOTE(review): what `self.model.encode` accepts and returns is not
        visible here; this assumes it yields one embedding per text - confirm
        against the model actually passed in.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            Cosine similarity as a float; 0.0 if either embedding has zero norm.
        """
        encoding1 = self.tokenizer(text1, return_tensors='tf', padding=True, truncation=True)
        encoding2 = self.tokenizer(text2, return_tensors='tf', padding=True, truncation=True)

        # Flatten so a (1, dim) batch of one behaves like a plain vector.
        embeddings1 = np.asarray(self.model.encode(encoding1), dtype=float).ravel()
        embeddings2 = np.asarray(self.model.encode(encoding2), dtype=float).ravel()

        norm_product = np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2)
        if norm_product == 0:
            return 0.0

        return float(np.dot(embeddings1, embeddings2) / norm_product)

    def evaluate_responses(self, true_responses, predicted_responses):
        """
        Comprehensive evaluation of model responses

        Args:
            true_responses: Iterable of reference texts.
            predicted_responses: Iterable of generated texts, paired with
                `true_responses` by position (extra items on either side are
                ignored, as with `zip`).

        Returns:
            Dict of per-response lists: 'f1_scores', 'precision_scores',
            'recall_scores', 'perplexities', 'semantic_similarities' and
            'response_lengths'.
        """
        results = {
            'f1_scores': [],
            'precision_scores': [],
            'recall_scores': [],
            'perplexities': [],
            'semantic_similarities': [],
            'response_lengths': []
        }

        for true_resp, pred_resp in tqdm(zip(true_responses, predicted_responses)):
            # Calculate F1, precision, and recall
            metrics = self.calculate_f1_metrics(true_resp, pred_resp)
            results['f1_scores'].append(metrics['f1'])
            results['precision_scores'].append(metrics['precision'])
            results['recall_scores'].append(metrics['recall'])

            # Calculate perplexity
            perplexity = self.calculate_perplexity(pred_resp)
            results['perplexities'].append(perplexity)

            # Calculate semantic similarity
            similarity = self.semantic_similarity(true_resp, pred_resp)
            results['semantic_similarities'].append(similarity)

            # Response length analysis
            results['response_lengths'].append(len(word_tokenize(pred_resp)))

        return results

    def generate_evaluation_report(self, results):
        """
        Generate comprehensive evaluation report with visualizations

        Args:
            results: Dict as returned by `evaluate_responses`.

        Returns:
            Dict with 'metrics' (the averages) and 'distributions' (the
            `results` passed in). A 2x3 grid of histograms is drawn on a new
            matplotlib figure as a side effect.
        """
        report = {
            'metrics': {
                'avg_f1': np.mean(results['f1_scores']),
                'avg_precision': np.mean(results['precision_scores']),
                'avg_recall': np.mean(results['recall_scores']),
                # Perplexity is NaN for responses too short to score; skip
                # those so one such response does not blank the average.
                'avg_perplexity': np.nanmean(results['perplexities']),
                'avg_similarity': np.mean(results['semantic_similarities']),
                'avg_length': np.mean(results['response_lengths'])
            },
            'distributions': results
        }

        # Create visualizations
        plt.figure(figsize=(15, 10))

        # F1 score distribution
        plt.subplot(2, 3, 1)
        sns.histplot(results['f1_scores'])
        plt.title('F1 Score Distribution')

        # Precision distribution
        plt.subplot(2, 3, 2)
        sns.histplot(results['precision_scores'])
        plt.title('Precision Distribution')

        # Recall distribution
        plt.subplot(2, 3, 3)
        sns.histplot(results['recall_scores'])
        plt.title('Recall Distribution')

        # Perplexity distribution
        plt.subplot(2, 3, 4)
        sns.histplot(results['perplexities'])
        plt.title('Perplexity Distribution')

        # Semantic similarity distribution
        plt.subplot(2, 3, 5)
        sns.histplot(results['semantic_similarities'])
        plt.title('Semantic Similarity Distribution')

        # Response length distribution (fills the sixth panel of the grid)
        plt.subplot(2, 3, 6)
        sns.histplot(results['response_lengths'])
        plt.title('Response Length Distribution')

        plt.tight_layout()

        return report



In [44]:
def qualitative_analysis(predicted_responses, true_responses, n_samples=5):
    """
    Perform qualitative analysis on random samples

    Draws up to `n_samples` distinct response pairs at random (using NumPy's
    global random state) and records their token-overlap metrics alongside
    the texts, leaving a 'notes' field for manual review.

    Args:
        predicted_responses: Indexable sequence of generated texts.
        true_responses: Indexable sequence of reference texts, paired with
            `predicted_responses` by position.
        n_samples: Number of pairs to sample. If fewer pairs are available,
            all of them are used.

    Returns:
        List of dicts with 'true_response', 'predicted_response', 'f1_score',
        'precision', 'recall', 'response_length' and 'notes'. Empty when
        there are no pairs to sample from.
    """
    # Only positions present in both sequences can be compared.
    population = min(len(predicted_responses), len(true_responses))
    if population == 0:
        return []

    # Sampling without replacement cannot draw more items than exist.
    sample_size = min(n_samples, population)
    indices = np.random.choice(population, sample_size, replace=False)
    analysis = []

    for idx in indices:
        # calculate_f1_metrics reads no instance state, so it is called
        # unbound; this avoids constructing an evaluator (and loading its
        # models) just for the F1 calculation.
        metrics = ChatbotEvaluator.calculate_f1_metrics(
            None, true_responses[idx], predicted_responses[idx]
        )
        analysis.append({
            'true_response': true_responses[idx],
            'predicted_response': predicted_responses[idx],
            'f1_score': metrics['f1'],
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'response_length': len(word_tokenize(predicted_responses[idx])),
            'notes': ''  # For manual analysis notes
        })

    return analysis