In [None]:
!pip install --upgrade transformers datasets bitsandbytes scikit-learn textstat language_tool_python matplotlib peft pandas sentence-transformers torch

In [1]:
import os
import json
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import get_peft_model, LoraConfig
import torch

def setup_environment():
    """Ensure the Hugging Face hub cache directory exists and return its path.

    Returns:
        str: The expanded ``~/.cache/huggingface/hub`` path.
    """
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir

def check_model_and_tokenizer(model_name):
    """Load a GPT-2 tokenizer and LM-head model and wrap the model with LoRA.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)`` on success. If anything raises while
        loading, the error is printed and ``(None, None)`` is returned.
    """
    try:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)

        if tokenizer.pad_token is None:
            # No pad token configured: reuse EOS so padded batches can be built.
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
            print("Pad token set to EOS token.")

        model = GPT2LMHeadModel.from_pretrained(model_name)

        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            # Declare the task so PEFT builds its causal-LM wrapper rather
            # than the generic one.
            task_type="CAUSAL_LM",
            # GPT-2 projection layers are Conv1D modules whose weights are
            # stored as (fan_in, fan_out); PEFT documents this flag for them.
            fan_in_fan_out=True,
        )
        model = get_peft_model(model, lora_config)

        print(f"Model and tokenizer for '{model_name}' loaded successfully.")
        return tokenizer, model
    except Exception as e:
        # Deliberate catch-all: callers test for (None, None) instead of
        # handling individual loading errors.
        print(f"Error loading model or tokenizer: {e}")
        return None, None

def check_data_format(dataset_path):
    """Load a JSON dataset and verify it is a list of question/answer dicts.

    Args:
        dataset_path: Path of the JSON file to read.

    Returns:
        list | None: The parsed list when every item is a dict containing at
        least the 'question' and 'answer' keys. On any failure (unreadable
        file, invalid JSON, wrong structure) the error is printed and
        ``None`` is returned.
    """
    try:
        # Read explicitly as UTF-8 instead of the platform default encoding,
        # which can mis-decode non-ASCII text on some systems.
        with open(dataset_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            raise ValueError("Data should be a list of dictionaries.")

        required_fields = {'question', 'answer'}
        for item in data:
            if not required_fields.issubset(item.keys()):
                raise ValueError(f"Missing required fields in item: {item}")

        print("Data format is valid.")
        return data
    except Exception as e:
        # Deliberate catch-all: callers only test the result for None.
        print(f"Error checking data format: {e}")
        return None

def preprocess_function(examples, tokenizer, max_length=512):
    """Tokenize question/answer pairs into causal-LM training features.

    A causal language model predicts each token of the sequence it is fed, so
    the labels must be the same sequence as the inputs. Each question is
    therefore joined with its answer (separated by a newline and terminated
    by the tokenizer's EOS token) and tokenized once, rather than tokenizing
    question and answer as two unrelated padded sequences.

    Args:
        examples: Mapping with 'question' and 'answer' entries; either lists
            of strings (batched) or two single strings.
        tokenizer: Callable tokenizer exposing ``eos_token``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        dict: Tensors for each tokenizer output plus 'labels', where padded
        positions are set to -100.
    """
    questions = examples['question']
    answers = examples['answer']
    single_example = isinstance(questions, str)
    if single_example:
        questions, answers = [questions], [answers]

    eos = tokenizer.eos_token or ""
    texts = [f"{question}\n{answer}{eos}" for question, answer in zip(questions, answers)]
    encoded = tokenizer(texts, max_length=max_length, truncation=True, padding='max_length')

    # Padding must not contribute to the loss; -100 is the label value the
    # loss ignores. The attention mask is used (not the pad id) because the
    # pad token may be the same id as EOS.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]

    model_inputs = {key: torch.tensor(value) for key, value in encoded.items()}
    model_inputs['labels'] = torch.tensor(labels)

    if single_example:
        # Mirror the input shape: one example in, one un-batched row out.
        model_inputs = {key: value[0] for key, value in model_inputs.items()}
    return model_inputs

def preprocess_and_train_model(tokenizer, model, data, cache_dir, hyperparameters=None):
    """Tokenize ``data``, fine-tune ``model`` with the HF Trainer and save it.

    Args:
        tokenizer: Tokenizer used for preprocessing, collation and saving.
        model: Model to fine-tune (trained in place).
        data: Records with 'question' and 'answer' keys.
        cache_dir: Base directory for the 'results', 'logs' and
            'fine-tuned-gpt2' sub-directories.
        hyperparameters: Optional dict overriding any of 'batch_size',
            'num_epochs', 'learning_rate', 'warmup_steps', 'weight_decay',
            'logging_steps', 'save_steps' and 'eval_steps'. Missing keys fall
            back to the defaults below.

    Returns:
        None. The model and tokenizer are written to
        ``<cache_dir>/fine-tuned-gpt2``.
    """
    defaults = {
        'batch_size': 8,
        'num_epochs': 5,
        'learning_rate': 2e-5,
        'warmup_steps': 1000,
        'weight_decay': 0.01,
        'logging_steps': 100,
        'save_steps': 5000,
        'eval_steps': 1000,
    }
    # Single source of truth for defaults; caller-supplied keys win.
    settings = {**defaults, **(hyperparameters or {})}

    df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)

    # Drop every raw column, not just question/answer: records may carry extra
    # fields, and with remove_unused_columns=False they would otherwise reach
    # the collator, which can only batch tokenized features.
    tokenized_dataset = dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=dataset.column_names,
    )

    # Split dataset into training and validation sets
    split = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset = split['train']
    eval_dataset = split['test']

    training_args = TrainingArguments(
        per_device_train_batch_size=settings['batch_size'],
        per_device_eval_batch_size=settings['batch_size'],
        output_dir=os.path.join(cache_dir, 'results'),
        num_train_epochs=settings['num_epochs'],
        learning_rate=settings['learning_rate'],
        warmup_steps=settings['warmup_steps'],
        weight_decay=settings['weight_decay'],
        logging_dir=os.path.join(cache_dir, 'logs'),
        logging_steps=settings['logging_steps'],
        save_steps=settings['save_steps'],
        save_total_limit=5,
        # NOTE(review): newer transformers releases name this argument
        # `eval_strategy` - confirm against the installed version.
        evaluation_strategy="steps",
        eval_steps=settings['eval_steps'],
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        report_to="tensorboard",
        # fp16 mixed precision needs a GPU; hard-coding True makes
        # TrainingArguments fail on CPU-only machines.
        fp16=torch.cuda.is_available(),
        remove_unused_columns=False
    )

    # NOTE(review): with mlm=False this collator presumably rebuilds 'labels'
    # from 'input_ids' for each batch - confirm, since that would replace the
    # labels produced during preprocessing.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    fine_tuned_model_dir = os.path.join(cache_dir, 'fine-tuned-gpt2')
    model.save_pretrained(fine_tuned_model_dir)
    tokenizer.save_pretrained(fine_tuned_model_dir)
    print(f"Fine-tuned model saved to {fine_tuned_model_dir}")

def main():
    """Interactive entry point: load GPT-2, validate the dataset, then train.

    Aborts with a message when the model/tokenizer cannot be loaded or the
    dataset is missing/invalid. Otherwise asks whether to run a preliminary
    fine-tuning on the first 10 records, then whether to run the full one.
    """
    model_name = "gpt2"
    dataset_path = '/kaggle/input/tuning/tuning.json'
    cache_dir = setup_environment()

    tokenizer, model = check_model_and_tokenizer(model_name)
    if not (tokenizer and model):
        print("Model or tokenizer failed to load. Aborting.")
        return

    data = check_data_format(dataset_path)
    if not data:
        print("Data format is not valid. Aborting.")
        return

    wants_preliminary = input(
        "Do you want to perform preliminary fine-tuning with a subset of the data? (yes/no): "
    ).strip().lower() == 'yes'
    if wants_preliminary:
        print("Performing preliminary fine-tuning.")
        # Only the first 10 records are used for the preliminary run.
        preprocess_and_train_model(tokenizer, model, data[:10], cache_dir)

    wants_full = input(
        "Do you want to proceed with full fine-tuning? (yes/no): "
    ).strip().lower() == 'yes'
    if not wants_full:
        print("Fine-tuning aborted.")
        return

    print("Proceeding to full fine-tuning.")
    preprocess_and_train_model(tokenizer, model, data, cache_dir)


# Run the workflow only when executed directly (script or notebook cell).
if __name__ == '__main__':
    main()


ModuleNotFoundError: No module named 'peft'

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import textstat
import language_tool_python
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Module-level state shared by the evaluation helpers below
# (generate_text reads `tokenizer` and `model`; check_grammar reads `language_tool`).

# Define the cache directory and model paths
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
model_name = "gpt2"  # Using base GPT-2 model for testing
# Location where a fine-tuned model would live; not referenced again in this
# cell, because the base model is loaded below instead.
fine_tuned_model_dir = os.path.join(cache_dir, 'fine-tuned-gpt2')

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = GPT2LMHeadModel.from_pretrained(model_name, cache_dir=cache_dir)  # Changed to base model for testing

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize language tool for grammar checking (US English rules)
language_tool = language_tool_python.LanguageTool('en-US')

def generate_text(prompt, max_tokens=200):
    """Generate a continuation of ``prompt`` with the module-level model.

    Uses the module-level ``tokenizer`` and ``model``. The prompt and the
    full decoded output are printed for debugging.

    Args:
        prompt: Text to continue.
        max_tokens: Maximum number of newly generated tokens.

    Returns:
        str: The decoded continuation only (prompt tokens removed), stripped
        of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)
    prompt_length = inputs['input_ids'].shape[1]
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        # Bounds only the continuation; same budget as the previous
        # max_length = max_tokens + prompt length.
        max_new_tokens=max_tokens,
        num_return_sequences=1,
        # temperature/top_p only take effect in sampling mode; without this
        # flag they are ignored and decoding is greedy.
        do_sample=True,
        temperature=0.7,
        top_p=0.85,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        no_repeat_ngram_size=2,
        repetition_penalty=1.2
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Debug print statements
    print("Prompt:")
    print(prompt)
    print("\nGenerated Text:")
    print(generated_text)
    print("-" * 50)

    # Drop the prompt by token count instead of by string prefix: decoding
    # does not always reproduce the prompt text exactly, in which case a
    # startswith() check would leave the prompt inside the returned answer.
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return continuation.strip()

def compute_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Cosine similarity of the two TF-IDF vectors, or 0.0 when no
        comparison is possible (a blank text, or no usable terms at all).
    """
    # A blank text has no terms, so it cannot share any with the other text.
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        vectors = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # The vectorizer raises when the vocabulary is empty, e.g. when both
        # texts contain only punctuation or single-character tokens. Treat
        # that as "nothing in common" instead of aborting the evaluation.
        return 0.0
    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

def check_grammar(text):
    """Count the issues the module-level LanguageTool checker finds in ``text``."""
    return len(language_tool.check(text))  # Number of grammar issues

def _plot_metric_histograms(similarities, grammar_errors, lengths, deviations):
    """Draw the 2x2 histogram grid, save it as performance_metrics.png and show it."""
    panels = [
        (similarities, 'green', 'Similarity Score', 'Distribution of Similarity Scores'),
        (grammar_errors, 'red', 'Number of Grammar Errors', 'Distribution of Grammar Errors'),
        (lengths, 'blue', 'Length of Generated Text (words)', 'Distribution of Generated Text Lengths'),
        (deviations, 'purple', 'Deviation from Expected Answer Length', 'Distribution of Answer Deviation'),
    ]

    plt.figure(figsize=(15, 10))
    for position, (values, color, xlabel, title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.hist(values, bins=20, color=color, edgecolor='black')
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.title(title)

    plt.tight_layout()
    plt.savefig('performance_metrics.png')  # Save the plot as a PNG file
    plt.show()  # Display the plot


def _print_plot_explanations():
    """Print the fixed description of each of the four histograms."""
    print("\nPlot Explanations:")
    print("1. **Distribution of Similarity Scores**: This plot shows how similar the generated answers are to the expected answers based on cosine similarity scores. Higher scores indicate better similarity. If the scores are generally low, the model's responses might not be closely matching the expected answers.")
    print("2. **Distribution of Grammar Errors**: This histogram illustrates the number of grammar errors detected in the generated texts. Fewer errors suggest better grammatical quality. A high number of errors may indicate issues with the model's ability to generate grammatically correct text.")
    print("3. **Distribution of Generated Text Lengths**: This plot represents the lengths of the generated texts in terms of word count. It helps to understand the verbosity of the generated answers. If the lengths vary significantly, it could mean the model is generating excessively short or long responses.")
    print("4. **Distribution of Answer Deviation**: This shows how much the length of the generated text deviates from the expected answer length. Smaller deviations indicate more precise text generation. Larger deviations might suggest that the model is not generating responses with appropriate length.")


def _print_overall_analysis(similarities, grammar_errors, lengths, deviations):
    """Print the metric averages and a threshold-based verdict for each."""
    avg_similarity = np.mean(similarities)
    avg_grammar_errors = np.mean(grammar_errors)
    avg_length = np.mean(lengths)
    avg_deviation = np.mean(deviations)

    print("\nOverall Performance Analysis:")
    print(f"Average Similarity Score: {avg_similarity:.2f}")
    print(f"Average Number of Grammar Errors: {avg_grammar_errors:.2f}")
    print(f"Average Length of Generated Text: {avg_length:.2f} words")
    print(f"Average Deviation from Expected Answer Length: {avg_deviation:.2f} words")

    if avg_similarity < 0.5:
        print("The model's generated responses are generally not similar to the expected answers. Improvement in the model's training or fine-tuning might be required.")
    else:
        print("The model's generated responses are fairly similar to the expected answers.")

    if avg_grammar_errors > 5:
        print("The generated texts have a high number of grammar errors. Enhancing the model's ability to generate grammatically correct sentences could be beneficial.")
    else:
        print("The generated texts have a relatively low number of grammar errors.")

    if avg_deviation > 10:
        print("The model's responses have a significant deviation in length compared to expected answers. Adjusting the model's parameters or prompt length may help in generating more appropriately sized responses.")
    else:
        print("The model's responses are of appropriate length compared to expected answers.")


def evaluate_performance(data_file, max_tokens=50):
    """Generate an answer per question, score it, plot and summarise the scores.

    For each record the generated answer is compared with the expected one
    (TF-IDF cosine similarity, grammar issue count, word count and absolute
    word-count difference). Histograms of the four metrics are saved to
    'performance_metrics.png' and shown, followed by printed averages.

    Args:
        data_file: Path to a JSON list of records with 'question' and
            'answer' keys.
        max_tokens: Generation budget forwarded to ``generate_text``.

    Returns:
        list[dict]: One entry per record with 'prompt', 'expected_answer' and
        'generated_text'. These are the same generations that were scored.
        Empty when the file contains no records.
    """
    with open(data_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not data:
        # Averages over nothing would be NaN and produce a misleading verdict.
        print("No evaluation items found; skipping evaluation.")
        return []

    similarities = []
    grammar_errors = []
    lengths = []
    deviations = []
    results = []

    # Single pass: each answer is generated once and reused for both the
    # metrics and the returned results, so they describe the same text.
    for item in data:
        prompt = item['question']
        expected_answer = item['answer']
        generated_text = generate_text(prompt, max_tokens)
        text_length = len(generated_text.split())

        similarities.append(compute_similarity(expected_answer, generated_text))
        grammar_errors.append(check_grammar(generated_text))
        lengths.append(text_length)
        deviations.append(abs(len(expected_answer.split()) - text_length))
        results.append({
            'prompt': prompt,
            'expected_answer': expected_answer,
            'generated_text': generated_text
        })

    _plot_metric_histograms(similarities, grammar_errors, lengths, deviations)
    _print_plot_explanations()
    _print_overall_analysis(similarities, grammar_errors, lengths, deviations)

    return results

# Dataset of question/answer records to evaluate against
data_file = '/kaggle/input/t-small/t-small.json'

# Run the evaluation (plots and summary are produced as a side effect)
performance_results = evaluate_performance(data_file)

# Dump every prompt with its expected and generated answer, framed by rulers
for result in performance_results:
    print(
        "\n" + "=" * 50,
        "Prompt:",
        result['prompt'],
        "\nExpected Answer:",
        result['expected_answer'],
        "\nGenerated Text:",
        result['generated_text'],
        "=" * 50,
        sep="\n",
    )


In [None]:
import os
import json
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from transformers import pipeline

def setup_environment():
    """Ensure the Hugging Face hub cache directory exists and return its path.

    Returns:
        str: The expanded ``~/.cache/huggingface/hub`` path.
    """
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir

def check_model_and_tokenizer(model_name):
    """Load a GPT-2 tokenizer and LM-head model and wrap the model with LoRA.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)`` on success. If anything raises while
        loading, the error is printed and ``(None, None)`` is returned.
    """
    try:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)

        if tokenizer.pad_token is None:
            # No pad token configured: reuse EOS so padded batches can be built.
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
            print("Pad token set to EOS token.")

        model = GPT2LMHeadModel.from_pretrained(model_name)

        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            # Declare the task so PEFT builds its causal-LM wrapper rather
            # than the generic one.
            task_type="CAUSAL_LM",
            # GPT-2 projection layers are Conv1D modules whose weights are
            # stored as (fan_in, fan_out); PEFT documents this flag for them.
            fan_in_fan_out=True,
        )
        model = get_peft_model(model, lora_config)

        print(f"Model and tokenizer for '{model_name}' loaded successfully.")
        return tokenizer, model
    except Exception as e:
        # Deliberate catch-all: callers test for (None, None) instead of
        # handling individual loading errors.
        print(f"Error loading model or tokenizer: {e}")
        return None, None

def check_data_format(dataset_path):
    """Load a JSON dataset and verify it is a list of question/answer dicts.

    Args:
        dataset_path: Path of the JSON file to read.

    Returns:
        list | None: The parsed list when every item is a dict containing at
        least the 'question' and 'answer' keys. On any failure (unreadable
        file, invalid JSON, wrong structure) the error is printed and
        ``None`` is returned.
    """
    try:
        # Read explicitly as UTF-8 instead of the platform default encoding,
        # which can mis-decode non-ASCII text on some systems.
        with open(dataset_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            raise ValueError("Data should be a list of dictionaries.")

        required_fields = {'question', 'answer'}
        for item in data:
            if not required_fields.issubset(item.keys()):
                raise ValueError(f"Missing required fields in item: {item}")

        print("Data format is valid.")
        return data
    except Exception as e:
        # Deliberate catch-all: callers only test the result for None.
        print(f"Error checking data format: {e}")
        return None

def preprocess_function(examples, tokenizer, max_length=512):
    """Tokenize question/answer pairs into causal-LM features (plain lists).

    A causal language model predicts each token of the sequence it is fed, so
    the labels must be the same sequence as the inputs. Each question is
    therefore joined with its answer (separated by a newline and terminated
    by the tokenizer's EOS token) and tokenized once, rather than tokenizing
    question and answer as two unrelated padded sequences.

    Values are returned as plain Python lists for every key. No tensor
    round-trip is involved, so a batch holding a single example keeps its
    batch dimension.

    Args:
        examples: Mapping with 'question' and 'answer' entries; either lists
            of strings (batched) or two single strings.
        tokenizer: Callable tokenizer exposing ``eos_token``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        dict: The tokenizer outputs plus 'labels', where padded positions
        are set to -100.
    """
    questions = examples['question']
    answers = examples['answer']
    single_example = isinstance(questions, str)
    if single_example:
        questions, answers = [questions], [answers]

    eos = tokenizer.eos_token or ""
    texts = [f"{question}\n{answer}{eos}" for question, answer in zip(questions, answers)]
    encoded = tokenizer(texts, max_length=max_length, truncation=True, padding='max_length')

    model_inputs = {key: value for key, value in encoded.items()}
    # Padding must not contribute to the loss; -100 is the label value the
    # loss ignores. The attention mask is used (not the pad id) because the
    # pad token may be the same id as EOS.
    model_inputs['labels'] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]

    if single_example:
        # Mirror the input shape: one example in, one un-batched row out.
        model_inputs = {key: value[0] for key, value in model_inputs.items()}
    return model_inputs

def analyze_model_performance(model, tokenizer, data, sample_size=5):
    """Print generated answers next to the expected ones for a small sample.

    Args:
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        data: Records with 'question' and 'answer' keys.
        sample_size: How many leading records to try (default 5).

    Returns:
        None. Results are printed.
    """
    test_sample = data[:sample_size]
    if not test_sample:
        print("No data available for model performance analysis.")
        return

    # The sample is fed to the pipeline as raw text, so no tokenized dataset
    # is built here; the one built previously was never read.
    text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

    print("Model Performance Analysis:")
    for item in test_sample:
        question = item['question']
        expected_answer = item['answer']
        # max_new_tokens bounds only the continuation; max_length also counts
        # the prompt, leaving little or no room for an answer on long questions.
        generated_answer = text_generator(question, max_new_tokens=50, num_return_sequences=1)[0]['generated_text']

        print(f"Question: {question}")
        print(f"Expected Answer: {expected_answer}")
        print(f"Generated Answer: {generated_answer}")
        print()

def suggest_hyperparameters_and_tokenizer_settings(data):
    """Print fixed hyperparameter suggestions plus a length cap from the data.

    Batch size, epoch count and learning rate are constants. Only the
    suggested maximum length depends on ``data``: the longest question,
    capped at 512.

    Args:
        data: Records with a 'question' key. May be empty, in which case the
            512 cap itself is suggested.

    Returns:
        None. Suggestions are printed.
    """
    # NOTE: length is measured in characters, not tokens, so the printed
    # "Maximum Token Length" is only a rough proxy for the token count.
    longest_question = max((len(item['question']) for item in data), default=512)

    # Suggested hyperparameters based on typical scenarios
    suggested_batch_size = 4
    suggested_num_epochs = 5
    suggested_learning_rate = 3e-5
    suggested_max_length = min(longest_question, 512)

    print("Suggested Hyperparameters:")
    print(f"Batch Size: {suggested_batch_size}")
    print(f"Number of Epochs: {suggested_num_epochs}")
    print(f"Learning Rate: {suggested_learning_rate}")
    print(f"Maximum Token Length: {suggested_max_length}")

def main():
    """Entry point: load GPT-2, validate the dataset, then print an analysis.

    Aborts with a message when the model/tokenizer cannot be loaded or the
    dataset is missing/invalid. Otherwise prints sample generations followed
    by the suggested hyperparameters.
    """
    model_name = "gpt2"
    dataset_path = '/kaggle/input/t-small/t-small.json'
    cache_dir = setup_environment()

    tokenizer, model = check_model_and_tokenizer(model_name)
    if not (tokenizer and model):
        print("Model or tokenizer failed to load. Aborting.")
        return

    data = check_data_format(dataset_path)
    if not data:
        print("Data format is not valid. Aborting.")
        return

    analyze_model_performance(model, tokenizer, data)
    suggest_hyperparameters_and_tokenizer_settings(data)


# Run the workflow only when executed directly (script or notebook cell).
if __name__ == '__main__':
    main()
