In [15]:
import nltk
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

conversational_models = [
    'microsoft/DialoGPT-small',
    'microsoft/DialoGPT-medium',
    'gorkemgoknar/gpt2chatbotenglish',
    'Vaibhav-rm/GPT2-Shri-v1'
]

prompts = [
    "How does photosynthesis work?",
    "Tell me a joke.",
    "What is the capital of France?",
    "Explain the theory of relativity.",
    "Any book recommendations?"
]

references = [
    "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll.",
    "Why did the chicken cross the road? To get to the other side!",
    "The capital of France is Paris.",
    "The theory of relativity, formulated by Albert Einstein, describes the relationships between space, time, and gravity.",
    "It depends on your interests. What genres do you like?"
]

def initialize_model_and_tokenizer(model_name):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    return model, tokenizer

def generate_response(prompt, model, tokenizer, max_length=50, temperature=0.7):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature, num_beams=5)
    generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_response



def calculate_rouge(reference, candidate):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    rouge_n_score = scores[0]['rouge-1']['f']
    rouge_l_score = scores[0]['rouge-l']['f']
    return rouge_n_score, rouge_l_score



def calculate_f1(reference, candidate):
    reference_set = set(reference.split())
    candidate_set = set(candidate.split())

    precision = len(reference_set.intersection(candidate_set)) / len(candidate_set)
    recall = len(reference_set.intersection(candidate_set)) / len(reference_set)

    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return f1


results = dict()


for model_name in conversational_models:
    model, tokenizer = initialize_model_and_tokenizer(model_name)
    
    rouge_n_scores = []
    rouge_l_scores = []
    f1_scores = []
    response_lengths = []

    for prompt, reference in zip(prompts, references):
        model_response = generate_response(prompt, model, tokenizer)

       
        rouge_n_score, rouge_l_score = calculate_rouge(reference, model_response)
        rouge_n_scores.append(rouge_n_score)
        rouge_l_scores.append(rouge_l_score)


       
        f1 = calculate_f1(reference, model_response)
        f1_scores.append(f1)

        response_lengths.append(len(model_response.split()))

    
    avg_rouge_n_score = sum(rouge_n_scores) / len(rouge_n_scores)
    avg_rouge_l_score = sum(rouge_l_scores) / len(rouge_l_scores)
    avg_f1_score = sum(f1_scores)/len(f1_scores)
    avg_response_length = sum(response_lengths) / len(response_lengths)

       

            

    results[model_name] = {
            
            "ROUGE-N": avg_rouge_n_score,
            "ROUGE-L": avg_rouge_l_score,
            "F1" : avg_f1_score,
            "Response Length": avg_response_length
        }

for model_name, scores in results.items():
    print(f"Model: {model_name}")
    print(f"ROUGE-N Score: {scores['ROUGE-N']:.4f}")
    print(f"ROUGE-L Score: {scores['ROUGE-L']:.4f}")
    print(f"F1 Score: {scores['F1']:.4f}")
    print(f"Average Response Length: {scores['Response Length']:.3f}")
        



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Model: microsoft/DialoGPT-small
ROUGE-N Score: 0.1353
ROUGE-L Score: 0.0902
F1 Score: 0.1364
Average Response Length: 6.800
Model: microsoft/DialoGPT-medium
ROUGE-N Score: 0.1974
ROUGE-L Score: 0.1506
F1 Score: 0.1612
Average Response Length: 16.600
Model: gorkemgoknar/gpt2chatbotenglish
ROUGE-N Score: 0.1694
ROUGE-L Score: 0.1361
F1 Score: 0.1480
Average Response Length: 24.000
Model: Vaibhav-rm/GPT2-Shri-v1
ROUGE-N Score: 0.1571
ROUGE-L Score: 0.1048
F1 Score: 0.1571
Average Response Length: 4.400


In [18]:
results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,ROUGE-N,ROUGE-L,F1,Response Length
microsoft/DialoGPT-small,0.135294,0.090196,0.136364,6.8
microsoft/DialoGPT-medium,0.197403,0.150649,0.161212,16.6
gorkemgoknar/gpt2chatbotenglish,0.169444,0.136111,0.148,24.0
Vaibhav-rm/GPT2-Shri-v1,0.157143,0.104762,0.157143,4.4


In [19]:
results_df.to_csv('Topsis_input.csv')