### Perplexity, BLEU and ROUGE Evaluation

In [27]:
pip install -q langchain-groq==0.2.0 langchain==0.3.0 langchain-community==0.3.0 datasets==3.0.0 evaluate rouge-score sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [57]:
from langchain_groq import ChatGroq
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
from sacrebleu import corpus_bleu, sentence_bleu
from rouge_score import rouge_scorer

### Using Lllama 3.1 8B

In [3]:
model_name ='llama-3.1-8b-instant'
groq_key ='key'

In [4]:
llm = ChatGroq(model_name=model_name, groq_api_key=groq_key, temperature=0.5, max_tokens=256)

In [5]:
llm.invoke('How are you?')

AIMessage(content="I'm functioning properly. What would you like to talk about or ask?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 39, 'total_tokens': 55, 'completion_time': 0.021333333, 'prompt_time': 0.003464444, 'queue_time': 0.019771504999999998, 'total_time': 0.024797777}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_9cb648b966', 'finish_reason': 'stop', 'logprobs': None}, id='run-790c614e-d261-4a69-88e9-7bec0f0a28f6-0', usage_metadata={'input_tokens': 39, 'output_tokens': 16, 'total_tokens': 55})

### Single conversation dataset

In [10]:
dataset = pd.read_csv('dataset.csv')[['User','Response']]

In [7]:
# dataset = Dataset.from_pandas(dataset)

#### Perplexity Formula for logprobs if available

In [8]:
def compute_perplexity(log_probs):
    total_log_prob = 0
    for log_prob in log_probs:
        total_log_prob += log_prob
    perplexity = np.exp(-total_log_prob / len(log_probs))
    return perplexity

### Generate AI Response

In [11]:
dataset['AIresponse'] = dataset['User'].apply(lambda x: llm.invoke(x).content)

In [15]:
dataset['AIresponse'] = dataset.AIresponse.replace('\n','',regex=True)

In [16]:
dataset

Unnamed: 0,User,Response,AIresponse
0,"Now there are three people A, B and C. I curre...","Based on the given information, A is the talle...",Based on the information given:1. A is taller ...
1,"Mary is Tom's mother, John is Tom's grandfathe...",John is Mary's father-in-law.,"To determine John's relationship to Mary, we n..."
2,A coin is heads up. Jesus does not flip the co...,The coin was flipped by Alejandra. So the coin...,Since the coin was initially heads up and Jesu...
3,There are two boxes. Box 1 is either full or e...,"Since Box 1 is full, based on the given condit...","If Box 1 is full, then according to the given ..."
4,I have five apples. I gave two to Emily and th...,You have zero apples now because you gave all ...,You initially had 5 apples. You gave 2 to Emil...
...,...,...,...
95,How much does it cost to build an app?,The cost of building an app can vary widely ba...,"The cost of building an app can vary widely, d..."
96,What's the best laptop for working from home?,"To give you the best recommendation, I need to...",The best laptop for working from home depends ...
97,Can I upgrade my phone?,I'd be happy to help with that! Could you plea...,"To determine if you can upgrade your phone, I'..."
98,How often should I replace my tires?,The frequency at which you should replace your...,The frequency of replacing tires depends on va...


### Utilising sentence bleu and rouge scorer to generate score

In [59]:
rouge_metric = rouge_scorer.RougeScorer(['rouge1','rougeL'],use_stemmer=False)

In [62]:
dataset['bleu_score'] = dataset.apply(lambda x: sentence_bleu(x['Response'],[x['AIresponse']],lowercase=True).score, axis=1)
dataset['rogue1_score_precision'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rouge1'][0], axis=1)
dataset['rogue1_score_recall'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rouge1'][1], axis=1)
dataset['rogue1_score_f1'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rouge1'][2], axis=1)
dataset['rogueL_score_precision'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rougeL'][0], axis=1)
dataset['rogueL_score_recall'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rougeL'][1], axis=1)
dataset['rogueL_score_f1'] = dataset.apply(lambda x: rouge_metric.score(x['AIresponse'],x['Response'])['rougeL'][2], axis=1)

### Analysing the score

In [63]:
dataset

Unnamed: 0,User,Response,AIresponse,bleu_score,rogue1_score_precision,rogue1_score_recall,rogue1_score_f1,rogueL_score_precision,rogueL_score_recall,rogueL_score_f1
0,"Now there are three people A, B and C. I curre...","Based on the given information, A is the talle...",Based on the information given:1. A is taller ...,3.736687,1.000000,0.265306,0.419355,0.923077,0.244898,0.387097
1,"Mary is Tom's mother, John is Tom's grandfathe...",John is Mary's father-in-law.,"To determine John's relationship to Mary, we n...",0.004317,0.714286,0.106383,0.185185,0.714286,0.106383,0.185185
2,A coin is heads up. Jesus does not flip the co...,The coin was flipped by Alejandra. So the coin...,Since the coin was initially heads up and Jesu...,7.863696,0.500000,0.416667,0.454545,0.350000,0.291667,0.318182
3,There are two boxes. Box 1 is either full or e...,"Since Box 1 is full, based on the given condit...","If Box 1 is full, then according to the given ...",29.263947,0.419355,0.812500,0.553191,0.387097,0.750000,0.510638
4,I have five apples. I gave two to Emily and th...,You have zero apples now because you gave all ...,You initially had 5 apples. You gave 2 to Emil...,0.522844,0.611111,0.164179,0.258824,0.555556,0.149254,0.235294
...,...,...,...,...,...,...,...,...,...,...
95,How much does it cost to build an app?,The cost of building an app can vary widely ba...,"The cost of building an app can vary widely, d...",0.208148,0.595238,0.161290,0.253807,0.500000,0.135484,0.213198
96,What's the best laptop for working from home?,"To give you the best recommendation, I need to...",The best laptop for working from home depends ...,0.024849,0.342105,0.075581,0.123810,0.236842,0.052326,0.085714
97,Can I upgrade my phone?,I'd be happy to help with that! Could you plea...,"To determine if you can upgrade your phone, I'...",0.097954,0.516129,0.132231,0.210526,0.322581,0.082645,0.131579
98,How often should I replace my tires?,The frequency at which you should replace your...,The frequency of replacing tires depends on va...,0.110838,0.523810,0.113990,0.187234,0.285714,0.062176,0.102128
