# Metrics

In [1]:
!pip install rouge
!pip install gensim
!pip3 install wmd
!pip install POT
!pip install bert-score

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Collecting wmd
  Downloading wmd-1.3.2.tar.gz (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.6/104.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wmd
  Building wheel for wmd (setup.py) ... [?25l[?25hdone
  Created wheel for wmd: filename=wmd-1.3.2-cp310-cp310-linux_x86_64.whl size=1150981 sha256=ad54b59470542f8f451b0d9c0089c1808fbadf64634fcc26078eb863436f1f53
  Stored in directory: /root/.cache/pip/wheels/7e/09/7f/ebf39133074a0411263ce255a480293fb2e91bceaeed6a4141
Successfully built wmd
Installing collected packages: wmd
Successfully installed wmd-1.3.2
Collecting POT
  Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
from rouge import Rouge
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize
import gensim
from wmd import WMD
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from bert_score import score
import pandas as pd

In [3]:
import nltk

In [4]:
# NLTK resources used by the metric helpers below: the Punkt tokenizer data
# for word_tokenize and WordNet for METEOR.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than the pickled
# 'punkt' models; fetch both so word_tokenize works on either version.
nltk.download('punkt_tab')
# Kept last so the cell still displays the result of the WordNet download.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim.downloader (`api`).
# The resulting `model` is passed as `w2v_model` to calculate_wmd_distance in the evaluation cells below.
model = api.load('word2vec-google-news-300')



## ROUGE

In [6]:
def calculate_rouge_scores(reference_text, model_response):
    """
    Score a model response against a reference text with ROUGE.

    Args:
      reference_text: The ground-truth text to compare against.
      model_response: The generated text being evaluated.

    Returns:
      The result of ``Rouge.get_scores(..., avg=True)``: a dictionary keyed by
      'rouge-1', 'rouge-2' and 'rouge-l', each holding the 'f', 'p' and 'r' values.
    """
    scorer = Rouge(
        metrics=['rouge-1', 'rouge-2', 'rouge-l'],
        stats=['f', 'p', 'r'],
    )
    # get_scores takes the hypothesis first and the reference second.
    return scorer.get_scores(model_response, reference_text, avg=True)

## METEOR

In [7]:
def calculate_meteor_score(reference, hypothesis):
    """
    Compute the METEOR score of a hypothesis against a single reference.

    Args:
      reference (str): Ground-truth text.
      hypothesis (str): Machine-generated text to evaluate.

    Returns:
      float: METEOR score of the hypothesis with respect to the reference.
    """
    # Both texts are tokenized up front; the single reference is wrapped in a
    # list because meteor_score takes a list of references.
    tokenized_references = [word_tokenize(reference)]
    tokenized_hypothesis = word_tokenize(hypothesis)

    return meteor_score.meteor_score(tokenized_references, tokenized_hypothesis)

## WMD

In [8]:
def calculate_wmd_distance(doc1, doc2, w2v_model):
    """
    Word Mover's Distance between two documents under a pre-trained word2vec model.

    Args:
        doc1 (str): First document.
        doc2 (str): Second document.
        w2v_model (gensim.models.KeyedVectors): Pre-trained Word2Vec vectors; must
            expose ``key_to_index`` and ``wmdistance``.

    Returns:
        float: The value returned by ``w2v_model.wmdistance`` for the two token lists.
    """
    vocabulary = w2v_model.key_to_index

    def known_tokens(text):
        # Lower-case, tokenize, and keep only tokens present in the model's vocabulary.
        return [token for token in word_tokenize(text.lower()) if token in vocabulary]

    # Arguments are evaluated left to right, so doc1 is tokenized before doc2.
    return w2v_model.wmdistance(known_tokens(doc1), known_tokens(doc2))

## BERTScore

In [9]:
def calculate_bert_score(candidates, references, lang='en', model_type=None):
    """
    Compute BERTScore for candidate texts against their reference texts.

    Args:
        candidates (list of str): Machine-generated texts.
        references (list of str): Reference texts, paired with ``candidates`` by position.
        lang (str): Language code forwarded to ``score`` (default 'en').
        model_type (str): Name of the model to score with. When falsy, this function
            picks 'bert-base-uncased' for English and 'bert-base-multilingual-cased'
            for any other language.

    Returns:
        tuple: ``(P, R, F1)`` exactly as returned by ``score`` -- precision, recall and
        F1 for the candidate-reference pairs. The notebook reads single-pair results
        with ``.item()``.
    """
    # Fall back to this notebook's own per-language default when no model is given.
    if not model_type:
        if lang == 'en':
            model_type = 'bert-base-uncased'
        else:
            model_type = 'bert-base-multilingual-cased'

    precision, recall, f1 = score(
        candidates,
        references,
        lang=lang,
        model_type=model_type,
        verbose=True,
    )
    return precision, recall, f1

## VW Taos 2023

### 1. When will the alarm be triggered?

In [10]:
# Question 1 inputs: `reference_text` is the expected answer and `model_response` is the generated answer under evaluation.
# NOTE(review): the reference is presumably taken from the VW Taos 2023 owner's manual -- confirm and record the source.
reference_text = "When opening a mechanically unlocked door with the vehicle key; When opening the front hood; When opening the rear trunk; When turning on the ignition with an invalid vehicle key."
model_response = "1. Unauthorized Entry: The alarm will be triggered if any door, the hood, or the trunk is opened without using the key or remote control. 2. Ignition Attempt: If someone tries to start the car without the correct key, the alarm will go off. 3. Movement Inside the Car: The interior motion sensors will trigger the alarm if there is movement inside the car when it is locked. 4. Tilt Sensor: The alarm will be activated if the car is tilted, which could indicate an attempt to tow the vehicle or steal the wheels. To ensure the alarm system functions correctly, make sure: All doors, the hood, and the trunk are properly closed when locking the car. The key or remote control is used to lock and unlock the vehicle."

In [11]:
# ROUGE-1/2/L for question 1; only the 'rouge-l' entry is read by the summary table cell.
scores = calculate_rouge_scores(reference_text, model_response)

In [12]:
# METEOR for question 1.
# NOTE(review): `meteor` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
meteor = calculate_meteor_score(reference_text, model_response)

In [13]:
# Word Mover's Distance for question 1, using the Word2Vec vectors held in `model`.
# NOTE(review): `distance` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
distance = calculate_wmd_distance(reference_text, model_response, model)

In [14]:
# BERTScore for question 1: one candidate/reference pair, each wrapped in a list as the helper expects.
# P, R and F1 are read with .item() in the summary table cell.
P, R, F1 = calculate_bert_score([model_response], [reference_text])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.68 seconds, 0.37 sentences/sec


In [15]:
# Summary table for this question: ROUGE-L and BERTScore precision / recall / F1,
# each rendered as a two-decimal string.
data = {
    "Metric": ["ROUGE-L", "BERTScore"],
    "Precision": [format(scores['rouge-l']['p'], ".2f"), format(P.item(), ".2f")],
    "Recall": [format(scores['rouge-l']['r'], ".2f"), format(R.item(), ".2f")],
    "F1": [format(scores['rouge-l']['f'], ".2f"), format(F1.item(), ".2f")],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Metric,Precision,Recall,F1
0,ROUGE-L,0.05,0.2,0.08
1,BERTScore,0.49,0.62,0.55


### 2. When is the air recirculation mode turned off?

In [16]:
# Question 2 inputs: `reference_text` is the expected answer and `model_response` is the generated answer under evaluation.
# Both names are reassigned here, replacing the previous question's texts.
reference_text = "The air recirculation mode is turned off in the following situations: When the defogging function is activated; When a sensor detects that the vehicle windows may fog up."
model_response = "1. Automatic Deactivation: The air recirculation mode may automatically turn off in certain conditions to ensure optimal air quality inside the vehicle. This can happen when: The defrost function is activated. The outside temperature is very low, and the system needs to prevent window fogging. The system detects high levels of humidity inside the vehicle. 2. Manual Deactivation: You can manually turn off the air recirculation mode by pressing the air recirculation button on the climate control panel. This will allow fresh air from outside to enter the cabin. 3. System Settings: Some advanced climate control systems may have settings that allow you to customize when the air recirculation mode is activated or deactivated. Check your vehicle's manual for specific instructions on how to access and adjust these settings."

In [17]:
# ROUGE-1/2/L for question 2 (replaces the previous `scores`); only 'rouge-l' is read by the summary table cell.
scores = calculate_rouge_scores(reference_text, model_response)

In [18]:
# METEOR for question 2 (replaces the previous `meteor`).
# NOTE(review): `meteor` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
meteor = calculate_meteor_score(reference_text, model_response)

In [19]:
# Word Mover's Distance for question 2, using the Word2Vec vectors held in `model`.
# NOTE(review): `distance` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
distance = calculate_wmd_distance(reference_text, model_response, model)

In [20]:
# BERTScore for question 2: one candidate/reference pair, each wrapped in a list as the helper expects.
# P, R and F1 are read with .item() in the summary table cell.
P, R, F1 = calculate_bert_score([model_response], [reference_text])

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.97 seconds, 1.03 sentences/sec


In [21]:
# Summary table for this question: ROUGE-L and BERTScore precision / recall / F1,
# each rendered as a two-decimal string.
data = {
    "Metric": ["ROUGE-L", "BERTScore"],
    "Precision": [format(scores['rouge-l']['p'], ".2f"), format(P.item(), ".2f")],
    "Recall": [format(scores['rouge-l']['r'], ".2f"), format(R.item(), ".2f")],
    "F1": [format(scores['rouge-l']['f'], ".2f"), format(F1.item(), ".2f")],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Metric,Precision,Recall,F1
0,ROUGE-L,0.13,0.46,0.2
1,BERTScore,0.56,0.7,0.62


### 3. When should the seat heating not be turned on?

In [22]:
# Question 3 inputs: `reference_text` is the expected answer and `model_response` is the generated answer under evaluation.
# Both names are reassigned here, replacing the previous question's texts.
reference_text = "Under any of the following conditions, do not turn on the seat heating: The seat is used by a person with low pain or temperature perception; The seat is unoccupied; There is a child seat installed on the seat; Objects cover the seat surface, such as protective covers, jackets, liners, or bags; The seat surface is damp or wet; The temperature inside the vehicle or the outside temperature is above +25 °C (+77 °F)."
model_response = "1. Avoid Using Seat Heating in Certain Conditions: - If the Seat is Wet: Do not turn on the seat heating if the seat is wet. This can cause electrical issues and potentially damage the heating elements. - For Infants and Small Children: Avoid using seat heating for infants and small children as their skin is more sensitive and they may not be able to communicate discomfort. - If You Have Certain Medical Conditions: Individuals with certain medical conditions, such as those with reduced heat sensitivity, should avoid using seat heating to prevent burns or discomfort. 2. General Safety Precautions: - Do Not Leave Unattended: Never leave the seat heating on when the vehicle is unattended. - Monitor Temperature: Always monitor the temperature and turn off the seat heating if it becomes too warm."

In [23]:
# ROUGE-1/2/L for question 3 (replaces the previous `scores`); only 'rouge-l' is read by the summary table cell.
scores = calculate_rouge_scores(reference_text, model_response)

In [24]:
# METEOR for question 3 (replaces the previous `meteor`).
# NOTE(review): `meteor` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
meteor = calculate_meteor_score(reference_text, model_response)

In [25]:
# Word Mover's Distance for question 3, using the Word2Vec vectors held in `model`.
# NOTE(review): `distance` is not read by any later cell in the visible part of this notebook -- display it or add it to the summary.
distance = calculate_wmd_distance(reference_text, model_response, model)

In [26]:
# BERTScore for question 3: one candidate/reference pair, each wrapped in a list as the helper expects.
# P, R and F1 are read with .item() in the summary table cell.
P, R, F1 = calculate_bert_score([model_response], [reference_text])

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.35 seconds, 0.74 sentences/sec


In [27]:
# Summary table for this question: ROUGE-L and BERTScore precision / recall / F1,
# each rendered as a two-decimal string.
data = {
    "Metric": ["ROUGE-L", "BERTScore"],
    "Precision": [format(scores['rouge-l']['p'], ".2f"), format(P.item(), ".2f")],
    "Recall": [format(scores['rouge-l']['r'], ".2f"), format(R.item(), ".2f")],
    "F1": [format(scores['rouge-l']['f'], ".2f"), format(F1.item(), ".2f")],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Metric,Precision,Recall,F1
0,ROUGE-L,0.13,0.24,0.17
1,BERTScore,0.56,0.58,0.57
