# BLEU Score

## Translated dataset

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pandas as pd

# Corpus-level BLEU between the original claims ('Text') and their translated
# counterparts ('translated_text'), using whitespace tokenisation.

# Load datasets
greenwashing_dataset_path = '/content/Dataset_greenwashing - Sheet1.csv'
translated_dataset_path = '/content/translated_claims_validation (1).csv'

greenwashing_data = pd.read_csv(greenwashing_dataset_path)
translated_data = pd.read_csv(translated_dataset_path)

# Extract necessary columns
greenwashing_texts = greenwashing_data['Text'].tolist()
translated_texts = translated_data['translated_text'].tolist()

# Align datasets based on the smallest length.
# NOTE(review): rows are paired purely by position, which assumes both files
# list the claims in the same order -- confirm against how the translated file
# was produced.
min_length = min(len(greenwashing_texts), len(translated_texts))

# Empty CSV cells are loaded by pandas as float NaN, which has no .split().
# Keep only the pairs where both sides are real strings so one blank row does
# not abort the whole evaluation.
valid_pairs = [
    (reference, candidate)
    for reference, candidate in zip(
        greenwashing_texts[:min_length], translated_texts[:min_length]
    )
    if isinstance(reference, str) and isinstance(candidate, str)
]
skipped_pairs = min_length - len(valid_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pair(s) with missing or non-text values")

# corpus_bleu expects, for every candidate, a list of tokenised references
# (here exactly one reference per candidate).
aligned_references = [[reference.split()] for reference, _ in valid_pairs]
aligned_candidates = [candidate.split() for _, candidate in valid_pairs]

# Calculate BLEU score with smoothing (method1 avoids a hard zero when some
# n-gram order has no matches)
smoothing_function = SmoothingFunction().method1
bleu_score = corpus_bleu(aligned_references, aligned_candidates, smoothing_function=smoothing_function)

print(f"BLEU Score: {bleu_score}")

BLEU Score: 0.0011136552591733703


## Masked dataset

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pandas as pd  # imported here so the cell does not depend on an earlier cell

# Corpus-level BLEU between the 'text' column and the 'masked_text' column of
# the masked dataset, using whitespace tokenisation.

# Load the masked dataset
masked_dataset_path = '/content/masked_training_dataset (3).csv'
masked_data = pd.read_csv(masked_dataset_path)

# Extract the necessary columns
original_texts = masked_data['text'].tolist()  # Original English text
masked_texts = masked_data['masked_text'].tolist()  # Masked translated text

# Ensure datasets are aligned
min_length = min(len(original_texts), len(masked_texts))

# Empty CSV cells are loaded by pandas as float NaN, which has no .split().
# Keep only the pairs where both sides are real strings so one blank row does
# not abort the whole evaluation.
valid_pairs = [
    (original, masked)
    for original, masked in zip(original_texts[:min_length], masked_texts[:min_length])
    if isinstance(original, str) and isinstance(masked, str)
]
skipped_pairs = min_length - len(valid_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pair(s) with missing or non-text values")

original_texts = [original for original, _ in valid_pairs]
masked_texts = [masked for _, masked in valid_pairs]

# Prepare the data for BLEU calculation
# BLEU expects references as a list of lists of tokens
# (one single-reference list per candidate)
references = [[text.split()] for text in original_texts]
candidates = [text.split() for text in masked_texts]

# Calculate BLEU score with smoothing (method1 avoids a hard zero when some
# n-gram order has no matches)
smoothing_function = SmoothingFunction().method1
bleu_score = corpus_bleu(references, candidates, smoothing_function=smoothing_function)

print(f"BLEU Score (Masked Dataset): {bleu_score}")

BLEU Score (Masked Dataset): 0.003533243146624169


# Semantic Similarity

The **semantic similarity** metric measures how closely the meaning of two pieces of text aligns, regardless of their syntactic or surface-level differences. It focuses on capturing the semantic content (the underlying meaning) rather than direct word-for-word matches.

Here, we use cosine similarity.

Cosine Similarity:
*   Measures the cosine of the angle between two vectors in an embedding space.
*   Range: [-1, 1] in general (negative values indicate dissimilarity); it narrows to [0, 1] only when the vectors have no negative components.
*   Values close to 1 indicate strong semantic similarity.

## Translated dataset

In [None]:
from sentence_transformers import SentenceTransformer
# cosine_similarity is no longer used in this cell; the import is kept in case
# other cells rely on it being in scope.
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Average cosine similarity between the sentence embedding of each original
# claim ('Text') and the embedding of its translated counterpart
# ('translated_text').

# Load datasets
greenwashing_dataset_path = '/content/Dataset_greenwashing - Sheet1.csv'
translated_dataset_path = '/content/translated_claims_validation (1).csv'

original_data = pd.read_csv(greenwashing_dataset_path)
translated_data = pd.read_csv(translated_dataset_path)

# Extract the relevant text columns
original_texts = original_data['Text'].tolist()
translated_texts = translated_data['translated_text'].tolist()

# Ensure the datasets are aligned.
# NOTE(review): rows are paired purely by position, which assumes both files
# list the claims in the same order -- confirm.
min_length = min(len(original_texts), len(translated_texts))

# Empty CSV cells are loaded by pandas as float NaN; keep only the pairs where
# both sides are real strings before handing them to the encoder.
valid_pairs = [
    (original, translated)
    for original, translated in zip(original_texts[:min_length], translated_texts[:min_length])
    if isinstance(original, str) and isinstance(translated, str)
]
skipped_pairs = min_length - len(valid_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pair(s) with missing or non-text values")

original_texts = [original for original, _ in valid_pairs]
translated_texts = [translated for _, translated in valid_pairs]

# Load a multilingual Sentence-BERT model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings
original_embeddings = model.encode(original_texts, convert_to_tensor=True)
translated_embeddings = model.encode(translated_texts, convert_to_tensor=True)

# Compute cosine similarity for each pair.
# Only row i of one set against row i of the other is needed, so compute the
# row-wise cosine directly instead of building the full N x N matrix and
# reading its diagonal.
original_vectors = original_embeddings.detach().cpu().numpy()
translated_vectors = translated_embeddings.detach().cpu().numpy()

dot_products = np.sum(original_vectors * translated_vectors, axis=1)
norm_products = (
    np.linalg.norm(original_vectors, axis=1)
    * np.linalg.norm(translated_vectors, axis=1)
)
# An all-zero embedding has norm 0; dividing by 1 instead yields similarity 0
# for that pair rather than NaN.
pair_similarities = dot_products / np.where(norm_products == 0, 1.0, norm_products)

# Calculate the average similarity score
average_similarity = np.mean(pair_similarities)

print(f"Average Semantic Similarity: {average_similarity}")


Average Semantic Similarity: 0.23899298906326294


## Masked dataset

In [None]:
from sentence_transformers import SentenceTransformer
# cosine_similarity is no longer used in this cell; the import is kept in case
# other cells rely on it being in scope.
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Average cosine similarity between the sentence embedding of each 'text' entry
# and the embedding of the 'masked_text' entry on the same row.

# Load the masked dataset
masked_dataset_path = '/content/masked_training_dataset (3).csv'
masked_data = pd.read_csv(masked_dataset_path)

# Extract the necessary columns
original_texts = masked_data['text'].tolist()  # Original English text
masked_texts = masked_data['masked_text'].tolist()  # Masked translated text

# Ensure datasets are aligned
min_length = min(len(original_texts), len(masked_texts))

# Empty CSV cells are loaded by pandas as float NaN; keep only the pairs where
# both sides are real strings before handing them to the encoder.
valid_pairs = [
    (original, masked)
    for original, masked in zip(original_texts[:min_length], masked_texts[:min_length])
    if isinstance(original, str) and isinstance(masked, str)
]
skipped_pairs = min_length - len(valid_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pair(s) with missing or non-text values")

original_texts = [original for original, _ in valid_pairs]
masked_texts = [masked for _, masked in valid_pairs]

# Load a multilingual Sentence-BERT model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for the original and masked texts
original_embeddings = model.encode(original_texts, convert_to_tensor=True)
masked_embeddings = model.encode(masked_texts, convert_to_tensor=True)

# Compute cosine similarity for each pair.
# Only row i of one set against row i of the other is needed, so compute the
# row-wise cosine directly instead of building the full N x N matrix and
# reading its diagonal.
original_vectors = original_embeddings.detach().cpu().numpy()
masked_vectors = masked_embeddings.detach().cpu().numpy()

dot_products = np.sum(original_vectors * masked_vectors, axis=1)
norm_products = (
    np.linalg.norm(original_vectors, axis=1)
    * np.linalg.norm(masked_vectors, axis=1)
)
# An all-zero embedding has norm 0; dividing by 1 instead yields similarity 0
# for that pair rather than NaN.
masked_pair_similarities = dot_products / np.where(norm_products == 0, 1.0, norm_products)

# Calculate the average semantic similarity
average_masked_similarity = np.mean(masked_pair_similarities)

print(f"Average Semantic Similarity (Masked Dataset): {average_masked_similarity}")


Average Semantic Similarity (Masked Dataset): 0.8803374171257019
