https://github.com/QwenLM/Qwen2.5?tab=readme-ov-file

https://qwenlm.github.io/blog/qwen2.5/


In [None]:
# Load model directly
!pip install transformers
!pip install accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

# torch provides the device handling below; it must be imported before use.
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model.
# AutoModelForCausalLM attaches the language-modelling head that `generate()`
# requires; the bare AutoModel backbone has no such head and cannot emit text.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B").to(device)  # Move model to the selected device

In [None]:
import pandas as pd

# Relative parquet path of each split inside the dataset repository.
splits = {
    'train': 'data/train-00000-of-00001-98aa5228a06a17d0.parquet',
    'validation': 'data/validation-00000-of-00001-2553e47d408fab28.parquet',
    'test': 'data/test-00000-of-00001-79fd931297fff765.parquet'
}

# Read the validation shard through the hf:// filesystem URL.
dataset_root = "hf://datasets/climatebert/environmental_claims/"
df = pd.read_parquet(dataset_root + splits["validation"])

def translate_text(text, max_new_tokens=256):
    """Translate ``text`` to French by prompting the module-level ``model``.

    Args:
        text: Source string; it is appended to the fixed
            "Translate to French: " prompt.
        max_new_tokens: Upper bound on the number of generated tokens, set
            explicitly so the output length does not depend on whatever the
            checkpoint's generation config happens to specify.

    Returns:
        The decoded continuation only, with special tokens skipped and
        surrounding whitespace stripped.
    """
    inputs = tokenizer("Translate to French: " + text, return_tensors="pt").to(device)  # Move input tensors to the model's device
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # A causal LM returns prompt + continuation in one sequence; drop the prompt
    # tokens so the instruction and source text are not part of the result.
    prompt_length = inputs["input_ids"].shape[1]
    continuation = outputs[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

In [None]:
# Show the first rows of the loaded split as the cell output.
df.head()

In [None]:
# Apply translation on dataset: the source sentences are read from the 'text' column.
df['translated_text'] = df['text'].apply(translate_text)

# Save the source and translated claims side by side.
# The selection must name the real source column ('text'); selecting a column
# that is not in the frame raises a KeyError.
df[['text', 'translated_text']].to_csv('translated_claims.csv', index=False)

print("Translated claims saved to 'translated_claims.csv'")

Dataset analysis

In [None]:
!pip install nltk
!pip install pyarrow  # Required if reading Parquet files

In [None]:
import pandas as pd
import nltk

# Fetch the NLTK tokenizer resources (only needs to happen once per environment).
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Relative parquet path of each split inside the dataset repository.
splits = {
    'train': 'data/train-00000-of-00001-98aa5228a06a17d0.parquet',
    'validation': 'data/validation-00000-of-00001-2553e47d408fab28.parquet',
    'test': 'data/test-00000-of-00001-79fd931297fff765.parquet'
}

# Split to analyse; any key of `splits` is valid.
split_to_analyze = 'train'

# Read the chosen shard through the hf:// filesystem URL.
dataset_root = "hf://datasets/climatebert/environmental_claims/"
df = pd.read_parquet(dataset_root + splits[split_to_analyze])

In [None]:
# Column that holds the sentences to analyse.
text_column = 'text'

# Fail fast when the loaded frame does not expose that column.
column_exists = text_column in df.columns
if not column_exists:
    raise ValueError(f"The specified text column '{text_column}' does not exist in the dataset.")


In [None]:
# Function to count sentences and tokens in a text
def analyze_text(text):
    """Return ``(sentence_count, token_count)`` for ``text``.

    Sentences are split with ``nltk.sent_tokenize`` and tokens with
    ``nltk.word_tokenize``; only the lengths of the results are returned.
    """
    sentence_count = len(nltk.sent_tokenize(text))
    token_count = len(nltk.word_tokenize(text))
    return sentence_count, token_count

# One (sentence_count, token_count) pair per row, expanded into a two-column frame.
row_counts = df[text_column].apply(
    lambda text: pd.Series(analyze_text(text))
)
df[['sentence_count', 'token_count']] = row_counts

# Totals over the whole split.
total_sentences = df['sentence_count'].sum()
total_tokens = df['token_count'].sum()

print(f"Total number of sentences in the '{split_to_analyze}' dataset: {total_sentences}")
print(f"Total number of tokens in the '{split_to_analyze}' dataset: {total_tokens}")

Dataset Translation DeepL

In [None]:
!pip install deepl
!pip install pyarrow  # If not already installed
!pip install tqdm

In [None]:
import pandas as pd
import deepl
import os
from tqdm import tqdm

# Only install the empty placeholder when the variable is not already set, so
# a key supplied through the environment is never overwritten.
os.environ.setdefault('DEEPL_AUTH_KEY', '')

auth_key = os.getenv('DEEPL_AUTH_KEY')

# Check if the API key is available: an empty string is as unusable as a
# missing variable, so both are rejected here with a clear message.
if not auth_key:
    raise ValueError("Please set the DEEPL_AUTH_KEY environment variable.")

translator = deepl.Translator(auth_key)

In [None]:
# Display the translator object as the cell output to confirm it was created.
translator

In [None]:
# Relative parquet path of each split inside the dataset repository.
splits = {
    'train': 'data/train-00000-of-00001-98aa5228a06a17d0.parquet',
    'validation': 'data/validation-00000-of-00001-2553e47d408fab28.parquet',
    'test': 'data/test-00000-of-00001-79fd931297fff765.parquet'
}

# Split that will be translated; any key of `splits` is valid.
split_to_translate = 'validation'

# Read the chosen shard through the hf:// filesystem URL.
dataset_root = "hf://datasets/climatebert/environmental_claims/"
df = pd.read_parquet(dataset_root + splits[split_to_translate])

# Column that holds the sentences to translate.
text_column = 'text'

# Fail fast when the loaded frame does not expose that column.
column_exists = text_column in df.columns
if not column_exists:
    raise ValueError(f"The specified text column '{text_column}' does not exist in the dataset.")

In [None]:
# Display the loaded split as the cell output for a visual check.
df

In [None]:
# Persist the untranslated split. The file name follows the selected split so
# a different `split_to_translate` does not overwrite or mislabel the file
# (for 'validation' this is 'original_validation_dataset.csv').
df.to_csv(f'original_{split_to_translate}_dataset.csv', index=False)

# **Copy Columns**

In [None]:
# copy column
import pandas as pd

# File paths: the first CSV receives the 'label' column, the second provides it.
masked_training_file = "/content/translated_claims_validation.csv"
output_file_final = "/content/original_validation_dataset.csv"

# Load datasets
masked_training_df = pd.read_csv(masked_training_file)
output_file_final_df = pd.read_csv(output_file_final)

if 'label' not in output_file_final_df.columns:
    print("'label' column not found in the output file.")
elif len(masked_training_df) != len(output_file_final_df):
    # Labels are matched to rows by index; with differing row counts the
    # assignment would silently yield NaN or misaligned labels.
    raise ValueError(
        f"Row count mismatch: {masked_training_file} has {len(masked_training_df)} rows "
        f"but {output_file_final} has {len(output_file_final_df)}; "
        "the 'label' column cannot be copied row by row."
    )
else:
    # Copy the 'label' column from the source file into the translated dataset
    masked_training_df['label'] = output_file_final_df['label']
    # Save the updated dataset back to the same path
    masked_training_df.to_csv(masked_training_file, index=False)
    print(f"'label' column copied and saved to {masked_training_file}.")

In [None]:
def translate_texts(texts):
    """Translate a batch of texts to French with the module-level ``translator``.

    Returns a list with the ``.text`` of every result the DeepL client gives
    back. If the client raises ``deepl.DeepLException`` the error is printed
    and a list of ``None`` with the same length as ``texts`` is returned.
    """
    try:
        # One request for the whole batch.
        results = translator.translate_text(texts, target_lang='FR')
        translations = []
        for translation in results:
            translations.append(translation.text)
        return translations
    except deepl.DeepLException as e:
        print(f"Error translating batch: {e}")
        return [None] * len(texts)

In [None]:
# Number of texts sent to the API per request.
batch_size = 50

# Materialise the source texts once; batches are plain list slices of it.
total_texts = len(df)
source_texts = df[text_column].tolist()

# Translations accumulate here in the same order as the rows.
translated_texts = []

# tqdm renders one progress tick per batch.
for batch_start in tqdm(range(0, total_texts, batch_size), desc="Translating", unit="batch"):
    # List slicing clips at the end, so the last batch may be shorter.
    translated_texts += translate_texts(source_texts[batch_start:batch_start + batch_size])

# Attach the translations as a new column.
df['translated_text'] = translated_texts

# Write the source/translation pairs to a CSV named after the split.
output_file = f'translated_claims_{split_to_translate}.csv'
columns_to_export = [text_column, 'translated_text']
df[columns_to_export].to_csv(output_file, index=False)

print(f"Translated claims saved to '{output_file}'")

Save Original Dataset

In [None]:
# Columns written to the output file, in this order.
columns_to_save = ['text', 'label', 'translated_text']

# Select those columns first, then write them without the index.
df[columns_to_save].to_csv('output_file.csv', index=False)

print("Data saved to 'output_file.csv' with columns:", columns_to_save)


Validation

English model should classify French claims.

Model: https://huggingface.co/climatebert/environmental-claims

Dataset: https://huggingface.co/datasets/climatebert/environmental_claims


In [None]:
!pip install transformers pandas torch scikit-learn

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# CSV containing the translated claims and their labels.
translated_data_file = 'output_file.csv'
df = pd.read_csv(translated_data_file)

# Names of the columns the evaluation reads.
translated_column = 'translated_text'  # French claims
label_column = 'label'  # Labels carried over from the original dataset

# Both columns are required; stop early if either one is missing.
required_columns = {translated_column, label_column}
if not required_columns.issubset(df.columns):
    raise ValueError("Ensure the translated and label columns are present in the DataFrame.")

In [None]:
# Display the loaded translated data as the cell output for a visual check.
df

In [None]:
# Checkpoint used for both the tokenizer and the classification model.
model_name = "climatebert/environmental-claims"

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the model to the selected device (the call's return value is the cell output).
model.to(device)

In [None]:
# Tokenize the French claims
def tokenize_claims(claims):
    """Encode ``claims`` with the module-level tokenizer.

    Truncation and padding are enabled and the result is returned as
    PyTorch tensors.
    """
    encoding_options = dict(truncation=True, padding=True, return_tensors='pt')
    return tokenizer(claims, **encoding_options)

# Classify the claims
def classify_claims(translated_claims):
    """Predict a class index for every claim in ``translated_claims``.

    The module-level ``model`` is switched to eval mode and run on one claim
    at a time without gradient tracking. The returned list holds the argmax
    over the logits for each claim, in input order.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for claim in translated_claims:
            # Encode a single-claim batch and move it to the model's device.
            encoded = tokenize_claims([claim]).to(device)
            logits = model(**encoded).logits
            # Highest-scoring class for this claim, brought back to the CPU.
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

    return predictions

# Classify every French claim and keep the predictions next to the inputs.
translated_claims = df[translated_column].tolist()
predicted_labels = classify_claims(translated_claims)
df['predicted_label'] = predicted_labels

# Reference labels the predictions are scored against.
true_labels = df[label_column].tolist()

# Overall accuracy.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

# Weighted-average precision, recall and F1; the fourth return value is not used.
weighted_scores = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
precision, recall, f1, _ = weighted_scores
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Per-class breakdown.
print("\nClassification Report:")
report = classification_report(true_labels, predicted_labels, target_names=['0', '1'])
print(report)

# Persist the frame, now including the predicted labels.
output_file = 'validated_translations_with_metrics.csv'
df.to_csv(output_file, index=False)

print(f"Validated translations saved to '{output_file}'")

The distribution of claims in the original dataset:

Class 0: 1585 samples

Class 1: 532 samples

Results of applying the English ClimateBERT model to the claims translated into French (in short: it detects class 0 claims better because the dataset is imbalanced):

Accuracy: 0.7520

Precision: 0.7474

Recall: 0.7520

F1-Score: 0.6522


Classification Report:

              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1585
           1       0.73      0.02      0.04       532

    accuracy                           0.75      2117
   macro avg       0.74      0.51      0.45      2117
weighted avg       0.75      0.75      0.65      2117


Observations
Class 0 (Majority Class):

Precision: 0.75 means that 75% of predictions for class 0 are correct.
Recall: 1.00 means the model correctly identifies all instances of class 0.
F1-Score: 0.86 indicates excellent performance for class 0.
Conclusion: The model performs well for the majority class.
Class 1 (Minority Class):

Precision: 0.73 means that when the model predicts class 1, it is correct 73% of the time.
Recall: 0.02 is extremely low, indicating that the model identifies only 2% of actual class 1 instances.
F1-Score: 0.04 reflects poor performance in balancing precision and recall for class 1.
Conclusion: The model struggles significantly to detect class 1 instances.

Key Issues
Class Imbalance:

There is a significant imbalance in the data:
Class 0: 1585 instances (75%)
Class 1: 532 instances (25%)
The model heavily favors the majority class (class 0), which explains the high recall for class 0 but very low recall for class 1.
Low Recall for Class 1:

The recall for class 1 is only 0.02, meaning the model fails to identify most actual instances of class 1.
This is problematic, especially if class 1 is critical for your application (e.g., detecting environmental claims).
F1-Score Disparity:

The F1-score for class 1 (0.04) is much lower than for class 0 (0.86), highlighting the model's inability to balance precision and recall for the minority class.

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from transformers import AutoTokenizer
import pandas as pd

# Tokenizer of the ClimateBERT checkpoint; used below to split text into tokens.
model_name = "climatebert/environmental-claims"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# CSV holding the original sentences and their translations.
translated_data_file = 'output_file_final.csv'
df = pd.read_csv(translated_data_file)

# Columns compared by the BLEU computation.
reference_column = 'text'  # Original English sentences
candidate_column = 'translated_text'  # Translated French sentences

# Both columns are required; stop early if either one is missing.
required_columns = {reference_column, candidate_column}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Ensure '{reference_column}' and '{candidate_column}' columns exist in the DataFrame.")

# Function to preprocess and tokenize text
def preprocess_and_tokenize(text, lang):
    """Lower-case and strip ``text``, then split it with the module-level tokenizer.

    ``lang`` is accepted but not used by the body. Returns the token list
    produced by ``tokenizer.tokenize``.
    """
    normalized = text.lower().strip()
    return tokenizer.tokenize(normalized)

# corpus_bleu expects a list of reference token lists per sentence, so each
# tokenized reference is wrapped in its own one-element list.
references = []
for reference_text in df[reference_column]:
    references.append([preprocess_and_tokenize(reference_text, 'en')])

# One token list per candidate sentence.
candidates = []
for candidate_text in df[candidate_column]:
    candidates.append(preprocess_and_tokenize(candidate_text, 'fr'))

# Corpus-level BLEU using NLTK's smoothing method 4.
smooth_fn = SmoothingFunction().method4
bleu_score = corpus_bleu(references, candidates, smoothing_function=smooth_fn)

# Print the BLEU score
print(f"Corpus BLEU Score: {bleu_score:.4f}")

In [None]:
# Display the DataFrame used for the BLEU computation as the cell output.
df

## Translate Test Dataset

In [None]:
import pandas as pd
import deepl
import os
from tqdm import tqdm

# Only install the empty placeholder when the variable is not already set, so
# a key supplied through the environment is never overwritten.
os.environ.setdefault('DEEPL_AUTH_KEY', '')

# Get authentication key from environment variable. An empty string is as
# unusable as a missing variable, so both are rejected with a clear message.
auth_key = os.getenv('DEEPL_AUTH_KEY')
if not auth_key:
    raise ValueError("Please set the DEEPL_AUTH_KEY environment variable.")

# Initialize DeepL translator
translator = deepl.Translator(auth_key)

# Relative parquet path of each split inside the dataset repository.
splits = {
    'train': 'data/train-00000-of-00001-98aa5228a06a17d0.parquet',
    'validation': 'data/validation-00000-of-00001-2553e47d408fab28.parquet',
    'test': 'data/test-00000-of-00001-79fd931297fff765.parquet'
}

# Read the test shard through the hf:// filesystem URL.
test_split = 'test'
dataset_root = "hf://datasets/climatebert/environmental_claims/"
df = pd.read_parquet(dataset_root + splits[test_split])

# Columns this section reads and writes.
text_column = 'text'
label_column = 'label'

# The text column is mandatory; a missing label column is filled with None.
has_text_column = text_column in df.columns
if not has_text_column:
    raise ValueError(f"The specified text column '{text_column}' does not exist in the dataset.")
has_label_column = label_column in df.columns
if not has_label_column:
    df[label_column] = None

# Function to translate text in batches
def translate_texts(texts):
    """Translate a batch of texts to French with the module-level ``translator``.

    Returns a list with the ``.text`` of every result the DeepL client gives
    back. If the client raises ``deepl.DeepLException`` the error is printed
    and a list of ``None`` with the same length as ``texts`` is returned.

    ``None`` marks a failed translation as missing data. A placeholder string
    would be indistinguishable from a real translation, would be written to
    the output file as if it were one, and would not be seen by null checks
    on the translated column.
    """
    try:
        results = translator.translate_text(texts, target_lang='FR')
        return [result.text for result in results]
    except deepl.DeepLException as e:
        print(f"Error translating batch: {e}")
        return [None] * len(texts)

# Number of texts sent to the API per request.
batch_size = 50

# Materialise the source texts once; batches are plain list slices of it.
total_texts = len(df)
source_texts = df[text_column].tolist()
translated_texts = []

# tqdm renders one progress tick per batch.
for batch_start in tqdm(range(0, total_texts, batch_size), desc="Translating", unit="batch"):
    # List slicing clips at the end, so the last batch may be shorter.
    translated_texts += translate_texts(source_texts[batch_start:batch_start + batch_size])

# Attach the translations as a new column.
df['translated_text'] = translated_texts

# Abort only when every entry of the translation column is null.
nothing_translated = df['translated_text'].isnull().all()
if nothing_translated:
    raise ValueError("Translation failed. Check DeepL API settings and retry.")

# Write text, translation and label to a CSV named after the split.
output_file = f'translated_claims_{test_split}.csv'
columns_to_export = [text_column, 'translated_text', label_column]
df[columns_to_export].to_csv(output_file, index=False)

print(f"Translated claims saved to '{output_file}'")