Load the smallest qwen2.5 with Huggingface transformers with mod=AutoModel.from_pretrained(…)

In [1]:
!pip install --upgrade transformers datasets accelerate peft safetensors



In [2]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from datasets import load_dataset

dataset = load_dataset(
    "csv",
    data_files={
        "train": "/content/drive/MyDrive/greenwashing/training/masked_training_dataset.csv",
        "validation": "/content/drive/MyDrive/greenwashing/training/translated_claims_validation.csv"
    }
)

assert "translated_text" in dataset["train"].column_names, "Column 'masked_text' missing in training data!"  # CHANGED
assert "label" in dataset["train"].column_names, "Column 'label' missing in training data!"  # CHANGED


print(dataset['train'][0])
print(dataset['validation'][0])

{'text': 'The project will make a significant contribution to the German and European hydrogen strategy and hence to achievement of the climate targets.', 'translated_text': "Le projet contribuera de manière significative à la stratégie allemande et européenne en matière d'hydrogène et, partant, à la réalisation des objectifs climatiques.", 'extracted_companies': '[]', 'masked_text': "Le projet contribuera de manière significative à la stratégie allemande et européenne en matière d'hydrogène et, partant, à la réalisation des objectifs climatiques.", 'label': 1}
{'text': 'During 2019, Ready to Manufacture has been merged with our environmental standard Green to Wear.', 'translated_text': "Au cours de l'année 2019, Prêt à fabriquer a été fusionné avec notre norme environnementale Vert à porter.", 'extracted_companies': None, 'masked_text': None, 'label': 1}


In [6]:
# Get the 'label' column from the training dataset
train_labels = dataset["train"]["label"]

# Determine the unique classes
unique_classes = set(train_labels)
num_classes = len(unique_classes)

print(f"Unique classes: {unique_classes}")
print(f"Number of classes: {num_classes}")

Unique classes: {0, 1}
Number of classes: 2


In [7]:
max_length = 512

# Define preprocessing function
def preprocess_function(examples):
    # Extract the 'translated_text' column
    texts = examples["translated_text"]  # Extract list of texts (batched=True ensures it's a list)
    labels = examples["label"]       # Extract list of labels (should match batch size)

    # Ensure 'texts' is a list of strings
    texts = [str(text) for text in texts]

    # Tokenize the input text
    encoding = tokenizer(
        texts,                       # Tokenizer expects a list of strings
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )
    # Add labels to the tokenized output
    encoding["labels"] = labels
    return encoding


# Apply the preprocessing function to the dataset
processed_dataset = dataset.map(preprocess_function, batched=True)

# Split processed dataset into train and validation
train_dataset = processed_dataset["train"]
validation_dataset = processed_dataset["validation"]

# Check the processed datasets
print(train_dataset[0])  # Check first processed training example
print(validation_dataset[0])  # Check first processed

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

{'text': 'The project will make a significant contribution to the German and European hydrogen strategy and hence to achievement of the climate targets.', 'translated_text': "Le projet contribuera de manière significative à la stratégie allemande et européenne en matière d'hydrogène et, partant, à la réalisation des objectifs climatiques.", 'extracted_companies': '[]', 'masked_text': "Le projet contribuera de manière significative à la stratégie allemande et européenne en matière d'hydrogène et, partant, à la réalisation des objectifs climatiques.", 'label': 1, 'input_ids': [2304, 45394, 5919, 84, 2416, 409, 84622, 4595, 1388, 3784, 1187, 142682, 12304, 1928, 68, 1842, 140927, 662, 135725, 294, 6, 66129, 70, 46999, 1842, 11, 949, 517, 11, 3784, 1187, 58057, 7923, 939, 1633, 21835, 11076, 266, 8303, 13, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 15

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import torch
import torch.nn as nn
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          DataCollatorWithPadding,
                          TrainingArguments,
                          Trainer,
                          EarlyStoppingCallback)

In [10]:
class QwenForClassification(nn.Module):
    def __init__(self, model_name, num_labels=2):
        super().__init__()
        base_model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model = base_model
        hidden_size = self.model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        last_hidden_state = outputs.hidden_states[-1]
        pooled_output = last_hidden_state[:, -1, :]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Debug: Print labels and their device
            # print(f"Labels: {labels}")  # Check labels tensor
            # print(f"Labels Device: {labels.device}")  # Ensure labels are on the same device as inputs

            # Debug: Print unique labels in the batch
            unique_labels = torch.unique(labels)
            # print(f"Unique Labels in Batch: {unique_labels}")

            # Ensure weights for all classes (even if some are missing in the batch)
            num_classes = logits.size(-1)
            class_counts = torch.bincount(labels, minlength=num_classes)
            # print(f"Class Counts in Batch: {class_counts}")  # Debug class counts
            class_weights = 1.0 / (class_counts.float() + 1e-6)
            class_weights /= class_weights.sum()  # Normalize
            # print(f"Class Weights: {class_weights}")  # Debug weights

            # Apply weighted CrossEntropyLoss
            class_weights = class_weights.to(input_ids.device)
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits, labels.view(-1))

        return {"loss": loss, "logits": logits}

model_infer = QwenForClassification(
    "Qwen/Qwen2.5-0.5B",
    num_labels=2,
).to(device)
# Detach tied weights properly and reassign as a Parameter
with torch.no_grad():
    model_infer.model.lm_head.weight = torch.nn.Parameter(
        model_infer.model.lm_head.weight.clone().detach()
    )

In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [12]:
# Improved metric computation
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

In [13]:
# Add early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)  # Stop if no improvement for 2 epochs

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/greenwashing/qwen_french_greenwashing",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=4,  # Adjust based on memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,  # Simulate larger batch size
    learning_rate=2e-5,  # Smaller learning rate for fine-tuning
    fp16=True,  # Mixed precision for faster training
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="/content/qwen_logs",
    save_safetensors=True
)


trainer = Trainer(
    model=model_infer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add early stopping
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.613016,0.656604
2,1.737800,0.276957,0.890566
3,0.165900,0.368584,0.883019
4,0.165900,0.56922,0.879245


TrainOutput(global_step=136, training_loss=0.7032404993386829, metrics={'train_runtime': 1253.9442, 'train_samples_per_second': 8.441, 'train_steps_per_second': 0.132, 'total_flos': 0.0, 'train_loss': 0.7032404993386829, 'epoch': 4.0})

In [18]:
model_save_path = "/content/drive/MyDrive/greenwashing/qwen_french_greenwashing"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/tokenizer_config.json',
 '/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/special_tokens_map.json',
 '/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/vocab.json',
 '/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/merges.txt',
 '/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/added_tokens.json',
 '/content/drive/MyDrive/greenwashing/qwen_french_greenwashing/tokenizer.json')

In [19]:

from transformers import AutoTokenizer
from safetensors.torch import load_file
import torch


# Load model from safetensors format
state_dict = load_file(f"{model_save_path}/model.safetensors", device=device)  # CHANGED: Load safetensors file
model_infer.load_state_dict(state_dict)
model_infer.to(device)
model_infer.eval()

# Prediction
# Updated predict function
def predict(text):
    if not text.strip():
        return "Invalid input"

    # Tokenize input and move to the correct device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to device

    with torch.no_grad():
        outputs = model_infer(**inputs)  # Model inference
        logits = outputs["logits"]

    preds = torch.argmax(logits, dim=-1)
    return preds.item()

# Example prediction
test_text = "Cette entreprise affirme utiliser des matériaux écologiques, mais ne donne aucune preuve."
prediction = predict(test_text)
if prediction == 1:
    print("It might be greenwashing!")
else:
    print("It might not be greenwashing.")


It might be greenwashing!
