In [None]:
# Directory of the fine-tuned checkpoint; `model.safetensors` is read from here
# when the weights are loaded further down.
# NOTE(review): the relative "drive/MyDrive/..." prefix presumably relies on
# Google Drive being mounted in the working directory — confirm before running.
model_save_path = "drive/MyDrive/Checkpoint/checkpoint-136"

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
import torch.nn as nn
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          DataCollatorWithPadding,
                          TrainingArguments,
                          Trainer,
                          EarlyStoppingCallback)

In [None]:
class QwenForClassification(nn.Module):
    """Causal language model backbone with a linear classification head.

    The backbone is loaded with ``AutoModelForCausalLM.from_pretrained`` and a
    single ``nn.Linear(hidden_size, num_labels)`` layer maps the pooled hidden
    state to class logits.

    Args:
        model_name: Identifier or path passed to ``from_pretrained``.
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, model_name, num_labels=2):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, a weighted loss.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.
            labels: Optional integer class ids, one per example. Any shape
                that flattens to one id per example is accepted
                (e.g. ``(batch,)`` or ``(batch, 1)``).

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (one row of ``num_labels`` scores per example).
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        last_hidden_state = outputs.hidden_states[-1]
        # Pool by taking the hidden state at the final sequence position.
        # NOTE(review): with right-padded inputs this position is a padding
        # token. Confirm it matches how the checkpoint was trained before
        # changing the pooling, since the classifier weights depend on it.
        pooled_output = last_hidden_state[:, -1, :]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Flatten once and reuse for both the class counts and the loss so
            # column-shaped labels do not break `torch.bincount` (1-D only).
            flat_labels = labels.reshape(-1)

            # Inverse-frequency weights computed from this batch alone.
            # `minlength` guarantees one weight per class even when a class is
            # absent from the batch; the epsilon avoids division by zero.
            num_classes = logits.size(-1)
            class_counts = torch.bincount(flat_labels, minlength=num_classes)
            class_weights = 1.0 / (class_counts.float() + 1e-6)
            class_weights /= class_weights.sum()  # normalise to sum to 1

            # The weight tensor must live on the logits' device and share
            # their dtype for CrossEntropyLoss.
            class_weights = class_weights.to(device=logits.device, dtype=logits.dtype)
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits, flat_labels)

        return {"loss": loss, "logits": logits}

# Build the classifier on top of the base Qwen checkpoint and place it on `device`.
model_infer = QwenForClassification("Qwen/Qwen2.5-0.5B", num_labels=2)
model_infer = model_infer.to(device)

# Give `lm_head` its own copy of the weight tensor, re-registered as a
# Parameter, so it no longer shares storage with any tied weight.
with torch.no_grad():
    untied_lm_head_weight = model_infer.model.lm_head.weight.detach().clone()
    model_infer.model.lm_head.weight = torch.nn.Parameter(untied_lm_head_weight)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer
from safetensors.torch import load_file


# Read the fine-tuned weights (safetensors format) directly onto `device`.
checkpoint_file = f"{model_save_path}/model.safetensors"
state_dict = load_file(checkpoint_file, device=device)

# Copy the checkpoint tensors into the freshly built model.
model_infer.load_state_dict(state_dict)

# Ensure placement on `device` and switch to inference mode; as the cell's
# last expression this also displays the module structure.
model_infer.to(device).eval()

QwenForClassification(
  (model): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 896)
      (layers): ModuleList(
        (0-23): 24 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear(in_features=896, out_features=896, bias=True)
            (k_proj): Linear(in_features=896, out_features=128, bias=True)
            (v_proj): Linear(in_features=896, out_features=128, bias=True)
            (o_proj): Linear(in_features=896, out_features=896, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
            (up_proj): Linear(in_features=896, out_features=4864, bias=False)
            (down_proj): Linear(in_features=4864, out_features=896, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        )
      )
   

In [None]:
#import pandas as pd

#validation = pd.read_csv("Merged.csv")
#validation

In [None]:
!pip install datasets

from datasets import load_dataset

dataset = load_dataset(
    "csv",
    data_files={
        "validation": "Merged.csv"
    },
    split="validation"
)

print(dataset["Text"][0])



Generating validation split: 0 examples [00:00, ? examples/s]

Grâce à ce partenariat, Slood est en mesure de proposer un service de retouche, de réparation et d’upcycling à domicile et par envoi postal à ses clients.


In [None]:
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, recall_score, f1_score
import numpy as np

# Tokenizer of the base checkpoint the classifier was built from.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")


def tokenize_function(examples):
    """Tokenize the "Text" column, padding/truncating every row to 512 tokens."""
    return tokenizer(
        examples["Text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Add the tokenizer outputs as new columns of the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Materialise the model inputs and ground-truth labels as tensors on `device`.
input_ids = torch.tensor(tokenized_dataset["input_ids"]).to(device)
attention_mask = torch.tensor(tokenized_dataset["attention_mask"]).to(device)
labels = torch.tensor(tokenized_dataset["Label"]).to(device)

# Run the classifier over consecutive slices of 8 rows without tracking gradients.
with torch.no_grad():
    logits = []
    for start in range(0, len(input_ids), 8):
        rows = slice(start, start + 8)
        logits.append(model_infer(input_ids[rows], attention_mask[rows])["logits"])

# Stitch the per-batch logits together and take the highest-scoring class.
logits = torch.cat(logits)
predictions = logits.argmax(dim=-1).cpu().numpy()
true_labels = labels.cpu().numpy()

# Score the predictions against the ground truth.
accuracy = accuracy_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

Accuracy: 0.7445
Recall: 0.5628
F1 Score: 0.6364


In [None]:
# Replace `dataset` with the translated test set, loaded as one named split.
dataset = load_dataset(
    "csv",
    data_files={"validation_translated": "translated_claims_test.csv"},
    split="validation_translated",
)

# Peek at the first translated sample.
print(dataset["translated_text"][0])


# Tokenizer of the base checkpoint the classifier was built from.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")


def tokenize_function(examples):
    """Tokenize the "translated_text" column, padding/truncating each row to 512 tokens."""
    encoded = tokenizer(examples["translated_text"], padding="max_length", truncation=True, max_length=512)
    return encoded


# Add the tokenizer outputs as new columns of the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Build the evaluation tensors directly on `device`.
input_ids = torch.tensor(tokenized_dataset["input_ids"], device=device)
attention_mask = torch.tensor(tokenized_dataset["attention_mask"], device=device)
labels = torch.tensor(tokenized_dataset["label"], device=device)  # ground truth

# Gradient-free forward passes over consecutive windows of 8 rows.
batch_starts = range(0, len(input_ids), 8)
with torch.no_grad():
    logits = [
        model_infer(input_ids[lo:lo + 8], attention_mask[lo:lo + 8])["logits"]
        for lo in batch_starts
    ]

# Merge the batches, then pick the arg-max class for every row.
logits = torch.cat(logits)
predictions = logits.argmax(dim=-1).cpu().numpy()
true_labels = labels.cpu().numpy()

# Same three metrics as reported for the other split.
accuracy = accuracy_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Generating validation_translated split: 0 examples [00:00, ? examples/s]

Une population totale de 6148 personnes bénéficie d'une eau potable sûre grâce à cette initiative.


Map:   0%|          | 0/265 [00:00<?, ? examples/s]

Accuracy: 0.9057
Recall: 0.8358
F1 Score: 0.8175


In [None]:
# Show the first row of the most recently loaded `dataset` as a dict of
# column name -> value, to check which columns are available.
print(dataset[0])

{'text': 'A total population of 6148 is getting the benefit of safe potable drinking water due to this initiative.', 'translated_text': "Une population totale de 6148 personnes bénéficie d'une eau potable sûre grâce à cette initiative.", 'label': 1}
