https://huggingface.co/ariannap22/collectiveaction_roberta_simplified_synthetic_weights

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set device to CPU or GPU
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model_name = "ariannap22/collectiveaction_roberta_simplified_synthetic_weights"
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define the text you want to predict
texts = [
    "We need to stand together for our rights!",
    "I volunteer at the local food bank.",
    "This is just a casual discussion."
]

# Tokenize the input text
inputs = tokenizer(
    texts,
    padding=True,  # Pad to the longest sequence in the batch
    truncation=True,  # Truncate sequences longer than the model's max length
    max_length=512,  # Adjust max length as needed
    return_tensors="pt"  # Return PyTorch tensors
).to(device)

# Perform prediction
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # Raw model outputs before softmax

# Convert logits to probabilities (optional)
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted class indices
predicted_class_indices = torch.argmax(probs, dim=-1)

# Print results
for text, idx, prob in zip(texts, predicted_class_indices, probs):
    print(f"Text: {text}")
    print(f"Predicted Class Index: {idx.item()}")
    print(f"Probabilities: {prob.tolist()}")
    print("---")

Text: We need to stand together for our rights!
Predicted Class Index: 1
Probabilities: [0.0001251356879947707, 0.9998748302459717]
---
Text: I volunteer at the local food bank.
Predicted Class Index: 1
Probabilities: [4.350928065832704e-05, 0.999956488609314]
---
Text: This is just a casual discussion.
Predicted Class Index: 1
Probabilities: [5.4901571274967864e-05, 0.9999450445175171]
---


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "ariannap22/collectiveaction_roberta_simplified_synthetic_weights"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading tokenizer or model: {e}")


Error loading tokenizer or model: data did not match any variant of untagged enum ModelWrapper at line 250356 column 3


https://huggingface.co/ariannap22/collectiveaction_roberta_synthetic_weights_layered

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set device to CPU or GPU
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model_name = "ariannap22/collectiveaction_roberta_synthetic_weights_layered"
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the text you want to predict
texts = [
    "We need to stand together for our rights!",
    "I volunteer at the local food bank."
]

# Tokenize the input text
inputs = tokenizer(
    texts,
    padding=True,  # Pad to the longest sequence in the batch
    truncation=True,  # Truncate sequences longer than the model's max length
    max_length=512,  # Adjust max length as needed
    return_tensors="pt"  # Return PyTorch tensors
).to(device)

# Perform prediction
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # Raw model outputs before softmax

# Convert logits to probabilities (optional)
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted class indices
predicted_class_indices = torch.argmax(probs, dim=-1)

# Print results
for text, idx, prob in zip(texts, predicted_class_indices, probs):
    print(f"Text: {text}")
    print(f"Predicted Class Index: {idx.item()}")
    print(f"Probabilities: {prob.tolist()}")
    print("---")

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

Exception: data did not match any variant of untagged enum ModelWrapper at line 250356 column 3

https://huggingface.co/ariannap22/collectiveaction_sft_annotated_only_v6_prompt_v6_p100_synthetic_balanced_more_layered

In [3]:
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          pipeline)

model_dir = "ariannap22/collectiveaction_sft_annotated_only_v6_prompt_v6_p100_synthetic_balanced_more_layered"

# Define the text you want to predict
texts = [
    "We need to stand together for our rights!",
    "I volunteer at the local food bank."
]

# Define levels of participation in collective action¨
dim_def = {'Problem-Solution': "The comment highlights an issue and possibly suggests a way to fix it, often naming those responsible.",
            'Call-to-Action': "The comment asks readers to take part in a specific activity, effort, or movement.",
            'Intention': "The commenter shares their own desire to do something or be involved in solving a particular issue.",
            'Execution': "The commenter is describing their personal experience taking direct actions towards a common goal."}

# Define the prompt
def generate_test_prompt6(data_point):
    return f"""
            You have the following knowledge about levels of participation in collective action that can be expressed in social media comments: {dim_def}. 
            
            ### Definitions and Criteria:
            **Collective Action Problem:** A present issue caused by human actions or decisions that affects a group and can be addressed through individual or collective efforts.

            **Participation in collective action**: A comment must clearly reference a collective action problem, social movement, or activism by meeting at least one of the levels in the list {dim_def.keys()}.

            Classify the following social media comment into one of the levels within the list {list(dim_def.keys())}. 

            ### Example of correct output format:
            text: xyz
            label: None
            
            Return the answer as the corresponding participation in collective action level label.

            text: {data_point}
            label: """.strip()

texts_prompts = [generate_test_prompt6(text) for text in texts]

# Prepare datasets and load model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_dir)

tokenizer.pad_token_id = tokenizer.eos_token_id

# Define prediction 
def predict(texts, model, tokenizer):
    y_pred = []
    answers = []
    categories = list(dim_def.keys())

    for i in range(len(texts)):
        prompt = texts[i]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=20, 
                        temperature=0.1)
        
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()
        answers.append(answer)
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("error")
    
    return y_pred, answers

y_pred, answer = predict(texts_prompts, model, tokenizer)

# Print results
for text, pred in zip(texts, y_pred):
    print(f"Text: {text}")
    print(f"Predicted Class: {pred}")
    print("---")

PackageNotFoundError: No package metadata was found for bitsandbytes