In [3]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


dataset = load_dataset('persiannlp/parsinlu_sentiment')
train_data = dataset['train']
val_data = dataset['test_food']

# Convert labels from string to integer and then map them to 0 or 1
def label_to_binary(label):
    return 0 if int(label) <= 0 else 1

train_data = train_data.map(lambda example: {'review': example['review'], 'label': label_to_binary(example['label'])})
val_data = val_data.map(lambda example: {'review': example['review'], 'label': label_to_binary(example['label'])})

# Load tokenizer and model
model_name = "HooshvareLab/bert-fa-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary classification (0, 1)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a maximum sequence length
max_length = 128

# Define preprocessing function
def preprocess_function(examples):
    return tokenizer(examples['review'], padding='max_length', truncation=True, max_length=max_length)

# Tokenize the datasets
train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)

# Remove columns that are not inputs for the model
train_data = train_data.remove_columns(["review"])
val_data = val_data.remove_columns(["review"])

# Set format for PyTorch
train_data.set_format("torch")
val_data.set_format("torch")

# Configure Lora for sentiment analysis
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)

# Get the Lora-enhanced model
lora_model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="content/drive/MyDrive/temp/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer for training the model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Start training
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/45.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13617 [00:00<?, ? examples/s]

Generating test_food split:   0%|          | 0/1344 [00:00<?, ? examples/s]

Generating test_movies split:   0%|          | 0/816 [00:00<?, ? examples/s]

Generating validation_food split:   0%|          | 0/1330 [00:00<?, ? examples/s]

Generating validation_movies split:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/13617 [00:00<?, ? examples/s]

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Map:   0%|          | 0/13617 [00:00<?, ? examples/s]

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.487,No log
2,0.4244,No log
3,0.4103,No log




TrainOutput(global_step=2556, training_loss=0.4321850536388224, metrics={'train_runtime': 605.297, 'train_samples_per_second': 67.489, 'train_steps_per_second': 4.223, 'total_flos': 2696339872313856.0, 'train_loss': 0.4321850536388224, 'epoch': 3.0})

In [4]:
model.save_pretrained("content/drive/MyDrive/temp/model")
tokenizer.save_pretrained("content/drive/MyDrive/temp/tokenizer")

# ذخیره مدل LoRA
lora_model.save_pretrained("content/drive/MyDrive/temp/model")



In [5]:

from google.colab import files
import shutil

# فشرده‌سازی پوشه مدل
shutil.make_archive('content/model', 'zip', 'content/drive/MyDrive/temp/model')
shutil.make_archive('content/tokenizer', 'zip', 'content/drive/MyDrive/temp/tokenizer')

# دانلود فایل‌های فشرده شده
files.download('content/model.zip')
files.download('content/tokenizer.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>