In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, MT5Tokenizer
from peft import get_peft_model, LoraConfig, TaskType
import random

# Load the ParsiNLU en-fa translation dataset by its Hub identifier.
DATASET_NAME = "persiannlp/parsinlu_translation_en_fa"
dataset = load_dataset(DATASET_NAME)

def sample_dataset(dataset, sample_size, seed=None):
    """Return a random subset of ``dataset`` containing ``sample_size`` rows.

    Row indices are drawn without replacement and handed to ``dataset.select``.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        sample_size: Number of rows to draw; must satisfy
            ``0 <= sample_size <= len(dataset)``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same rows are selected on every run. When ``None``,
            the module-level ``random`` generator is used (original behavior).

    Returns:
        Whatever ``dataset.select`` returns for the drawn indices.

    Raises:
        ValueError: If ``sample_size`` is negative or larger than the dataset.
    """
    total = len(dataset)
    if sample_size < 0 or sample_size > total:
        raise ValueError(
            f"sample_size must be between 0 and {total}, got {sample_size}"
        )

    # A dedicated generator keeps seeded sampling independent of global state.
    rng = random if seed is None else random.Random(seed)
    indices = rng.sample(range(total), sample_size)
    return dataset.select(indices)

# Number of rows drawn from each split for this run.
train_sample_size = 120000
test_sample_size = 4000

# Seed the module-level generator so the same subsets are drawn on every
# Restart & Run All; without this the sampled rows change between runs.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)

sampled_trainset = sample_dataset(dataset['train'], train_sample_size)
sampled_testset = sample_dataset(dataset['test'], test_sample_size)

# Load the pretrained checkpoint and its matching tokenizer.
# NOTE(review): the checkpoint name ends in "fa_en" while the dataset and the
# prompt used below are English -> Persian — confirm this is the intended base.
model_name = "persiannlp/mt5-base-parsinlu-opus-translation_fa_en"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# LoRA settings, collected in one place before building the config.
# target_modules selects modules named "q" and "v" (presumably the attention
# query/value projections — verify against the model's module names).
lora_settings = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters described by `config`.
peft_model = get_peft_model(model, config)

def tokenize_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Each source sentence is prefixed with the task prompt, and the first entry
    of each ``targets`` list is used as the reference translation. Inputs and
    labels are both truncated/padded to 64 tokens.

    Args:
        examples: Batch mapping with a ``'source'`` list of strings and a
            ``'targets'`` list whose items are indexable (first item is used).

    Returns:
        The tokenizer output for the sources, with an added ``'labels'`` entry
        in which padding positions are replaced by -100.
    """
    sources = ["translate English to Persian: " + ex for ex in examples['source']]
    targets = [ex[0] for ex in examples['targets']]

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager and returns the target ids under the 'labels' key.
    model_inputs = tokenizer(
        sources,
        text_target=targets,
        max_length=64,
        truncation=True,
        padding='max_length',
    )

    # Labels were padded with the pad token id; swap those positions for -100
    # so the loss ignores padding instead of training on it.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs['labels']
    ]
    return model_inputs

# Tokenize both sampled splits the same way, dropping the raw text columns.
raw_columns = ['source', 'targets']
tokenized_trainset, tokenized_testset = (
    split.map(tokenize_function, batched=True, remove_columns=list(raw_columns))
    for split in (sampled_trainset, sampled_testset)
)

# Collator used by the trainer to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


# Hyper-parameters for the Seq2SeqTrainer run.
training_args = Seq2SeqTrainingArguments(
    output_dir='./drive/MyDrive/translation/results',
    evaluation_strategy='epoch',
    logging_dir='./logs',
    # Logging happens once per epoch; `logging_steps` was dropped because it
    # is only consulted when the logging strategy is "steps".
    logging_strategy="epoch",
    save_steps=500,
    save_total_limit=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=2,
    predict_with_generate=True,
    disable_tqdm=False,
    overwrite_output_dir=True,
    # fp16 is disabled: T5-family checkpoints are known to overflow in
    # float16, which shows up as a 0.0 training loss and a missing validation
    # loss — the symptom in this notebook's recorded run. Train in full
    # precision (or bf16 on hardware that supports it).
    fp16=False,
)

# Assemble the trainer from the LoRA-wrapped model, the tokenized splits and
# the collator, then run fine-tuning.
trainer_kwargs = dict(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_testset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

trainer.train()

# Save the trained LoRA adapter and the tokenizer.
peft_model.save_pretrained('./drive/MyDrive/translation/peft_model')
tokenizer.save_pretrained('./drive/MyDrive/translation/tokenizer')

# `get_peft_model` injects the LoRA layers into `model` in place, so saving
# `model` directly would write LoRA-wrapped parameter names that a plain
# `from_pretrained` cannot map back onto the architecture. Fold the adapter
# weights into the base weights first, then save a standalone checkpoint.
# This must run after the adapter has been saved above, because merging
# removes the adapter layers from the model.
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("./drive/MyDrive/translation/model")




Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Locations written by the training cell.
tokenizer_path = './drive/MyDrive/translation/tokenizer'
model_path = './drive/MyDrive/translation/peft_model'

# Reload both artifacts for inference.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

def translate(text, tokenizer, model,
              prefix="translate English to Persian: ",
              max_length=64, num_beams=4):
    """Translate ``text`` with a seq2seq model using beam search.

    Args:
        text: Sentence to translate.
        tokenizer: Tokenizer callable that returns a mapping of tensors and
            provides ``decode``.
        model: Model exposing ``generate`` (and optionally ``device``).
        prefix: Task prompt prepended to ``text``. Defaults to the prompt used
            during fine-tuning; pass ``""`` to send the text unprefixed.
        max_length: Token limit applied to both the encoded input and the
            generated output.
        num_beams: Beam width for generation.

    Returns:
        The decoded translation with special tokens stripped.
    """
    # Calling the tokenizer (rather than `encode`) also yields the attention
    # mask, which is forwarded to `generate` together with the input ids.
    encoded = tokenizer(
        prefix + text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )

    # Keep the inputs on the same device as the model (e.g. a GPU runtime).
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    outputs = model.generate(
        **encoded,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the model on a handful of short Persian sentences and print each
# input next to the text the model produced for it.
sample_texts = [
    "امروز چطوری؟",
    'سلام',
    'چه خبر؟ خانواده خوبن؟',
    'این فیلم واقعا خوب بود.',
    'گشنمه',
]

for sentence in sample_texts:
    result = translate(sentence, tokenizer, model)
    print(f"persian: {sentence}\nEnglish: {result}\n")



persian: امروز چطوری؟
English: Translation English to Persian: How are you today?

persian: سلام
English: English to Persian: Hello

persian: چه خبر؟ خانواده خوبن؟
English: Translation English to Persian: What's the matter? Is the family all right?

persian: این فیلم واقعا خوب بود.
English: Translation English to Persian: This movie was really good.

persian: خیار سبزیجات مورد علاقه من است
English: Translation English to Persian: My favorite vegetable cucumber

persian: گشنمه
English: Translation English to Persian: I'm hungry



In [None]:
# `shutil` is part of the Python standard library, so nothing needs to be
# installed for it (the unrelated `pytest-shutil` package was removed).
import shutil
from google.colab import files

# Zip the saved directories so they can be downloaded as single files.
# NOTE(review): the training cell saves under ./drive/MyDrive/translation/...,
# while these paths point at .../MyDrive/temp/... — confirm the source folders.
shutil.make_archive('/content/model', 'zip', '/content/drive/MyDrive/temp/model')
shutil.make_archive('/content/tokenizer', 'zip', '/content/drive/MyDrive/temp/tokenizer')

# Download the compressed files
files.download('/content/model.zip')
files.download('/content/tokenizer.zip')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>