In [9]:
!pip install datasets




In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq

def load_data(filename, max_records=None):
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if max_records is not None and idx >= max_records:
                break
            try:
                obj = json.loads(line)
                data.append((obj['src'], obj['dst']))
            except (KeyError, json.JSONDecodeError) as e:
                print(f"–û—à–∏–±–∫–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∏ —Å—Ç—Ä–æ–∫–∏ {idx}: {e}")
    return data

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö —Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–µ–º –Ω–∞ 10,000 –∑–∞–ø–∏—Å–µ–π –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞
train_data = load_data('train.jsonl', max_records=10000)
val_data = load_data('val.jsonl')  # –í–∞–ª–∏–¥–∏—Ä—É–µ–º –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö

# –†–∞–∑–¥–µ–ª–∏–º –¥–∞–Ω–Ω—ã–µ –Ω–∞ src –∏ dst
train_src, train_dst = zip(*train_data)
val_src, val_dst = zip(*val_data)

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ (BPE)
tokenizer = AutoTokenizer.from_pretrained("t5-small")  # –ü—Ä–∏–º–µ—Ä: T5 –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
train_encodings = tokenizer(list(train_src), padding=True, truncation=True, return_tensors="pt")
train_labels = tokenizer(list(train_dst), padding=True, truncation=True, return_tensors="pt")

val_encodings = tokenizer(list(val_src), padding=True, truncation=True, return_tensors="pt")
val_labels = tokenizer(list(val_dst), padding=True, truncation=True, return_tensors="pt")

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels['input_ids']
})

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –º–æ–¥–µ–ª–∏
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –æ–±—É—á–µ–Ω–∏—è
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_strategy="epoch",
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"  # –û—Ç–∫–ª—é—á–∞–µ—Ç W&B
)

# –°–æ–∑–¥–∞–Ω–∏–µ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
trainer.train()

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
def predict(filename, output_filename):
    with open(filename, 'r', encoding='utf-8') as f, open(output_filename, 'w', encoding='utf-8') as out_f:
        for line in f:
            obj = json.loads(line)
            src_text = obj['src']
            inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)
            outputs = model.generate(**inputs)
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
            out_f.write(json.dumps({"dst": prediction, "src": src_text}) + "\n")

# –í—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π
predict('test_no_reference.jsonl', 'predictions.jsonl')


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.7976,1.450896
2,0.5948,1.420697
3,0.5812,1.415104


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [11]:
def predict(filename, output_filename):
    with open(filename, 'r', encoding='utf-8') as f, open(output_filename, 'w', encoding='utf-8') as out_f:
        for line in f:
            obj = json.loads(line)
            src_text = obj['src']
            inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)
            outputs = model.generate(**inputs)
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
            out_f.write(json.dumps({"dst": prediction, "src": src_text}, ensure_ascii=False) + "\n")

# –í—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π
predict('test_no_reference.jsonl', 'predictions1.jsonl')

In [12]:
def predict(filename, output_filename):
    with open(filename, 'r', encoding='utf-8') as f, open(output_filename, 'w', encoding='utf-8') as out_f:
        for line in f:
            obj = json.loads(line)
            src_text = obj['src']
            inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)
            outputs = model.generate(**inputs, max_length=50)
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
            prediction = prediction[:50]
            out_f.write(json.dumps({"dst": prediction, "src": src_text}, ensure_ascii=False) + "\n")

# –í—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π
predict('test_no_reference.jsonl', 'predictions2.jsonl')