In [10]:
import re
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Загрузка тестового датасета

In [None]:
# Path to the test dataset; point this at the file's location in your environment.
dataset_path = '/kaggle/input/avito-data/dataset_1937770_3.txt'

# Each line is split on its FIRST comma only, so any further commas stay inside
# the text field. Blank lines are ignored; non-blank lines without a comma are
# skipped and counted so the loss is visible instead of silent.
rows = []
skipped_lines = 0
with open(dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            continue
        id_val, sep, text = stripped.partition(',')
        if not sep:
            skipped_lines += 1
            continue
        rows.append((id_val, text))

if not rows:
    raise ValueError(f"No 'id,text' records found in {dataset_path}")

# The first parsed row carries the column names; everything after it is data.
# Building the frame in one step gives a clean 0-based index and unnamed columns axis.
header, *records = rows
df = pd.DataFrame(records, columns=list(header))

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) without a comma separator")

df.head()

# Загрузка модели

In [6]:
# Folder with the saved weights; the tokenizer is read from the same folder.
checkpoint_path = "/kaggle/input/my-rut5/kaggle/working/outputs/checkpoint-6320"

# Tuple elements are evaluated left to right: tokenizer first, then the model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(checkpoint_path),
    AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path),
)

# Предсказание позиций пробелов

In [7]:
def predict_space_positions(model, tokenizer, texts, device=None, max_length=128, num_beams=5, batch_size=1):
    """Generate a prediction for every text and locate the spaces in it.

    Args:
        model: Seq2seq model exposing ``to``, ``eval`` and ``generate``. It is
            moved to ``device`` and switched to eval mode as a side effect.
        tokenizer: Tokenizer matching ``model``; it is called on a list of
            strings and must provide ``batch_decode``.
        texts: Iterable of input strings (list, Series, generator, ...).
        device: Device to run on. When ``None``, CUDA is used if available,
            otherwise CPU.
        max_length: Limit used both for input truncation and for the
            generated sequence length.
        num_beams: Beam width passed to ``model.generate``.
        batch_size: Number of texts sent through ``generate`` at once. The
            default of 1 keeps the one-text-per-call behaviour; larger values
            pad the inputs of a batch to a common length and decode them
            together, which is faster on a GPU.

    Returns:
        A ``(predictions, positions)`` tuple of equal-length lists:
        ``predictions[i]`` is the decoded string for ``texts[i]`` and
        ``positions[i]`` lists the indices of the ``" "`` characters in it.
        NOTE: the indices refer to the decoded prediction itself, so every
        earlier space shifts the later ones; they are not offsets into the
        original input string.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    # Materialize once so any iterable can be sliced into batches.
    texts = list(texts)

    predictions = []
    positions = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,  # no-op for a single text; aligns lengths within a batch
            truncation=True,
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
            )

        for pred in tokenizer.batch_decode(outputs, skip_special_tokens=True):
            predictions.append(pred)
            positions.append([i for i, ch in enumerate(pred) if ch == " "])

    return predictions, positions

In [8]:
# Pull the space-free strings out of the loaded frame as a plain Python list.
test_texts = df.loc[:, "text_no_spaces"].tolist()
# Preview the first five inputs.
test_texts[0:5]

['куплюайфон14про',
 'ищудомвПодмосковье',
 'сдаюквартирусмебельюитехникой',
 'новыйдивандоставканедорого',
 'отдамдаромкошку']

In [12]:
preds, pos = predict_space_positions(model, tokenizer, test_texts)

# Take the ids from the dataset itself (first column of `df`, where the loader
# stores each line's leading field) instead of a synthetic 0..N-1 range, so each
# prediction keeps its real id even when the file's ids are not consecutive or
# some lines were skipped while loading.
ids = df.iloc[:, 0].to_list()
assert len(ids) == len(pos), "predictions are not aligned with the dataset rows"

df_res = pd.DataFrame({"id": ids, "predicted_positions": pos})
df_res

Unnamed: 0,id,predicted_positions
0,0,"[5, 12, 15]"
1,1,"[3, 7, 9]"
2,2,"[4, 13, 21, 23]"
3,3,"[5, 11, 20]"
4,4,"[5, 11]"
...,...,...
1000,1000,[]
1001,1001,"[7, 9, 18]"
1002,1002,"[11, 20]"
1003,1003,"[8, 20, 24]"


In [13]:
# Write the results to CSV; the DataFrame index is left out of the file.
df_res.to_csv(path_or_buf="sub1_avito.csv", index=False)