In [15]:
#Идея - использовать модификацию реккурентной сети (lstm) для предсказания появления пробела в предложении
# не такая ресурсоемкая, но эффективная. Реккурентные сети отслеживают как текущую картину так и всю последовательность по порядку, поэтому выбор пал на нее
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [16]:
def preprocess_text(text):
    return text.lower()
def build_vocab(sentences, max_size=120): #строю словарь по частоте появления символов
    counter = Counter()
    for sentence in sentences:
        chars = list(preprocess_text(sentence))
        counter.update(chars)
    most_common = counter.most_common(max_size)
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(most_common)}
    vocab['<PAD>'] = 0
    return vocab
def text_to_seq(text, vocab, max_length=120): # перевод текста в числовой вид (для этого и нужен словарь)
    text = preprocess_text(text)
    sequence = [vocab.get(ch, 0) for ch in list(text)]
    if len(sequence) > max_length:
        sequence = sequence[:max_length]
    else:
        sequence += [vocab['<PAD>']] * (max_length - len(sequence))
    return sequence
def pad_target(target, max_length=120): #добавляем паддинг тк не все предложения одной длины
    if len(target) < max_length:
        target = target + [0] * (max_length - len(target))
    else:
        target = target[:max_length]
    return target


In [17]:
sentences = pd.read_csv('data.csv')
sentences['y'] = sentences['y'].apply(ast.literal_eval) # читаем строку как список

In [18]:
vocab= build_vocab(sentences['x']) #строим словарь по обучающим данным

In [19]:
class WordDataset(Dataset): #кастомный датасет
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.Y = torch.tensor(Y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

In [20]:
X = sentences['x'].apply(text_to_seq, vocab=vocab)
Y = sentences['y'].apply(pad_target)
X, Y = X.tolist(), Y.tolist()
x_train, x_valid, y_train, y_valid = train_test_split( # делим на тестовую и обучающую
    X, Y, test_size=0.15, random_state=42, shuffle=True)

In [21]:
train_dataset = WordDataset(x_train, y_train) #используем даталоадеры для того чтобы потом удобно работать с моделью
valid_dataset = WordDataset(x_valid, y_valid)
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=20)

In [22]:
class SpacePredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.6):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0) 
        self.embedding_dropout = nn.Dropout(0.4)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, 
                           batch_first=True, bidirectional=True, dropout=dropout) #реккурентный слой
        
        self.attention = nn.MultiheadAttention(hidden_dim * 2, num_heads=4, 
                                              dropout=dropout, batch_first=True) #слой внимания
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x): # прямой проход
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        
        lstm_out, _ = self.lstm(embedded)
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        
        output = self.dropout(attn_out)
        output = self.fc(output)
        return self.sigmoid(output).squeeze(-1)

In [23]:
def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #обучал на cuda (rtx 1050 2gb vram)
    model = model.to(device)
    
    criterion = nn.BCELoss() #бинарная кросс-энтропия
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, #планировщик скорости обучения
                                 patience=5, min_lr=1e-6)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        all_train_preds = []
        all_train_targets = []
        
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            
            loss = criterion(output, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_loss += loss.item()
            
            preds = (output > 0.45).float()  # Бинаризация предсказаний
            all_train_preds.extend(preds.cpu().detach().numpy().flatten())
            all_train_targets.extend(target.cpu().detach().numpy().flatten())
        
        # Вычисляем метрики для тренировочных данных
        train_loss /= len(train_loader)
        train_accuracy = accuracy_score(all_train_targets, all_train_preds)
        train_f1 = f1_score(all_train_targets, all_train_preds, average='binary', zero_division=0)
        train_precision = precision_score(all_train_targets, all_train_preds, average='binary', zero_division=0)
        train_recall = recall_score(all_train_targets, all_train_preds, average='binary', zero_division=0)
        
        # validation
        model.eval()
        val_loss = 0
        all_val_preds = []
        all_val_targets = []
        
        with torch.no_grad():
            for vdata, vtarget in val_loader:
                vdata, vtarget = vdata.to(device), vtarget.to(device)
                output = model(vdata)
                loss = criterion(output, vtarget)
                val_loss += loss.item()
                
                preds = (output > 0.45).float()
                all_val_preds.extend(preds.cpu().numpy().flatten())
                all_val_targets.extend(vtarget.cpu().numpy().flatten())
        
        # Вычисляем метрики для валидационных данных
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_val_targets, all_val_preds)
        val_f1 = f1_score(all_val_targets, all_val_preds, average='binary', zero_division=0)
        val_precision = precision_score(all_val_targets, all_val_preds, average='binary', zero_division=0)
        val_recall = recall_score(all_val_targets, all_val_preds, average='binary', zero_division=0)

        scheduler.step(val_f1)
        
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            print(f'Epoch {epoch+1}/{num_epochs}')
            print(f'Train - Loss: {train_loss:.4f}, Acc: {train_accuracy:.4f}, F1: {train_f1:.4f}, '
                  f'Prec: {train_precision:.4f}, Rec: {train_recall:.4f}')
            print(f'Valid - Loss: {val_loss:.4f}, Acc: {val_accuracy:.4f}, F1: {val_f1:.4f}, '
                  f'Prec: {val_precision:.4f}, Rec: {val_recall:.4f}')
            print('-' * 80)

In [24]:
model = SpacePredictor(vocab_size=len(vocab), embedding_dim=64, hidden_dim=128, num_layers=2, dropout=0.6)

In [25]:
history = train_model(model, train_loader, val_loader, num_epochs=70) #обучение 

Epoch 1/70
Train - Loss: 0.1521, Acc: 0.9469, F1: 0.0065, Prec: 0.0615, Rec: 0.0034
Valid - Loss: 0.1338, Acc: 0.9501, F1: 0.0001, Prec: 0.3333, Rec: 0.0001
--------------------------------------------------------------------------------
Epoch 6/70
Train - Loss: 0.1032, Acc: 0.9570, F1: 0.4149, Prec: 0.6671, Rec: 0.3011
Valid - Loss: 0.0939, Acc: 0.9605, F1: 0.4755, Prec: 0.7045, Rec: 0.3589
--------------------------------------------------------------------------------
Epoch 11/70
Train - Loss: 0.0903, Acc: 0.9619, F1: 0.5307, Prec: 0.7061, Rec: 0.4251
Valid - Loss: 0.0841, Acc: 0.9628, F1: 0.5935, Prec: 0.6524, Rec: 0.5444
--------------------------------------------------------------------------------
Epoch 16/70
Train - Loss: 0.0831, Acc: 0.9648, F1: 0.5840, Prec: 0.7259, Rec: 0.4885
Valid - Loss: 0.0786, Acc: 0.9669, F1: 0.6230, Prec: 0.7199, Rec: 0.5490
--------------------------------------------------------------------------------
Epoch 21/70
Train - Loss: 0.0779, Acc: 0.9670,

In [26]:
def pred_to_text(text, preds): # идея в том, что текст без пробелов = меньше символов чем с пробелами, поэтому чтобы правильно восстановить пробелы,
    result = []                # нужно как бы игнорировать индексы единиц в выходе модели
    text_chars = list(text)
    i = j = 0

    while i < len(preds):
        if preds[i] == 0:
            # обычный символ → берем из исходного текста
            if j < len(text_chars):
                result.append(text_chars[j])
                j += 1
            # если j >= len(text_chars), просто игнорируем (паддинг)
        else:
            # пробел -> добавляем, текстовый индекс не двигаем
            result.append(' ')
        i += 1

    return ''.join(result)
def test_text(model, vocab, text, max_length=120):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model = model.to(device)
    # Convert text to sequence
    text_sequence = text_to_seq(text, vocab, max_length)
    
    # Predict
    with torch.no_grad():
        input_tensor = torch.tensor(text_sequence, dtype=torch.long).to(device)
        output = model(input_tensor)
        predictions = (output > 0.5).cpu().numpy()
        #print(output)
    return pred_to_text(text, predictions)

In [119]:
test_text(model, vocab, 'Продавецоченьхорошийрекомендую') #Вспомогательная функция восстанавливающая текст

'Продавец очень хороший рекомендую'

In [28]:
def to_avito_answer(prediction): # из за идеи, предложенной выше, мои метки и метки из задания отличались на зазор количества пробелов,
    pred=prediction.int()        # поэтому последовательно убираем эти лишние индексы
    result = []
    space_handicap = 0
    for i,e in enumerate(pred):
        if e == 1:
            result.append(i-space_handicap)
            space_handicap += 1
    return result

In [29]:
model

SpacePredictor(
  (embedding): Embedding(90, 64, padding_idx=0)
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, dropout=0.6, bidirectional=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (dropout): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## тест на данных

In [111]:
import pandas as pd
import torch

# загружаем файл, разделяя только по первой запятой
def load_task_file(filename):
    ids = []
    texts = []
    extras = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            parts = line.split(",", 2)  # максимум 3 части
            if len(parts) == 3:
                id_, text, extra = parts
            elif len(parts) == 2:
                id_, text = parts
                extra = ""
            else:
                continue
            ids.append(id_)
            texts.append(text)
            extras.append(extra)
    df = pd.DataFrame({
        "id": ids,
        "text_no_spaces": texts,
        "extra": extras
    })
    return df

df = load_task_file("dataset_1937770_3.txt")
# Устройство и модель
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
model = model.to(device)

# функция для предсказания индексов пробелов
def predict_text(text, model, vocab, device, max_length=120):
    text_sequence = text_to_seq(text, vocab, max_length)
    with torch.no_grad():
        input_tensor = torch.tensor(text_sequence, dtype=torch.long).unsqueeze(0).to(device)
        output = model(input_tensor)
        prediction = (output > 0.5).int().cpu().squeeze(0)
        return to_avito_answer(prediction)

# добавляем колонку predicted_positions как строку
df["predicted_positions"] = df["text_no_spaces"].apply(
    lambda x: str(predict_text(x, model, vocab, device))
)

# сохраняем в txt, с запятой как разделитель
with open("submission.txt", "w", encoding="utf-8") as f:
    f.write('id,text_no_spaces,predicted_positions \n')
    for idx, row in df[1:].iterrows():
        # Если extra пустой, не добавляем лишнюю запятую
        if row["extra"].strip():
            line = f'{row["id"]},{row["text_no_spaces"]},{row["extra"]},"{row["predicted_positions"]}"\n'
        else:
            line = f'{row["id"]},{row["text_no_spaces"]},"{row["predicted_positions"]}"\n'
        f.write(line)


id                        958
text_no_spaces    Ночькоротка
extra             цельдалека,
Name: 959, dtype: object