In [1]:
#Единственное отличие от model1 - на чем обучалась модель. Та обучалась на данных без пунктуации, эта - на более "грязных" данных
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
def preprocess_text(text):
    """Normalise raw text before character-level encoding by lower-casing it."""
    lowered = text.lower()
    return lowered
def build_vocab(sentences, max_size=120):
    """Build a character-to-index vocabulary from an iterable of sentences.

    Each sentence is passed through ``preprocess_text`` and its characters are
    counted. The ``max_size`` most frequent characters receive the indices
    1, 2, ... in order of decreasing frequency; index 0 is assigned to the
    ``'<PAD>'`` entry.

    Args:
        sentences: iterable of strings.
        max_size: maximum number of distinct characters to keep.

    Returns:
        dict mapping each kept character, plus ``'<PAD>'``, to an integer index.
    """
    frequencies = Counter()
    for sentence in sentences:
        # Counter.update on a string counts its individual characters.
        frequencies.update(preprocess_text(sentence))

    vocab = {}
    for index, (char, _count) in enumerate(frequencies.most_common(max_size), start=1):
        vocab[char] = index
    vocab['<PAD>'] = 0
    return vocab
def text_to_seq(text, vocab, max_length=120):
    """Encode ``text`` as a list of exactly ``max_length`` vocabulary indices.

    The text is normalised with ``preprocess_text``; characters missing from
    ``vocab`` are encoded as 0. Longer inputs are cut to ``max_length`` and
    shorter ones are right-padded with ``vocab['<PAD>']``.
    """
    encoded = [vocab.get(ch, 0) for ch in preprocess_text(text)]
    overflow = len(encoded) - max_length
    if overflow > 0:
        return encoded[:max_length]
    # -overflow is the number of padding slots still to fill (possibly zero).
    encoded.extend([vocab['<PAD>']] * -overflow)
    return encoded
def pad_target(target, max_length=120):
    """Return a copy of the label list ``target`` with exactly ``max_length`` items.

    Short lists are right-padded with zeros; long lists are truncated.
    """
    missing = max_length - len(target)
    if missing > 0:
        return target + [0] * missing
    return target[:max_length]


In [3]:
# Load the training table. Column 'y' is parsed with ast.literal_eval,
# i.e. it is expected to hold Python literals serialised as strings.
sentences = pd.read_csv('data2.csv')
sentences['y'] = sentences['y'].map(ast.literal_eval)

In [4]:
# Character vocabulary built from the 'x' column of the training table.
vocab= build_vocab(sentences['x'])

In [5]:
class WordDataset(Dataset):
    """Dataset pairing encoded index sequences with their float label sequences."""

    def __init__(self, X, Y):
        """Convert ``X`` to a long tensor and ``Y`` to a float32 tensor once, up front."""
        self.X = torch.tensor(X, dtype=torch.long)
        self.Y = torch.tensor(Y, dtype=torch.float32)

    def __len__(self):
        """Number of samples (length of the first dimension of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(inputs, labels)`` pair stored at ``idx``."""
        inputs = self.X[idx]
        labels = self.Y[idx]
        return inputs, labels

In [6]:
# Encode inputs and pad labels to fixed-length lists, then hold out 15% for validation.
X = sentences['x'].apply(text_to_seq, vocab=vocab).tolist()
Y = sentences['y'].apply(pad_target).tolist()
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

In [7]:
# Training data is reshuffled by its loader; validation data is read in a fixed order.
train_dataset = WordDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)

valid_dataset = WordDataset(x_valid, y_valid)
val_loader = DataLoader(valid_dataset, batch_size=20)

In [8]:
class SpacePredictor(nn.Module):
    """Per-position binary tagger: embedding -> BiLSTM -> self-attention -> sigmoid.

    For every position of the input index sequence the network outputs a
    probability in [0, 1].
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.6):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table (index 0 is the padding index).
            embedding_dim: size of each character embedding.
            hidden_dim: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            dropout: dropout rate used between LSTM layers, inside attention and before the
                output layer (the embedding dropout is fixed at 0.4).
        """
        super().__init__()
        # The LSTM is bidirectional, so it emits 2 * hidden_dim features per position.
        context_dim = hidden_dim * 2

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.4)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

        # Self-attention over the LSTM outputs.
        self.attention = nn.MultiheadAttention(
            context_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(context_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map index sequences ``x`` to per-position probabilities.

        The trailing singleton feature dimension of the output layer is squeezed away,
        so a batched ``(batch, seq_len)`` input yields a ``(batch, seq_len)`` output.
        """
        tokens = self.embedding_dropout(self.embedding(x))

        context, _ = self.lstm(tokens)
        # Query, key and value are all the LSTM outputs (self-attention).
        attended, _ = self.attention(context, context, context)

        scores = self.fc(self.dropout(attended))
        return self.sigmoid(scores).squeeze(-1)

In [9]:
def _binary_metrics(targets, preds):
    """Return (accuracy, f1, precision, recall) for flat arrays of binary labels."""
    return (
        accuracy_score(targets, preds),
        f1_score(targets, preds, average='binary', zero_division=0),
        precision_score(targets, preds, average='binary', zero_division=0),
        recall_score(targets, preds, average='binary', zero_division=0),
    )


def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=0.001, threshold=0.45):
    """Train ``model`` with BCE loss and Adam, validating after every epoch.

    The learning rate is halved (down to 1e-6) when the validation F1 has not
    improved for 5 epochs. A summary is printed every fifth epoch and after the
    last one.

    Args:
        model: network whose output is a probability per position (BCELoss is applied to it).
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: initial Adam learning rate.
        threshold: probability above which a position counts as a positive
            prediction when computing the reported metrics.

    Returns:
        dict mapping ``'train_loss'``, ``'train_accuracy'``, ``'train_f1'``,
        ``'train_precision'``, ``'train_recall'`` and the matching ``'val_*'``
        keys to lists with one value per epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # mode='max' because the scheduler is stepped with validation F1 (higher is better).
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5,
                                 patience=5, min_lr=1e-6)

    # Per-epoch metric history, returned to the caller.
    metric_names = ('loss', 'accuracy', 'f1', 'precision', 'recall')
    history = {f'{split}_{name}': [] for split in ('train', 'val') for name in metric_names}

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0
        train_pred_batches = []
        train_target_batches = []

        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)

            loss = criterion(output, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

            # Keep binarised predictions and targets as whole arrays; they are
            # concatenated once per epoch instead of element by element.
            preds = (output > threshold).float()
            train_pred_batches.append(preds.detach().cpu().numpy().ravel())
            train_target_batches.append(target.detach().cpu().numpy().ravel())

        # Metrics on the training data (predictions were made in train mode, dropout active).
        train_loss /= len(train_loader)
        train_accuracy, train_f1, train_precision, train_recall = _binary_metrics(
            np.concatenate(train_target_batches), np.concatenate(train_pred_batches))

        # Validation
        model.eval()
        val_loss = 0.0
        val_pred_batches = []
        val_target_batches = []

        with torch.no_grad():
            for vdata, vtarget in val_loader:
                vdata, vtarget = vdata.to(device), vtarget.to(device)
                output = model(vdata)
                loss = criterion(output, vtarget)
                val_loss += loss.item()

                preds = (output > threshold).float()
                val_pred_batches.append(preds.cpu().numpy().ravel())
                val_target_batches.append(vtarget.cpu().numpy().ravel())

        # Metrics on the validation data.
        val_loss /= len(val_loader)
        val_accuracy, val_f1, val_precision, val_recall = _binary_metrics(
            np.concatenate(val_target_batches), np.concatenate(val_pred_batches))

        scheduler.step(val_f1)

        epoch_values = {
            'train_loss': train_loss, 'train_accuracy': train_accuracy, 'train_f1': train_f1,
            'train_precision': train_precision, 'train_recall': train_recall,
            'val_loss': val_loss, 'val_accuracy': val_accuracy, 'val_f1': val_f1,
            'val_precision': val_precision, 'val_recall': val_recall,
        }
        for key, value in epoch_values.items():
            history[key].append(float(value))

        if epoch % 5 == 0 or epoch == num_epochs - 1:
            print(f'Epoch {epoch+1}/{num_epochs}')
            print(f'Train - Loss: {train_loss:.4f}, Acc: {train_accuracy:.4f}, F1: {train_f1:.4f}, '
                  f'Prec: {train_precision:.4f}, Rec: {train_recall:.4f}')
            print(f'Valid - Loss: {val_loss:.4f}, Acc: {val_accuracy:.4f}, F1: {val_f1:.4f}, '
                  f'Prec: {val_precision:.4f}, Rec: {val_recall:.4f}')
            print('-' * 80)

    return history

In [10]:
# The embedding table has one row per vocabulary entry (len(vocab) includes '<PAD>').
model = SpacePredictor(
    vocab_size=len(vocab),
    embedding_dim=64,
    hidden_dim=128,
    num_layers=2,
    dropout=0.6,
)

In [11]:
# Train for 55 epochs; `history` holds whatever train_model returns.
history = train_model(model, train_loader, val_loader, num_epochs=55)

Epoch 1/55
Train - Loss: 0.2071, Acc: 0.9275, F1: 0.0060, Prec: 0.0652, Rec: 0.0032
Valid - Loss: 0.1994, Acc: 0.9287, F1: 0.0018, Prec: 0.5312, Rec: 0.0009
--------------------------------------------------------------------------------
Epoch 6/55
Train - Loss: 0.1484, Acc: 0.9391, F1: 0.3492, Prec: 0.6804, Rec: 0.2349
Valid - Loss: 0.1424, Acc: 0.9398, F1: 0.4335, Prec: 0.6592, Rec: 0.3229
--------------------------------------------------------------------------------
Epoch 11/55
Train - Loss: 0.1307, Acc: 0.9446, F1: 0.4672, Prec: 0.7069, Rec: 0.3489
Valid - Loss: 0.1233, Acc: 0.9466, F1: 0.5173, Prec: 0.7266, Rec: 0.4015
--------------------------------------------------------------------------------
Epoch 16/55
Train - Loss: 0.1213, Acc: 0.9481, F1: 0.5246, Prec: 0.7225, Rec: 0.4118
Valid - Loss: 0.1163, Acc: 0.9492, F1: 0.5736, Prec: 0.7140, Rec: 0.4793
--------------------------------------------------------------------------------
Epoch 21/55
Train - Loss: 0.1139, Acc: 0.9512,

In [12]:
def pred_to_text(text, preds):
    """Rebuild a spaced string from ``text`` and a sequence of per-position flags.

    ``preds`` is read left to right against the *output* string: a flag equal
    to 0 consumes the next character of ``text``, any other flag emits a space
    without consuming a character.

    Flags that come after the last character of ``text`` are ignored, so no
    trailing spaces are produced from them. If ``preds`` ends before ``text``
    is exhausted, the remaining characters are appended unchanged rather than
    dropped.

    Args:
        text: the original string without spaces.
        preds: sequence of 0/1 (or False/True) flags.

    Returns:
        The reconstructed string.
    """
    chars = list(text)
    pieces = []
    cursor = 0  # index of the next unconsumed character of `text`

    for flag in preds:
        if cursor >= len(chars):
            # Everything left in `preds` refers to positions past the end of the text.
            break
        if flag == 0:
            # Ordinary character: copy it from the source text.
            pieces.append(chars[cursor])
            cursor += 1
        else:
            # Space: emit it, the text cursor does not move.
            pieces.append(' ')

    # Keep any tail of the text that `preds` was too short to cover.
    pieces.extend(chars[cursor:])
    return ''.join(pieces)
def test_text(model, vocab, text, max_length=120, threshold=0.5):
    """Run ``model`` on a single string and return the text with predicted spaces.

    The text is encoded with ``text_to_seq`` and passed to the model as a batch
    of one sequence, the same way ``predict_text`` and the training loop feed
    it. Per-position outputs above ``threshold`` are treated as spaces and
    rendered with ``pred_to_text``.

    Args:
        model: trained network returning one probability per position.
        vocab: character-to-index mapping used for encoding.
        text: input string without spaces.
        max_length: length the encoded sequence is padded/truncated to.
        threshold: probability above which a position is treated as a space.

    Returns:
        The reconstructed string.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    # Convert text to a fixed-length index sequence.
    text_sequence = text_to_seq(text, vocab, max_length)

    with torch.no_grad():
        # Add an explicit batch dimension instead of relying on unbatched-input support.
        input_tensor = torch.tensor(text_sequence, dtype=torch.long).unsqueeze(0).to(device)
        output = model(input_tensor).squeeze(0)

    predictions = (output > threshold).cpu().numpy()
    return pred_to_text(text, predictions)

In [13]:
# Smoke-test the trained model on one unspaced phrase; the cell output is the restored text.
test_text(model, vocab, 'Ибилетнасамолетссеребристымкрылом')

'И билет на самолет ссеребристымк рылом'

In [14]:
def to_avito_answer(prediction):
    """Convert per-position space flags into indices relative to the unspaced text.

    ``prediction`` is a tensor of flags; each flag equal to 1 marks a space.
    The index reported for a flag is its position minus the number of flags
    equal to 1 seen before it.

    Returns:
        list of int indices, in the order the flags were encountered.
    """
    flags = prediction.int()
    positions = []
    for index, flag in enumerate(flags):
        if flag != 1:
            continue
        # len(positions) is the count of spaces already emitted before this one.
        positions.append(index - len(positions))
    return positions

In [15]:
# Echo the model so the notebook displays its layer structure.
model

SpacePredictor(
  (embedding): Embedding(97, 64, padding_idx=0)
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, dropout=0.6, bidirectional=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (dropout): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## Тест на данных

In [16]:
import pandas as pd
import torch

# Load the task file, splitting every line on the first comma only
def load_task_file(filename):
    """Read an ``id,text`` file into a DataFrame without breaking texts on commas.

    Each line is split at its first comma: the part before it is the id, and
    everything after it (including any further commas) is the text. Lines
    without a comma are skipped. No header handling is done, so a header line,
    if present, becomes the first row.

    Args:
        filename: path to a UTF-8 encoded text file.

    Returns:
        DataFrame with string columns ``id``, ``text_no_spaces`` and ``extra``.
        ``extra`` is always empty; it is kept so code that reads the column
        keeps working.
    """
    ids = []
    texts = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            id_, separator, text = line.rstrip("\n").partition(",")
            if not separator:
                # No comma at all: not an "id,text" record.
                continue
            ids.append(id_)
            texts.append(text)
    df = pd.DataFrame({
        "id": ids,
        "text_no_spaces": texts,
        "extra": [""] * len(ids)
    })
    return df

df = load_task_file("dataset_1937770_3.txt")

# Device and model: move the network to the available device and switch to inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

# Predict the indices at which spaces should be inserted into a text
def predict_text(text, model, vocab, device, max_length=120, threshold=0.5):
    """Return the predicted space positions for one unspaced ``text``.

    The text is encoded with ``text_to_seq``, run through ``model`` as a batch
    of one, binarised at ``threshold`` and converted with ``to_avito_answer``.
    The result is then cleaned up:

    * positions at or beyond ``len(text)`` are dropped - they can only come
      from outputs on the padded tail of the sequence and do not point at a
      character of the text;
    * repeated positions (produced by consecutive predicted spaces) are
      reported once.

    Args:
        text: input string without spaces.
        model: trained network, already in eval mode and on ``device``.
        vocab: character-to-index mapping used for encoding.
        device: torch device the input tensor is moved to.
        max_length: length the encoded sequence is padded/truncated to.
        threshold: probability above which a position is treated as a space.

    Returns:
        list of distinct int positions, in increasing order.
    """
    text_sequence = text_to_seq(text, vocab, max_length)
    with torch.no_grad():
        input_tensor = torch.tensor(text_sequence, dtype=torch.long).unsqueeze(0).to(device)
        output = model(input_tensor)
    prediction = (output > threshold).int().cpu().squeeze(0)

    positions = []
    for position in to_avito_answer(prediction):
        if position >= len(text):
            continue
        # to_avito_answer yields non-decreasing values, so duplicates are adjacent.
        if positions and positions[-1] == position:
            continue
        positions.append(position)
    return positions

# Add the predicted_positions column, storing each list of positions as its string form
df["predicted_positions"] = df["text_no_spaces"].map(
    lambda text: str(predict_text(text, model, vocab, device))
)

# Save as a comma-separated txt file
with open("submission4.txt", "w", encoding="utf-8") as f:
    # Header without a trailing space, so the last column is named exactly "predicted_positions".
    f.write('id,text_no_spaces,predicted_positions\n')
    # The first row of df is skipped (the loader keeps the input file's first line as a row).
    for _, row in df.iloc[1:].iterrows():
        # When "extra" is empty, do not emit an additional comma for it.
        if row["extra"].strip():
            line = f'{row["id"]},{row["text_no_spaces"]},{row["extra"]},"{row["predicted_positions"]}"\n'
        else:
            line = f'{row["id"]},{row["text_no_spaces"]},"{row["predicted_positions"]}"\n'
        f.write(line)

In [17]:
# Persist only the model weights (state_dict), not the whole module object.
torch.save(model.state_dict(), 'dict_model_punctuat_final2.pth')