## Libraries

In [1]:
import random
import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Model

In [2]:
class QualityModel(nn.Module):
    """Multi-head regressor that scores a text on several quality aspects.

    A pretrained transformer encoder yields one embedding per input text, a
    shared projection maps it to ``hidden_dim``, and one linear head per
    aspect emits a single raw (unbounded) score.

    Args:
        encoder_name: Identifier passed to ``AutoModel.from_pretrained``.
        hidden_dim: Width of the shared projection layer.
        dropout: Dropout probability applied after the shared projection.
        pooling: How token states are reduced to one vector per text.
            ``"cls"`` (default, the historical behaviour) takes the hidden
            state of the first token; ``"mean"`` averages hidden states over
            the positions where ``attention_mask`` is non-zero.

    Raises:
        ValueError: If ``pooling`` is not a supported strategy.
    """

    # Head names; the order fixes the ModuleDict order and therefore the
    # order in which aspects are averaged into ``quality_score``.
    ASPECTS = ("informativeness", "clarity", "completeness", "persuasion")
    POOLING_MODES = ("cls", "mean")

    def __init__(
        self,
        encoder_name: str,
        hidden_dim: int = 256,
        dropout: float = 0.1,
        pooling: str = "cls",
    ):
        # Validate before loading the encoder so a typo fails fast instead of
        # after a (potentially slow) weight download.
        if pooling not in self.POOLING_MODES:
            raise ValueError(
                f"pooling must be one of {self.POOLING_MODES}, got {pooling!r}"
            )

        super().__init__()
        self.pooling = pooling
        self.encoder = AutoModel.from_pretrained(encoder_name)
        emb_dim = self.encoder.config.hidden_size

        self.shared = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        self.heads = nn.ModuleDict(
            {name: nn.Linear(hidden_dim, 1) for name in self.ASPECTS}
        )

    def _pool(self, hidden, attention_mask):
        """Reduce ``hidden`` (batch, seq, dim) to one vector per batch row."""
        # getattr keeps instances created before ``pooling`` existed working.
        if getattr(self, "pooling", "cls") != "mean":
            return hidden[:, 0]

        if attention_mask is None:
            return hidden.mean(dim=1)

        # Zero out padded positions, then divide by the number of real tokens.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return (hidden * mask).sum(dim=1) / token_count

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Score a batch of tokenized texts.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Padding mask forwarded to the encoder (and used
                for masked averaging when ``pooling="mean"``).
            **kwargs: Accepted and ignored, so a tokenizer output containing
                extra fields can be splatted into the call.

        Returns:
            dict with
              * ``"aspects"``: mapping of aspect name to a 1-D tensor of raw
                head outputs (no sigmoid applied), one value per batch row;
              * ``"quality_score"``: per-row mean of those raw head outputs.
        """
        out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        pooled = self._pool(out.last_hidden_state, attention_mask)
        shared = self.shared(pooled)

        aspects = {
            name: head(shared).squeeze(-1)
            for name, head in self.heads.items()
        }

        quality_score = torch.stack(list(aspects.values()), dim=1).mean(dim=1)

        return {
            "quality_score": quality_score,
            "aspects": aspects,
        }

## Dataset

In [3]:
class QualityDataset(Dataset):
    """Map-style dataset over a sequence of labelled text samples.

    Each sample is a mapping with a ``"text"`` entry plus one numeric entry
    per name in ``ASPECTS``.

    Args:
        samples: Indexable collection of such mappings.
    """

    ASPECTS = ("informativeness", "clarity", "completeness", "persuasion")

    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"text": str, <aspect>: 0-d tensor, ...}``."""
        sample = self.samples[idx]
        # Force a floating dtype: letting torch infer it would turn an integer
        # label (e.g. ``1``) into an int64 tensor, which does not match the
        # float predictions it is later compared against in the loss.
        dtype = torch.get_default_dtype()

        item = {"text": sample["text"]}
        for aspect in self.ASPECTS:
            item[aspect] = torch.as_tensor(sample[aspect], dtype=dtype)
        return item

## Synthetic data generation

In [4]:
def make_quality_sample(kind: str):
    """Build one synthetic classified-ad sample of the requested kind.

    Args:
        kind: ``"good"``, ``"short"`` or ``"spam"``. Any other value
            (including ``"bad"``) produces the "bad" sample.

    Returns:
        dict with ``"text"`` and one float per aspect (``"informativeness"``,
        ``"clarity"``, ``"completeness"``, ``"persuasion"``). For the three
        named kinds each score is drawn uniformly from a kind-specific range
        using the global ``random`` state; the "bad" sample uses a constant
        0.1 for every aspect and consumes no random numbers.
    """
    aspects = ("informativeness", "clarity", "completeness", "persuasion")

    # kind -> (ad text, (low, high) score range per aspect, in `aspects` order).
    # The texts are Russian and are written as real UTF-8; the previous
    # literals were mojibake (UTF-8 bytes decoded with a legacy code page).
    templates = {
        # "Selling a bicycle. Excellent condition, used for 6 months.
        #  Reason for sale: bought a new one."
        "good": (
            "Продаю велосипед. Отличное состояние, использовался 6 месяцев. "
            "Причина продажи — покупка нового.",
            ((0.7, 0.9), (0.7, 0.9), (0.7, 0.9), (0.5, 0.8)),
        ),
        # "Selling a bicycle" (too terse to be informative).
        "short": (
            "Продам велосипед",
            ((0.2, 0.4), (0.6, 0.7), (0.2, 0.4), (0.3, 0.4)),
        ),
        # "BUY NOW !!!" wrapped in fire emoji.
        "spam": (
            "🔥🔥🔥 КУПИ СЕЙЧАС !!! 🔥🔥🔥",
            ((0.2, 0.4), (0.2, 0.4), (0.2, 0.4), (0.7, 0.9)),
        ),
    }

    if kind not in templates:
        # "bad" and every unrecognised kind: fixed low scores.
        return {"text": "!!!", **{aspect: 0.1 for aspect in aspects}}

    text, ranges = templates[kind]
    sample = {"text": text}
    # Draw in `aspects` order so a seeded run yields the same values as the
    # original if/elif implementation.
    for aspect, (low, high) in zip(aspects, ranges):
        sample[aspect] = random.uniform(low, high)
    return sample


def generate_quality_dataset(n=2000, seed=None):
    """Generate ``n`` synthetic samples with uniformly random kinds.

    Args:
        n: Number of samples to produce.
        seed: Optional seed. When given, the global ``random`` generator is
            re-seeded first so the same call always returns the same dataset.
            ``None`` (default) leaves the generator state untouched.

    Returns:
        list of sample dicts as produced by ``make_quality_sample``.
    """
    if seed is not None:
        # make_quality_sample draws from the module-level generator, so the
        # global state is what has to be seeded.
        random.seed(seed)

    kinds = ("good", "short", "spam", "bad")
    return [make_quality_sample(random.choice(kinds)) for _ in range(n)]

## Train function

In [None]:
def train_quality_model(
    dataset,
    encoder_name: str,
    epochs: int = 5,
    lr: float = 5e-5,
    batch_size: int = 16,
):
    """Train the heads of a ``QualityModel`` on top of a frozen encoder.

    Only the shared projection and the aspect heads are optimised; every
    encoder parameter has ``requires_grad`` switched off. The loss is the sum
    over aspects of ``SmoothL1Loss(sigmoid(prediction), label)``.

    Args:
        dataset: Sized, map-style dataset whose items contain ``"text"`` and
            one numeric label per aspect head of the model.
        encoder_name: Identifier used to load both the tokenizer and the
            encoder.
        epochs: Number of passes over ``dataset``.
        lr: AdamW learning rate.
        batch_size: Mini-batch size.

    Returns:
        The trained ``QualityModel``, switched to eval mode.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("dataset is empty; nothing to train on")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(encoder_name)
    model = QualityModel(encoder_name).to(device)

    for p in model.encoder.parameters():
        p.requires_grad = False

    # Hand the optimizer only what can actually receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr)
    criterion = nn.SmoothL1Loss()

    # Take the label names from the model so they cannot drift from its heads.
    aspect_names = list(model.heads.keys())

    def collate(batch):
        """Turn a list of dataset items into (list of texts, label tensors)."""
        texts = [item["text"] for item in batch]
        labels = {
            # float() accepts both plain numbers and 0-d tensors; the explicit
            # dtype keeps labels compatible with the float32 predictions.
            name: torch.as_tensor(
                [float(item[name]) for item in batch], dtype=torch.float32
            )
            for name in aspect_names
        }
        return texts, labels

    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=True, collate_fn=collate
    )

    model.train()
    # The encoder is frozen, so keep it in eval mode: otherwise its dropout
    # layers would inject noise into features that are meant to be fixed.
    model.encoder.eval()

    for epoch in range(epochs):
        epoch_loss = 0.0
        loop = tqdm.tqdm(loader, desc=f"Epoch {epoch + 1}")

        for texts, labels in loop:
            tokens = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )

            tokens = {k: v.to(device) for k, v in tokens.items()}
            labels = {k: v.to(device) for k, v in labels.items()}

            out = model(**tokens)

            # Heads emit raw scores; squash to (0, 1) to match the label range.
            loss = 0.0
            for aspect, pred in out["aspects"].items():
                loss += criterion(torch.sigmoid(pred), labels[aspect])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch + 1}: loss={epoch_loss / len(loader):.4f}")

    # Leave the model ready for inference (dropout disabled).
    model.eval()
    return model

## Train

In [6]:
# Run configuration, kept in one place so the cell is easy to tune.
SEED = 42
N_SAMPLES = 3000
ENCODER_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
CHECKPOINT_PATH = "quality.pt"

# Seed both generators used in this run: `random` drives the synthetic data,
# torch drives head initialisation, shuffling and dropout. Without this a
# Restart & Run All produces a different dataset and different losses.
random.seed(SEED)
torch.manual_seed(SEED)

samples = generate_quality_dataset(N_SAMPLES)
dataset = QualityDataset(samples)

model = train_quality_model(
    dataset,
    encoder_name=ENCODER_NAME,
)

# Only the weights are stored; loading them requires rebuilding QualityModel
# with the same encoder name first.
torch.save(model.state_dict(), CHECKPOINT_PATH)

Epoch 1 / 5: 100%|██████████| 188/188 [00:19<00:00,  9.78it/s, loss=0.0499]


Epoch 1: loss=0.1057


Epoch 2 / 5: 100%|██████████| 188/188 [00:17<00:00, 10.88it/s, loss=0.0202]


Epoch 2: loss=0.0381


Epoch 3 / 5: 100%|██████████| 188/188 [00:16<00:00, 11.71it/s, loss=0.0136]


Epoch 3: loss=0.0153


Epoch 4 / 5: 100%|██████████| 188/188 [00:16<00:00, 11.67it/s, loss=0.00628]


Epoch 4: loss=0.0093


Epoch 5 / 5: 100%|██████████| 188/188 [00:16<00:00, 11.69it/s, loss=0.00796]


Epoch 5: loss=0.0077
