### **Цензурирование**
***
Загрузка модели и токенизатора

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Local directory that holds the fine-tuned token-classification checkpoint.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = "C:/Users/kpodd/OneDrive/Desktop/ml/NER/model"  # path to the best model

# Load the tokenizer and the token-classification model from the same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
# Switch the model to evaluation mode for inference. As the last expression
# of the notebook cell, this call is also what displays the model structure.
model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [3]:
# Label inventory of the tagger; the list position of a label is its class id.
label_list = ["O", "B-PRF", "I-PRF"]
# Reverse lookup: class id -> label string.
id2label = dict(enumerate(label_list))

In [7]:
def predict_labels(text: str, max_length: int = 256):
    """Predict one label per whitespace-separated word of ``text``.

    The text is split on whitespace and passed to the module-level
    ``tokenizer`` and ``model`` as pre-split words.  Every word receives the
    label predicted for its first sub-token.  Text that does not fit into a
    single ``max_length`` window is processed in consecutive windows instead
    of being cut off, so the two returned lists always have the same length.

    Args:
        text: Raw input text.
        max_length: Maximum number of sub-tokens handed to the tokenizer per
            forward pass (used as its ``max_length`` with truncation enabled).

    Returns:
        A ``(tokens, word_labels)`` tuple: the whitespace-split words and a
        parallel list of label strings looked up in ``id2label``.
    """
    tokens = text.strip().split()
    word_labels = []

    # Only hand the tokenizer a bounded slice of words per pass so that long
    # inputs are not re-tokenized in full on every iteration.
    # NOTE(review): assumes a word normally yields at least one sub-token, so
    # no more than ``max_length`` words can fit into one window anyway.
    window = max(1, max_length)

    start = 0
    while start < len(tokens):
        chunk = tokens[start:start + window]
        encoding = tokenizer(
            chunk,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )

        with torch.no_grad():
            logits = model(**encoding).logits

        # Index the single batch item explicitly rather than squeezing, so the
        # sequence axis is never collapsed by accident.
        preds = logits.argmax(dim=-1)[0].tolist()

        # Labels are written by word index (not appended) so that a word
        # without any sub-token cannot shift the labels of later words.
        chunk_labels = ["O"] * len(chunk)
        covered = 0
        previous_word = None
        for position, word_id in enumerate(encoding.word_ids()):
            # Skip special tokens and continuation sub-tokens of a word.
            if word_id is None or word_id == previous_word:
                continue
            previous_word = word_id
            chunk_labels[word_id] = id2label[preds[position]]
            covered = word_id + 1

        # If no word of the chunk was seen by the model, keep the whole chunk
        # as "O" so the loop is guaranteed to advance.
        if covered == 0:
            covered = len(chunk)

        word_labels.extend(chunk_labels[:covered])
        start += covered

    return tokens, word_labels

In [5]:
def censor_word(word: str, mask_char: str = "*") -> str:
    """Mask the inner characters of ``word``, keeping its first and last one.

    Words of at most two characters are masked completely.

    Args:
        word: The word to mask.
        mask_char: String repeated once for every hidden character.

    Returns:
        The masked word.
    """
    length = len(word)
    if length > 2:
        hidden = mask_char * (length - 2)
        return f"{word[0]}{hidden}{word[-1]}"
    return mask_char * length

def censor_text(text: str, censor_mode: str = "smart", mask_char: str = "*") -> str:
    """Return ``text`` with every word tagged ``B-PRF``/``I-PRF`` masked.

    Words and their labels come from ``predict_labels``.  The result is the
    (possibly masked) words joined with single spaces.

    Args:
        text: Raw input text.
        censor_mode: ``"stars"`` replaces the whole word with ``mask_char``;
            ``"smart"`` keeps the first and last character (``censor_word``);
            any other value replaces the word with ``"[CENSORED]"``.
        mask_char: String used for every masked character.

    Returns:
        The censored text.
    """
    tokens, labels = predict_labels(text)

    # Guard against a label list shorter than the word list (e.g. when the
    # model input was truncated): zip() would otherwise silently drop the
    # remaining words from the output. A non-positive multiplier yields [].
    labels = list(labels)
    labels += ["O"] * (len(tokens) - len(labels))

    flagged_labels = ("B-PRF", "I-PRF")
    censored_tokens = []
    for token, label in zip(tokens, labels):
        if label not in flagged_labels:
            censored_tokens.append(token)
        elif censor_mode == "stars":
            censored_tokens.append(mask_char * len(token))
        elif censor_mode == "smart":
            censored_tokens.append(censor_word(token, mask_char))
        else:
            censored_tokens.append("[CENSORED]")

    return " ".join(censored_tokens)

In [21]:
# Demo: run the censoring pipeline on a sample sentence and print the
# original next to the result produced in "smart" mode.
text = "Ты совсем ахуел пидарас ебаный. Не притворяйся мужиком, ты тот еще гандон мелкий."
print("Оригинал:", text)
print("Цензура :", censor_text(text, censor_mode="smart"))

Оригинал: Ты совсем ахуел пидарас ебаный. Не притворяйся мужиком, ты тот еще гандон мелкий.
Цензура : Ты совсем а***л п*****с е*****. Не притворяйся мужиком, ты тот еще г****н мелкий.
