# Загрузка необходимых библиотек


In [1]:
!pip install evaluate
!pip install transformers datasets
!pip install accelerate
!pip install transformers[tourch]
!pip install dataset



In [2]:
import pandas as pd

# Загрузка данных

In [3]:

df = pd.read_csv('result_data.csv', delimiter = ';')
df.columns = ['text', 'label'] # именуем колонки, так как модель принимает фиксированнные  значения 'text' и 'label'
df

Unnamed: 0,text,label
0,Родственник раскрыл настоящую фамилию Пугачёво...,1
1,Предсказания Матроны Московской на 2024-й год:...,1
2,"Пророчество схимонахини Нины об антихристе, ми...",1
3,«Думал об этом»: что Путин сказал о своем прее...,1
4,Путин поручил уведомить россиян об изменениях ...,1
...,...,...
3193,Путин поручил передать Республике Крым все акц...,0
3194,ЕК изучит просьбу Венгрии по нарушению Болгари...,0
3195,"Глава ""Россетей"" доложил Путину о достижении ц...",0
3196,"Платформа ""Мой экспорт"" научит устанавливать д...",0


In [4]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [5]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 3198
})

## Нормализация данных

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2") # подгружаем модель для автотокенайзера

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True) # готовим функцию для токенизации

In [8]:
tokenized_df = dataset.map(preprocess_function, batched=True) # применяем функцию токенизатора для каждого элемента датамета

Map:   0%|          | 0/3198 [00:00<?, ? examples/s]

## Метрики

In [9]:
import evaluate

accuracy = evaluate.load("accuracy")
accuracy.description

'\nAccuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:\nAccuracy = (TP + TN) / (TP + TN + FP + FN)\n Where:\nTP: True positive\nTN: True negative\nFP: False positive\nFN: False negative\n'

In [10]:
precision = evaluate.load("precision")
precision.description


'\nPrecision is the fraction of correctly labeled positive examples out of all of the examples that were labeled as positive. It is computed via the equation:\nPrecision = TP / (TP + FP)\nwhere TP is the True positives (i.e. the examples correctly labeled as positive) and FP is the False positive examples (i.e. the examples incorrectly labeled as positive).\n'

In [11]:
metric_f1 = evaluate.load("f1")


In [12]:
import numpy as np
# считаем accuracy, precision, F1-score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    d = {
        **accuracy.compute(predictions=predictions, references=labels),
        **precision.compute(predictions=predictions, references=labels),
        **metric_f1.compute(predictions=predictions, references=labels)
    }
    return d



## Обучение

In [13]:
id2label = {0: "некликбейт", 1: "кликбейт"}
label2id = {"некликбейт": 0, "кликбейт": 1}

In [14]:
label2id

{'некликбейт': 0, 'кликбейт': 1}

In [15]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny2", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
split = tokenized_df.train_test_split(test_size = 0.2) #делим датасет на train(2558 строк) и test(640 строк)
split

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2558
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 640
    })
})

In [17]:
from transformers import TrainingArguments, Trainer

In [18]:
#обучаем модель, подбираем гиперпараметры
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=True,
    load_best_model_at_end=True,
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split['train'],
    eval_dataset=split['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,F1
1,0.6336,0.404261,0.884375,0.916058,0.871528
2,0.3347,0.277835,0.904687,0.911263,0.897479
3,0.2758,0.246797,0.907813,0.903654,0.902156
4,0.2382,0.237578,0.910937,0.915254,0.904523


TrainOutput(global_step=320, training_loss=0.35650091916322707, metrics={'train_runtime': 19.5751, 'train_samples_per_second': 522.704, 'train_steps_per_second': 16.347, 'total_flos': 3324960792288.0, 'train_loss': 0.35650091916322707, 'epoch': 4.0})

## Проверяем работу модели на новых данных

In [19]:
text = ['Путин подписал указ о 12-летнем образовании', 'Топ 5 отваров из крапивы для ваших суставов']

In [20]:
from transformers import pipeline

classifier = pipeline("text-classification", model=trainer.model.cpu(), tokenizer=tokenizer)
classifier(text)

[{'label': 'некликбейт', 'score': 0.7639785408973694},
 {'label': 'кликбейт', 'score': 0.7995319962501526}]

In [35]:
trainer.model = trainer.model.cuda()

In [36]:
trainer.evaluate(split['test'])

{'eval_loss': 0.2375781238079071,
 'eval_accuracy': 0.9109375,
 'eval_precision': 0.9152542372881356,
 'eval_f1': 0.9045226130653267,
 'eval_runtime': 0.398,
 'eval_samples_per_second': 1608.022,
 'eval_steps_per_second': 50.251,
 'epoch': 4.0}