In [1]:
!pip install transformers



In [5]:
# `transformers` is installed by the notebook's first cell, so it is not
# reinstalled here: repeating the shell call on every run of this cell is
# redundant and forks a subprocess from a kernel that may already have used
# the tokenizer.
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load the sentence pairs from the semicolon-separated input file
# (utf-8-sig also strips a leading byte-order mark if the file has one).
data = pd.read_csv('Train_Only_Sentence_NLI.csv', encoding='utf-8-sig', sep=';')

# Fail fast, before the model is loaded, if the columns needed for
# inference are absent (for example when the separator did not match the file).
missing_columns = {'premise', 'hypothesis'} - set(data.columns)
if missing_columns:
    raise KeyError(f"Missing required column(s) in input CSV: {sorted(missing_columns)}")

# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same repository.
MODEL_NAME = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the weights to the chosen device and make inference mode explicit
# (dropout disabled) rather than relying on the loader's default.
model.to(device)
model.eval()

# Convert both text columns to plain Python string lists; astype(str) also
# turns missing values into the literal string 'nan' so the tokenizer never
# receives a non-string value.
premises = data['premise'].astype(str).tolist()
hypotheses = data['hypothesis'].astype(str).tolist()

BATCH_SIZE = 64    # sentence pairs per forward pass
MAX_LENGTH = 256   # longer pairs are truncated to this many tokens

# Tokenize and score one batch at a time. Each batch is padded only to its own
# longest pair (not to the longest pair in the whole file), and only the
# current batch lives on the device, so memory use no longer grows with the
# size of the dataset.
batch_logits_cpu = []
with torch.no_grad():
    for start in tqdm(range(0, len(premises), BATCH_SIZE)):
        encoded_batch = tokenizer(
            premises[start:start + BATCH_SIZE],
            hypotheses[start:start + BATCH_SIZE],
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )
        batch_logits = model(
            input_ids=encoded_batch['input_ids'].to(device),
            attention_mask=encoded_batch['attention_mask'].to(device),
        ).logits
        # Keep accumulated results in host memory so device memory is released
        # as soon as the batch tensors go out of scope.
        batch_logits_cpu.append(batch_logits.cpu())

# torch.cat raises on an empty list, so an empty input file yields an empty
# (0, num_labels) logits tensor instead of an exception.
if batch_logits_cpu:
    logits = torch.cat(batch_logits_cpu, dim=0)
else:
    logits = torch.empty((0, model.config.num_labels))

# Index of the highest-scoring class for every sentence pair.
predictions = torch.argmax(logits, dim=1)

# Translate every predicted class index into the label name stored in the
# model configuration.
id2label = model.config.id2label
decoded_predictions = [id2label[class_id] for class_id in predictions.tolist()]

# Store the predicted label names in the DataFrame's 'label' column.
data['label'] = decoded_predictions

# Write the DataFrame, including the predictions, to CSV without the index.
# NOTE(review): the input was read with sep=';' and utf-8-sig, while this call
# writes with the pandas defaults (comma, utf-8) — confirm that is intended.
data.to_csv('Train_Only_Sentence_NLI_out.csv', index=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1563/1563 [31:04<00:00,  1.19s/it]
