In [1]:
!pip install transformers



In [2]:
import torch
import pandas as pd

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Загрузка данных из CSV-файла
data = pd.read_csv('Train_Only_Sentence.csv', encoding='utf-8-sig', sep=';')
data.fillna({'text': '0'}, inplace=True)
# Инициализация токенизатора и модели GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2', pad_token='<PAD>')
model = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2').to(device)

# Задание параметров генерации текста
max_length = 64
top_k = 20
top_p = 0.9
num_return_sequences = 1

# Создание списка для сгенерированных текстов
generated_texts = []

# Генерация текстов
with tqdm(total=len(data), desc='Generating texts', position=0, leave=True) as pbar:
    tqdm.write_every = 10
    for i, sentence in enumerate(data['text']):
        # Создание input_ids из предложения
        torch.cuda.empty_cache()
        input_ids = tokenizer.encode(sentence, return_tensors='pt', truncation=True, max_length=128).to(device)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
        # Генерация текста
        sample_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            max_length=max_length,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=0,
            pad_token_id=tokenizer.pad_token_id
            )
        # Декодирование сгенерированного текста
        generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        generated_texts.append(generated_text)
        pbar.update()
        pbar.set_postfix({'generated_text': generated_text})
        
# Добавление сгенерированных текстов в DataFrame
data['generated_text'] = generated_texts

# Сохранение результата в файл CSV
data.to_csv('Title_conditioned.csv', index=False)

Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating texts:   0%|          | 0/100000 [00:00<?, ?it/s]

Input length of input_ids is 69, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 68, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 73, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 68, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 75, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
