In [None]:
! pip install transformers datasets

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch
import os

# Debugging aid for CUDA errors: export CUDA_LAUNCH_BLOCKING=1 for this process.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Load your data
# The CSV is read with '|' as the column separator; only the 'sender_text'
# column is used below.
stas_messages = pd.read_csv("/kaggle/input/YOUR_CSV_FILE",sep='|')  # Adjust the path to your actual CSV file
# Drop missing cells and coerce everything to str: pandas yields float NaN for
# empty cells (and numbers for purely numeric cells), and concatenating those
# with '\n' would raise TypeError while writing the file.
messages = stas_messages['sender_text'].dropna().astype(str).tolist()

# Save messages to a text file, one message per line
with open('messages.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{message}\n" for message in messages)

# Base checkpoint to fine-tune; the tokenizer and the LM-head model are both
# loaded from the same identifier (tokenizer first, then the model).
model_name = "AUTHOR/MODEL"  # Adjust the model name to your actual base-model
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(model_name),
    GPT2LMHeadModel.from_pretrained(model_name),
)

# Create a dataset
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``. Defaults to
            128, the value that used to be hard-coded here, so existing
            two-argument calls behave exactly as before.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    # NOTE(review): TextDataset appears to be deprecated in recent transformers
    # releases -- confirm against the installed version before upgrading.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

# Create a data collator
def create_data_collator(tokenizer):
    """Return a ``DataCollatorForLanguageModeling`` for ``tokenizer`` with ``mlm=False``."""
    collator_kwargs = {"tokenizer": tokenizer, "mlm": False}
    return DataCollatorForLanguageModeling(**collator_kwargs)

# Build the training dataset from the dumped messages file, then the collator
# that will batch its examples; both share the same tokenizer.
dataset = load_dataset(file_path='messages.txt', tokenizer=tokenizer)
data_collator = create_data_collator(tokenizer=tokenizer)

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints go; an existing directory is overwritten.
    output_dir="./gpt-messages",
    overwrite_output_dir=True,
    # Checkpointing: save every 10,000 steps, with save_total_limit=2.
    save_steps=10_000,
    save_total_limit=2,
    # Schedule: 3 epochs, 1 sample per device per step, 8 accumulation steps.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision is switched on only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Wire the model, the training configuration, the dataset and the collator
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model.
# `trainer.train()` already iterates over `training_args.num_train_epochs`
# epochs by itself, so it must be called exactly once. Wrapping it in a
# per-epoch loop re-ran the complete schedule on every iteration (3 x 3 epochs
# with the settings above), restarting the optimizer and LR schedule each time.
trainer.train()
torch.cuda.empty_cache()  # Clear the cache once training has finished

# Persist the fine-tuned model, then reload it from disk for generation.
finetuned_dir = "./rugpt3-messages"
trainer.save_model(finetuned_dir)

# The model comes from the saved directory; the tokenizer is re-created from
# the original base checkpoint name.
model = GPT2LMHeadModel.from_pretrained(finetuned_dir)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Generate text
def generate_text(prompt, max_length=50, num_return_sequences=1):
    """Sample continuations of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text that is encoded with the module-level ``tokenizer`` and
            used as the start of every generated sequence.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Forwarded to ``model.generate``; number of
            sequences requested.

    Returns:
        A list of decoded strings (special tokens skipped), one per sequence
        returned by ``model.generate``.
    """
    # Keep the inputs on the same device as the model so generation also works
    # when the model has been moved to a GPU.
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # A single un-padded prompt attends to every token; passing the mask
    # explicitly avoids generate() having to guess it.
    attention_mask = torch.ones_like(inputs)

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Example usage
# Request five samples for a one-word seed prompt with max_length=100.
prompt = "Никита"
generated_texts = generate_text(prompt=prompt, max_length=100, num_return_sequences=5)

# Print the samples numbered from 1, each followed by a blank line.
for number, text in enumerate(generated_texts, start=1):
    print(f"Generated Text {number}: {text}\n")


In [None]:
# download the model
!cd /kaggle/working
!tar -czvf friend_gpt.zip -C . .