In [1]:
!pip install --upgrade transformers tensorflow tqdm torch



In [2]:
pip install transformers[torch]



In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm
import torch

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive so that paths under
# "/content/drive/My Drive" are accessible from the cells below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Location of the training text file on the mounted Google Drive; it is passed
# to finetune_gpt2 as the `train_file` argument.
train_file_path = (
    '/content/drive/My Drive/CS505/CS505_Project/'
    'Database_prerow/Resume_data/resume_data.txt'
)

In [6]:
def finetune_gpt2(model_name, train_file, output_dir, block_size=128,
                  num_train_epochs=2, per_device_train_batch_size=16,
                  gradient_accumulation_steps=2):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save the result.

    The model and tokenizer are loaded with ``from_pretrained(model_name)``,
    trained with the Hugging Face ``Trainer`` on ``train_file``, and then both
    are written to ``output_dir`` with ``save_pretrained``.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained`` for
            both the model and the tokenizer (e.g. ``"gpt2"``).
        train_file: Path to the training text file.
        output_dir: Directory used as the Trainer output directory and as the
            destination for the final model and tokenizer files.
        block_size: ``block_size`` passed to ``TextDataset`` (default 128).
        num_train_epochs: Number of training epochs (default 2).
        per_device_train_batch_size: Batch size per device (default 16).
        gradient_accumulation_steps: Number of batches whose gradients are
            accumulated before each optimizer step (default 2).

    Returns:
        None. The fine-tuned model is persisted to ``output_dir``.

    Raises:
        FileNotFoundError: If ``train_file`` is not an existing file. This is
            checked first so a bad path fails before the model is loaded.
    """
    import os  # local import: only needed for the input-file check below

    # Fail fast on a missing training file instead of after loading the model.
    if not os.path.isfile(train_file):
        raise FileNotFoundError(f"Training file not found: {train_file}")

    # Query CUDA availability once; it drives both the log line and fp16.
    # The Trainer itself takes care of moving the model to the device.
    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'
    print(f"Using device: {device}")

    # Load GPT-2 model and tokenizer from the same checkpoint
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Load training dataset.
    # NOTE(review): TextDataset is deprecated in transformers in favour of the
    # `datasets` library; kept here to avoid introducing a new dependency.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
    )

    # mlm=False selects causal (next-token) language modeling, as used by GPT-2
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # Intermediate checkpoints are not wanted; say so explicitly instead of
        # relying on a very large save_steps value. The final model is saved
        # manually after training.
        save_strategy="no",
        logging_dir='./logs',  # Directory for storing logs
        logging_steps=500,  # Log every 500 steps
        fp16=use_cuda,  # Mixed-precision training only when a GPU is available
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    # Trainer renders its own progress bar; this just marks the start in the output
    tqdm.write('Starting training...')
    trainer.train()

    # Save the fine-tuned model together with its tokenizer files
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [7]:
# Fine-tune the base "gpt2" checkpoint on the training file and write the
# resulting model and tokenizer to local Colab storage.
finetune_gpt2(
    model_name="gpt2",
    train_file=train_file_path,
    output_dir='/content/gpt2-finetuned',
)

Using device: cuda




Starting training...


Step,Training Loss
500,4.5084
1000,4.1954
1500,4.0693
2000,3.981
2500,3.9323
3000,3.8946
3500,3.8402
4000,3.8065
4500,3.7915
5000,3.7594


In [8]:
from google.colab import files

# Bundle the saved model directory into a single archive (written to the
# current working directory) so it can be downloaded as one file. Because the
# source is given as an absolute path, entries are stored under
# "content/gpt2-finetuned/" inside the archive.
!zip -r gpt2-finetuned.zip /content/gpt2-finetuned

# Hand the archive to Colab's download helper to start a browser download
files.download('gpt2-finetuned.zip')

  adding: content/gpt2-finetuned/ (stored 0%)
  adding: content/gpt2-finetuned/special_tokens_map.json (deflated 74%)
  adding: content/gpt2-finetuned/generation_config.json (deflated 24%)
  adding: content/gpt2-finetuned/model.safetensors (deflated 7%)
  adding: content/gpt2-finetuned/config.json (deflated 51%)
  adding: content/gpt2-finetuned/vocab.json (deflated 68%)
  adding: content/gpt2-finetuned/tokenizer_config.json (deflated 54%)
  adding: content/gpt2-finetuned/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Copy the saved model directory into the top level of the mounted Google
# Drive ("My Drive"); the backslash escapes the space in the folder name.
!cp -r /content/gpt2-finetuned /content/drive/My\ Drive/