# Fine Tuning


In [None]:
import torch
import pandas as pd
import time
from datasets import Dataset, DatasetDict
from datetime import datetime
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline
)

  Referenced from: <6E6BE615-472A-3225-994B-C7BC27D09EAE> /Users/tulgakagan/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  Reason: tried: '/Users/tulgakagan/anaconda3/lib/python3.10/site-packages/torchvision/../../../libjpeg.8.dylib' (no such file), '/Users/tulgakagan/anaconda3/lib/python3.10/site-packages/torchvision/../../../libjpeg.8.dylib' (no such file), '/Users/tulgakagan/anaconda3/lib/python3.10/lib-dynload/../../libjpeg.8.dylib' (no such file), '/Users/tulgakagan/anaconda3/bin/../lib/libjpeg.8.dylib' (no such file), '/usr/local/lib/libjpeg.8.dylib' (no such file), '/usr/lib/libjpeg.8.dylib' (no such file, not in dyld cache)
  warn(f"Failed to load image Python extension: {e}")


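## Configuration

Paths, hyperparameters and the list of authors to fine-tune. When running on Colab, link (mount) the Google Drive folder first and point `DATA_PATH` and `OUTPUT_DIR_BASE` at it.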

In [1]:
# Paths & Hyperparameters

# --- Paths ---
DATA_PATH = "../data/processed/preprocessed_data.csv"  # CSV handed to load_and_prepare_data
OUTPUT_DIR_BASE = "gpt2_models"  # each author's model is written to a sub-folder of this directory

# --- Training / generation settings ---
NUM_EPOCHS = 5    # training epochs per author model
BATCH_SIZE = 2    # per-device training batch size
MAX_LENGTH = 512  # every example is truncated/padded to this many tokens
NUM_SAMPLES = 20  # reviews per author to generate

# --- Authors ---
# One model is fine-tuned per entry. The position in this list determines the
# index used in the author's <|AUTHOR_i|> special token.
AUTHORS = [
    "joe tangari",
    "stephen m. deusner",
    "ian cohen",
    "brian howe",
    "mark richardson",
    "stuart berman",
    "marc hogan",
    "nate patrin",
    "marc masters",
    "jayson greene",
]
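# NOTE: on the last run, Drive storage issues forced us to fine-tune the final two
# authors separately, by temporarily replacing the list above with
#     AUTHORS = ["marc masters", "jayson greene"]
# (the saved training output further down comes from that two-author run).
# Names must be spelled exactly as in the data's "author" column, i.e. with
# spaces rather than underscores.
# NOTE(review): author tokens are numbered by list position, so with the short
# list these two authors get <|AUTHOR_0|> / <|AUTHOR_1|> instead of the indices
# they have in the full list - confirm the generation code uses matching tokens.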

In [4]:
# Device & Tokenizer Setup
def setup_tokenizer(authors):
    """Create the GPT-2 tokenizer extended with author and review-start tokens.

    Args:
        authors: Iterable of author names. Each name is assigned the token
            ``<|AUTHOR_i|>`` where ``i`` is its position in the iterable.

    Returns:
        A ``(tokenizer, author_tokens)`` tuple: the extended tokenizer and a
        dict mapping each author name to its special token.
    """
    # Position-based control tokens, one per author.
    author_tokens = {
        name: f"<|AUTHOR_{position}|>" for position, name in enumerate(authors)
    }
    extra_tokens = [*author_tokens.values(), "<|REVIEW_START|>"]

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Register the control tokens as special tokens of the tokenizer.
    tokenizer.add_special_tokens({"additional_special_tokens": extra_tokens})
    # Padding reuses the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer, author_tokens

# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Shared tokenizer plus the author -> special-token mapping used to build prompts.
tokenizer, author_tokens = setup_tokenizer(authors=AUTHORS)

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
# Data Preparation
def load_and_prepare_data(path, author_tokens):
    """Load the review CSV and prefix every review with its author's token.

    Args:
        path: Location of the CSV (anything ``pd.read_csv`` accepts). The file
            must provide ``author`` and ``content`` columns.
        author_tokens: Mapping from author name to that author's special
            token. Only rows whose ``author`` is a key of this mapping are kept.

    Returns:
        A new DataFrame with the kept rows and an added ``prompted_text``
        column of the form ``<author token><|REVIEW_START|> <content>``.
        The frame is empty (but still has the column) when no row matches.

    Raises:
        KeyError: If the CSV has no ``author`` or ``content`` column.
        TypeError: If a kept row's ``content`` is not a string (for example a
            missing value), because it cannot be concatenated to the prompt.
    """
    reviews = pd.read_csv(path)

    # Filter on the same mapping that is used for the token lookup below, so
    # the function no longer depends on a module-level author list that could
    # disagree with ``author_tokens``.
    is_known_author = reviews["author"].isin(list(author_tokens))

    # Work on an explicit copy: adding a column to a filtered slice is what
    # triggers pandas' SettingWithCopyWarning.
    selected = reviews.loc[is_known_author].copy()

    # Build the prompts with a plain comprehension instead of a row-wise
    # ``apply``: it yields an empty list when nothing matched, which can still
    # be assigned as a column, whereas ``apply(axis=1)`` on an empty frame
    # does not return a Series.
    selected["prompted_text"] = [
        author_tokens[name] + "<|REVIEW_START|> " + text
        for name, text in zip(selected["author"], selected["content"])
    ]
    return selected

# Build the prompt-formatted frame once; the training function filters it per author.
df = load_and_prepare_data(path=DATA_PATH, author_tokens=author_tokens)
print(f"Loaded {len(df)} examples from {DATA_PATH}")


Loaded 5054 examples from ../data/processed/preprocessed_data.csv


In [None]:
# Fine-Tuning Function (per author)
def train_author_model(author, df, tokenizer, device):
    """Fine-tune a fresh GPT-2 on one author's reviews and save the result.

    Args:
        author: Author name; compared against the values of ``df["author"]``.
        df: DataFrame with ``author`` and ``prompted_text`` columns.
        tokenizer: Tokenizer used to encode the prompts. Its length sets the
            model's embedding size and it is saved next to the model.
        device: Device the model is moved to before the Trainer is built.

    Returns:
        The output directory that holds the saved model and tokenizer.

    Raises:
        ValueError: If ``df`` contains no rows for ``author``.
    """
    # Filter for this author
    df_a = df[df["author"] == author].reset_index(drop=True)
    if df_a.empty:
        # Fail early with a readable message instead of an opaque error from
        # deep inside dataset construction or the training loop.
        raise ValueError(f"No training examples found for author {author!r}")
    # Split train/val: 90% sampled with a fixed seed, the remainder is held out.
    train = df_a.sample(frac=0.9, random_state=42)
    val = df_a.drop(train.index)
    # preserve_index=False: the sampled frames have a shuffled pandas index that
    # would otherwise be carried into the datasets as an extra index column.
    ds = DatasetDict({
        "train": Dataset.from_pandas(train[["prompted_text"]], preserve_index=False),
        "validation": Dataset.from_pandas(val[["prompted_text"]], preserve_index=False)
    })
    # Tokenize: every example is truncated/padded to exactly MAX_LENGTH tokens.
    ds = ds.map(
        lambda b: tokenizer(
            b["prompted_text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH
        ),
        batched=True,
        remove_columns=["prompted_text"]
    )
    # Initialize model
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    # The tokenizer was extended with special tokens, so the embedding matrix
    # has to be resized to the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    model.to(device)
    # Trainer setup
    # NOTE(review): report_to is not set, so the Trainer uses its default
    # reporting integrations; the saved output of this notebook shows a wandb
    # login prompt at this point. Pass report_to="none" if that is unwanted.
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"{OUTPUT_DIR_BASE}/{author.replace(' ','_')}",
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            # Keep evaluation batches the same size as training batches.
            per_device_eval_batch_size=BATCH_SIZE,
            save_strategy="epoch",
            logging_steps=100,
            learning_rate=1e-5,
            weight_decay=0.01,
            save_total_limit=1,
            fp16=torch.cuda.is_available()
        ),
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    )
    # Train & save
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(trainer.args.output_dir)
    # The held-out split was previously passed to the Trainer but never
    # evaluated. Score it once here, after saving, so that a failure during
    # evaluation cannot cost the trained weights.
    if len(ds["validation"]) > 0:
        eval_loss = trainer.evaluate().get("eval_loss")
        if eval_loss is not None:
            print(f"Validation loss for {author}: {eval_loss:.4f}")
    return trainer.args.output_dir

In [None]:
# Initialize tracking dictionaries
# 'models' maps author -> details of a successful run,
# 'failures' collects (author, error message) pairs.
training_stats = {
    'start_time': datetime.now(),
    'models': {},
    'failures': []
}


print(f"Starting fine-tuning at {training_stats['start_time'].strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Device: {device}")
print(f"Training {len(AUTHORS)} authors, {NUM_EPOCHS} epochs each")

for idx, author in enumerate(AUTHORS, 1):
    print(f"\n{'='*50}")
    print(f"Author {idx}/{len(AUTHORS)}: {author}")

    start = time.time()
    try:
        # Train model
        model_dir = train_author_model(author, df, tokenizer, device)
    except Exception as e:
        # Deliberately broad: one author's failure is recorded and the loop
        # moves on to the remaining authors.
        print(f"✗ Failed: {str(e)}")
        training_stats['failures'].append((author, str(e)))
    else:
        # Record success; measure the duration once so the stored and the
        # printed values agree.
        elapsed = time.time() - start
        training_stats['models'][author] = {
            'path': model_dir,
            'duration': elapsed,
            'completed': datetime.now()
        }

        print(f"✓ Completed in {elapsed/60:.1f} minutes")
    finally:
        # Clear GPU memory if available. This lives in `finally` so it also
        # runs after a failed author; previously `continue` in the except
        # branch skipped the cleanup.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Print summary
print("\nTraining Summary")
print(f"{'='*50}")
print(f"Total time: {(datetime.now() - training_stats['start_time']).total_seconds()/3600:.1f} hours")
print(f"Successful: {len(training_stats['models'])}/{len(AUTHORS)}")
print(f"Failed: {len(training_stats['failures'])}")

if training_stats['failures']:
    print("\nFailures:")
    for author, error in training_stats['failures']:
        print(f"- {author}: {error}")

Starting fine-tuning at 2025-05-15 15:34:09
Device: cuda
Training 2 authors, 5 epochs each

Author 1/2: marc masters


Map:   0%|          | 0/281 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtulgatemel[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.0843
200,3.9583
300,3.881
400,3.8351
500,3.7756
600,3.7672
700,3.7587


✓ Completed in 3.7 minutes

Author 2/2: jayson greene


Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Step,Training Loss
100,4.1242
200,4.0051
300,3.9419
400,3.899
500,3.8613
600,3.849


✓ Completed in 2.9 minutes

Training Summary
Total time: 0.1 hours
Successful: 2/2
Failed: 0
