# Review Generation For Each Author


In [None]:
import gc
import html
from pathlib import Path

import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
# Authors to generate reviews for. The ORDER matters: each author's control
# token below is derived from its position in this list.
AUTHORS = [
    "joe tangari", "stephen m. deusner", "ian cohen", "brian howe",
    "mark richardson", "stuart berman", "marc hogan",
    "nate patrin", "marc masters", "jayson greene"
]

# Maps author name -> "<|AUTHOR_i|>" where i is the author's index in AUTHORS.
author_tokens = {name: f"<|AUTHOR_{idx}|>" for idx, name in enumerate(AUTHORS)}

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
SEED = 42          # fixed seed so sampled text is repeatable across re-runs
MAX_LENGTH = 512   # passed as `max_length` to the generation call
NUM_SAMPLES = 20   # number of prompts (and therefore reviews) per author

# Generation below samples stochastically; seed torch's RNG once, up front,
# so Restart Kernel -> Run All produces the same draws on the same setup.
torch.manual_seed(SEED)

In [None]:
# Accumulates one {"author", "content"} record per generated review.
all_records = []

for author in AUTHORS:
    # One checkpoint per author; the repo id is the author name with spaces
    # replaced by hyphens.
    repo = f"Tughi/gpt2-{author.replace(' ','-')}"
    print(f"Loading checkpoint   -> {repo}")

    tok = GPT2Tokenizer.from_pretrained(repo)
    # Reuse EOS as the padding token (presumably the checkpoint's tokenizer
    # defines none, and batched generation needs one - TODO confirm).
    tok.pad_token = tok.eos_token
    model = GPT2LMHeadModel.from_pretrained(repo).to(device)

    gen_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tok,
        device=0 if device.type == "cuda" else -1,
    )

    # Build NUM_SAMPLES identical prompts and send them as ONE pipeline call;
    # the sampling settings below are what make the outputs differ.
    prefix = author_tokens[author] + "<|REVIEW_START|> "
    prompts = [prefix] * NUM_SAMPLES

    outputs = gen_pipe(
        prompts,
        max_length=MAX_LENGTH,
        truncation=True,
        do_sample=True,
        top_k=50,
        top_p=0.92,
        temperature=0.7,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        batch_size=8
    )

    # Unpack results: keep only the text after the last review-start marker,
    # then turn HTML entities (e.g. "&amp;") back into plain characters.
    for out in outputs:
        review = out[0]["generated_text"].split("<|REVIEW_START|>")[-1].strip()
        review = html.unescape(review)
        all_records.append({"author": author, "content": review})

    # Free VRAM before the next author. The pipeline keeps its own reference
    # to the model, so BOTH names must be dropped; deleting only `model`
    # leaves the weights alive and empty_cache() has nothing to release.
    if device.type == "cuda":
        del gen_pipe, model
        gc.collect()
        torch.cuda.empty_cache()

generated_df = pd.DataFrame(all_records)
display(generated_df.head())


Loading checkpoint   -> Tughi/gpt2-joe-tangari


Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-stephen-m.-deusner


Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-ian-cohen


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-brian-howe


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-mark-richardson


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-stuart-berman


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-marc-hogan


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-nate-patrin


tokenizer_config.json:   0%|          | 0.00/2.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-marc-masters


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Loading checkpoint   -> Tughi/gpt2-jayson-greene


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cuda:0


Unnamed: 0,author,content
0,joe tangari,A lot of the music I listen to on my phone is ...
1,joe tangari,"We were a few years into our first album, and ..."
2,joe tangari,"A lot of people think I'm a moron, but not rea..."
3,joe tangari,This album is the second in a series that's be...
4,joe tangari,It's been a while since I've written about the...


In [None]:
# Rebuild the frame from the accumulated records so this cell can be re-run
# on its own after `all_records` changes.
generated_df = pd.DataFrame(data=all_records)

In [None]:
# Persist the generated reviews. DataFrame.to_csv does not create missing
# parent directories, so make sure the target folder exists first; otherwise
# the save fails after the (expensive) generation run has already finished.
output_path = Path('../data/generated/generated_reviews.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
generated_df.to_csv(output_path, index=False)