In [2]:
import os
import subprocess
import torch
from transformers import GPT2Tokenizer
import sys
nanoGPT_dir = "C:/Users/Zheng/OneDrive - UW-Madison/PhD project/Coursework/CS839/hw1/nanoGPT-master"
sys.path.append(nanoGPT_dir)
os.chdir(nanoGPT_dir)
from model import GPTConfig, GPT

In [3]:
# Prepare the path for the script you want to run
prepare_script = os.path.join("data", "shakespeare_char", "prepare.py")

# Run the script using os.system
os.system(f'python "{prepare_script}"')

# Run scripts
train_script = os.path.join("train.py")
config_path = os.path.join("config", "train_shakespeare_char.py")
command = f'python "{train_script}" "{config_path}" --device=cpu --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0'
result = subprocess.run(command, shell=True, text=True, capture_output=True)
print(result.stdout)

Overriding config with config\train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of 

In [4]:
# Heuristic ruld: number of 's

In [5]:
# count number of 's
def count_s(text):
    count = text.count('\'s')
    return count

In [7]:
# Generate text by sample.py

def generate_text(seed):
    sample_script = os.path.join("sample.py")
    out_dir = "out-shakespeare-char"
    command = f'python "{sample_script}" --out_dir="{out_dir}" --device=cpu --seed={seed}'
    result = subprocess.run(command, shell=True, text=True, capture_output=True)
    output = result.stdout
    
    # remove the first line
    output = output.split('\n\n', 1)[1]
    
    return output

# five tries

results = {'text': [], 'count': []}

for i in range(20):
    output = generate_text(i)

    # count number of 's
    count = count_s(output)

    results['text'].append(output)
    results['count'].append(count)

In [8]:
import pandas as pd

df = pd.DataFrame(results)

# keep some outputs as evaluation
df_eval = df.sample(5, random_state=1)
df = df.drop(df_eval.index)

# generate cartesian product
df['key'] = 1
df2 = df.copy()
df2.columns = ['text2', 'count2', 'key']
df = pd.merge(df, df2, on='key')

df['favored text'] = df.apply(lambda x: x['text'] if x['count'] > x['count2'] else x['text2'], axis=1)
df['negative text'] = df.apply(lambda x: x['text'] if x['count'] < x['count2'] else x['text2'], axis=1)

df = df[['favored text', 'negative text']].drop_duplicates()
df

Unnamed: 0,favored text,negative text
0,Ely grater:\nI have thall grown to shalm caul ...,Ely grater:\nI have thall grown to shalm caul ...
1,I'll deast in Romeence\nTo men that decede pro...,I'll deast in Romeence\nTo men that decede pro...
2,Go your fain take shuld for ove provess than l...,Ely grater:\nI have thall grown to shalm caul ...
3,Ely grater:\nI have thall grown to shalm caul ...,"RAUS:\nHet a dought the wome, in behis coman a..."
4,Ely grater:\nI have thall grown to shalm caul ...,him not be proted for all wremal the aged of l...
...,...,...
193,"Dece to the pligo; I'll my pray, ganters.\n\nI...",That one componters so arce that to spares the...
194,Foll for death with dand fright your cans-to h...,That one componters so arce that to spares the...
208,"Dece to the pligo; I'll my pray, ganters.\n\nI...","Dece to the pligo; I'll my pray, ganters.\n\nI..."
209,Foll for death with dand fright your cans-to h...,"Dece to the pligo; I'll my pray, ganters.\n\nI..."


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

# Define character-to-index mapping
char_to_idx = {ch: idx for idx, ch in enumerate("abcdefghijklmnopqrstuvwxyz ")}
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}
vocab_size = len(char_to_idx)
max_length = 500  # Define max sequence length

# Function to encode text as character indices
def encode_text(text, char_to_idx, max_length):
    indices = [char_to_idx.get(char, 0) for char in text.lower()[:max_length]]
    return indices + [0] * (max_length - len(indices))

# decode text
def decode_text(indices, idx_to_char):
    text = ''.join([idx_to_char[idx] for idx in indices])
    return text

# Custom Dataset
class RewardDataset(Dataset):
    def __init__(self, dataframe, char_to_idx, max_length):
        self.data = dataframe
        self.char_to_idx = char_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        favored_text = self.data.iloc[idx]['favored text']
        negative_text = self.data.iloc[idx]['negative text']
        
        # Encode the texts
        favored_encoded = torch.tensor(encode_text(favored_text, self.char_to_idx, self.max_length))
        negative_encoded = torch.tensor(encode_text(negative_text, self.char_to_idx, self.max_length))
        
        return favored_encoded, negative_encoded, torch.tensor(1.0), torch.tensor(0.0)

# Define a simple character-based reward model
class CharRewardModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=32):
        super(CharRewardModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)  # hidden: [1, batch_size, 64]
        output = self.fc(hidden)
        return output.squeeze(0).squeeze(1)

# Instantiate the dataset and model
dataset = RewardDataset(dataframe=df, char_to_idx=char_to_idx, max_length=max_length)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
model = CharRewardModel(vocab_size=vocab_size)

# Training setup
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(7):
    for batch in dataloader:
        favored_text, negative_text, favored_label, negative_label = batch

        # Forward pass
        favored_reward = model(favored_text)
        negative_reward = model(negative_text)

        # Calculate loss
        loss_favored = criterion(favored_reward, favored_label)
        loss_negative = criterion(negative_reward, negative_label)
        loss = (loss_favored + loss_negative) / 2

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.6277613043785095
Epoch 2, Loss: 0.48914700746536255
Epoch 3, Loss: 0.396189421415329
Epoch 4, Loss: 0.5505608320236206
Epoch 5, Loss: 0.3692547082901001
Epoch 6, Loss: 0.4334373474121094
Epoch 7, Loss: 0.5982861518859863


In [10]:
df_eval = df_eval.rename(columns={'text': 'favored text', 'count': 'favored count'})
df_eval['negative text'] = ''

In [14]:
# eval
eval_dataset = RewardDataset(dataframe=df_eval, char_to_idx=char_to_idx, max_length=max_length)
eval_dataloader = DataLoader(eval_dataset, batch_size=1)

for batch in eval_dataloader:
    favored_text, negative_text, favored_label, negative_label = batch
    reward = model(favored_text)
    decoded_text = decode_text(favored_text[0].numpy(), idx_to_char)
    print(f"Reward: {reward.item()}, Label: {favored_label.item()}")
    print("Favored text:")
    print(decoded_text)
    print("\n")


Reward: 0.7995948195457458, Label: 1.0
Favored text:
aking etesaasira the not must mut here the him gooda and thy deathaaand pan a with and twicked on enteriesaaof your daess his mines all look bood i do a haveaathan from in of leent prove to heave andermanceato my loop that not most he way causa a poweraaavoleoaathat proviong i dighce affforeras be eve such willathat not in the purs proved need didiedsaathe me farl out my will with and his vill and our wallaadet the sigh lecousid the parkad thoughta i to that stansaaamucopsioaawels muta what youa


Reward: 0.5747926235198975, Label: 1.0
Favored text:
then good a forsing it have fouls that hereaawhe lorgom the the gall thana she long i what thatathing agguntaour thing levidy i with how curse pleasa you sheaveaaher he gooda she crow on know live some so coudsesleaaduke to me in lie the of the dought thy his the heave my theeaaand sperfery that wite from to that comfur fort sheathy the here he rid he some of you cious areaaabut sickingha

# RLHF

In [12]:
# additional layer based on text output and reward
# define a policy model

class PolicyRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PolicyRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        out, hidden = self.rnn(embedded, hidden)
        logits = self.fc(out)
        return logits, hidden

    def sample(self, seed, max_length):
        "Generates text sequence up to max_len tokens"
        generated_text = generate_text(seed)
        input_token = torch.tensor(encode_text(generated_text, char_to_idx, max_length))
        return generated_text, input_token


# reward function
def reward_function(text, model):
    encoded_text = torch.tensor(encode_text(text, char_to_idx, max_length)).unsqueeze(0)
    return model(encoded_text).item()

# loss function
def policy_loss(logits, actions, rewards):
    log_probs = torch.log_softmax(logits, dim=-1)
    actions = actions.unsqueeze(-1)
    action_log_probs = log_probs.gather(2, actions)
    return -(action_log_probs * rewards).mean()

# Instantiate the policy model
policy_model = PolicyRNN(vocab_size=vocab_size, embedding_dim=32, hidden_dim=64)
optimizer = optim.AdamW(policy_model.parameters(), lr=1e-3)

# Training loop
for epoch in range(5):
    seed = epoch
    max_length = 500
    generated_text, input_token = policy_model.sample(seed, max_length)
    sequence_tensor = torch.tensor(input_token).unsqueeze(0)
    
    # reward
    reward = reward_function(generated_text, model) # Use the reward model to calculate the reward
    reward_tensor = torch.tensor(reward).repeat(sequence_tensor.size(1))
    
    # Forward pass
    logits, _ = policy_model(sequence_tensor)
    
    # Calculate loss
    loss = policy_loss(logits, sequence_tensor, reward_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    

  sequence_tensor = torch.tensor(input_token).unsqueeze(0)


Epoch 1, Loss: 2.0523293018341064
Epoch 2, Loss: 2.088833808898926
Epoch 3, Loss: -2.4265503883361816
Epoch 4, Loss: 2.6110448837280273
Epoch 5, Loss: 7.018655776977539


In [16]:
# check
seed = 1029
max_length = 500
generated_text, input_token = policy_model.sample(seed, max_length)
# use model
results = policy_model(input_token)
results = results[0].argmax(dim=-1).numpy()
decoded_text = decode_text(results, idx_to_char)
print(decoded_text)

qoosqoshoooqsqssoosoooooooqsqosqsseosesooooqsqssseooossaooqooosqosoosssssqqsqssqssooossooooooaooooqsoooooqsooossoooooooooooqhssaasesosaossosooseoooossssaossoosoossosoosesoooossssoooooooossoooooaooosssaosssoossosoostoseseooooossqoqsoooosssssssaoooooactooqoastoooooooooosasooooossqoohsoosossssosstooqooooooaoooooqssosooooosoooosooohoosooaassooqosooosaooosqhsoosoqshoossooosssssssssqsssosqoooqssossosssqssssssooooooqsaaqqsqoasooossqsssooosooooscaseoossooooossssoooooosooossosooqoossssooossssssqsqssoooos
