In [1]:
# Imports
import os
import sys
import torch
import wandb
import random
import argparse
import torch.nn as nn
import bittensor as bt
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

# Pull in training utils.
import utils

has no attribute 'buffer'
handler: 'OutStream' object has no attribute 'reconfigure'
  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def parse_arguments():
    """Declare the notebook's command-line options and build the run config.

    The script-specific flags are registered first, in the order listed
    below, followed by the arguments that the bittensor subtensor, wallet,
    axon and logging components add for themselves.

    Returns:
        The object produced by ``bt.config`` for the populated parser.
    """
    parser = argparse.ArgumentParser()

    # One (flag, add_argument kwargs) entry per script-specific option.
    option_specs = (
        ('--lr', dict(type=float, default=5e-5, help='Training learning rate.')),
        ('--bs', dict(type=int, default=1, help='Training batch size')),
        ('--sl', dict(type=int, default=512, help='Training sequence length')),
        ('--n_head', dict(type=int, default=12, help='Model number of attention heads')),
        ('--n_layer', dict(type=int, default=12, help='Number of gpt2 model layers')),
        ('--local', dict(action="store_true", default=False, help='Turn on local training.')),
        ('--wandb', dict(action="store_true", default=False, help='Turn on wandb')),
        ('--max_k', dict(type=int, default=1, help='Max number of gradients to merge.')),
        ('--max_steps', dict(type=int, default=50000, help='Max training steps.')),
        ('--steps_per_log', dict(type=int, default=1, help='Number of steps per log.')),
        ('--steps_per_sync', dict(type=int, default=10, help='Number of steps per chain sync.')),
        ('--num_warmup', dict(type=int, default=2000, help='Scheduler warm up steps.')),
        ('--accs_per_step', dict(type=int, default=3, help='Number of training accumulation steps.')),
        ('--netuid', dict(type=int, default=97, help="The chain subnet uid.")),
        ('--chain_endpoint', dict(type=str, default="wss://test.finney.opentensor.ai", help="The chain endpoint to connect with.")),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    # Let each bittensor component contribute its own arguments.
    for component in (bt.subtensor, bt.wallet, bt.axon, bt.logging):
        component.add_args(parser)

    return bt.config(parser)

# Build the run configuration once and echo it so the values in effect are
# recorded alongside the cell.
config = parse_arguments()
print(config)


__is_set: {}
accs_per_step: 3
axon:
  external_ip: null
  external_port: null
  ip: '[::]'
  max_workers: 10
  port: 8091
bs: 1
chain_endpoint: wss://test.finney.opentensor.ai
config: null
local: false
logging:
  debug: false
  logging_dir: ~/.bittensor/miners
  record_log: false
  trace: false
lr: 5.0e-05
max_k: 1
max_steps: 50000
n_head: 12
n_layer: 12
netuid: 97
num_warmup: 2000
sl: 512
steps_per_log: 1
steps_per_sync: 10
strict: false
subtensor:
  _mock: false
  chain_endpoint: wss://entrypoint-finney.opentensor.ai:443
  network: finney
wallet:
  hotkey: default
  name: default
  path: ~/.bittensor/wallets/
wandb: false



In [4]:
# Setup model and tokenizer
def setup_model_and_tokenizer():
    """Create the GPT-2 model, its tokenizer and the device they run on.

    The model is constructed from a ``GPT2Config`` (layer and head counts
    taken from the global ``config``) rather than loaded with
    ``from_pretrained``; only the tokenizer is loaded from the ``'gpt2'``
    checkpoint.

    Returns:
        tuple: ``(model, tokenizer, device)`` with the model already moved to
        ``device`` and switched to training mode.
    """
    # Use the end-of-sequence token as the padding token.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Prefer CUDA when it is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    architecture = GPT2Config(n_layer=config.n_layer, n_head=config.n_head)
    model = GPT2LMHeadModel(architecture).to(device)
    model.train()

    return model, tokenizer, device

# Instantiate the model, tokenizer and device shared by the cells below.
model, tokenizer, device = setup_model_and_tokenizer()

In [5]:
# Print each parameter's qualified name next to its shape.
for name, param in model.named_parameters():
    print(name, param.shape)


transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [4]:
# Load dataloader
def load_dataloader():
    """Build a DataLoader over the streamed, shuffled, tokenized dataset.

    Reads the ``train`` split of ``togethercomputer/RedPajama-Data-1T`` in
    streaming mode, shuffles it through a buffer of ``config.bs * 4`` items
    (fixed seed 42), tokenizes the ``"text"`` field in batches and serves the
    result in batches of ``config.bs``. Uses the global ``tokenizer`` and
    ``config``.

    Returns:
        DataLoader: loader over the tokenized stream.
    """
    def _encode(batch):
        # Truncate/pad every text to exactly ``config.sl`` tokens.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=config.sl,
            return_tensors="pt",
        )

    stream = load_dataset(
        "togethercomputer/RedPajama-Data-1T", 'default', split='train', streaming=True
    )
    shuffled = stream.shuffle(buffer_size=config.bs * 4, seed=42)
    return DataLoader(shuffled.map(_encode, batched=True), batch_size=config.bs)

# Create the streaming dataloader consumed by the training loop.
dataloader = load_dataloader()

In [5]:
# Build the optimizer and its learning-rate schedule: warm-up for
# config.num_warmup steps out of config.max_steps total steps.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.num_warmup,
    num_training_steps=config.max_steps,
)


In [6]:

# training loop
# NOTE: the two `break`s below stop after the first successful backward pass,
# so this cell only populates `param.grad` for the gradient cells that follow.
step = 0
accumulation_counter = 0
for epoch in range(3):
    print(f'Epoch {epoch + 1}/{3}')
    for batch in dataloader:

        # Move the batch to the model's device once and reuse the tensors.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # The pad token is the EOS token, so padding cannot be told apart by
        # token id. Use the attention mask instead and set padded positions
        # to -100, the label value the loss ignores, so that padding does not
        # contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The language-model loss compares against labels shifted by one
        # position, so a batch with no real token after position 0 has no
        # target at all and would produce a NaN loss. Skip such batches.
        if not (labels[:, 1:] != -100).any():
            continue

        # Forward pass.
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        # Backward pass (loss scaled for gradient accumulation).
        loss = outputs.loss / config.accs_per_step
        loss.backward()

        break
    break


Epoch 1/3


In [7]:
# Compress every available gradient, keyed by parameter name.
compressed_grads, compressed_sizes = {}, {}
for name, param in model.named_parameters():
    # Parameters without a gradient have nothing to compress.
    if param.grad is None:
        continue

    # Work on a detached CPU copy so the live gradient is left untouched.
    element = param.grad.detach().clone().cpu()

    # compress() returns a (norm, sign_xi_array) pair; presumably a scale and
    # the compressed payload. TODO confirm against utils.compressor.
    norm, sign_xi_array = utils.compressor.compress(element)
    compressed_sizes[name] = bt.tensor(tensor=norm)
    compressed_grads[name] = bt.tensor(tensor=sign_xi_array)

In [16]:
 grads = {}
for name, compressed_grad in compressed_grads.items():
    compressed_size = compressed_sizes[name]
    grads[name] = compressor.decompress( sign_xi_array = compressed_grad.tensor(), norm = compressed_size.tensor() )
    

[]

In [None]:
# Debug aid: `sign_xi_array` is whatever the last iteration of the compression
# loop left behind, so that cell must have been run first.
type(sign_xi_array)

In [None]:

import torch
import torch.quantization

# Define a float tensor with values in [0, 1)
x = torch.rand(3, 3)

# Quantize the tensor to unsigned 8-bit integers.
# `quantize_per_tensor` is a top-level torch function, not a member of the
# `torch.quantization` module.
# NOTE: with scale=1.0 and zero_point=0 every value is rounded to the nearest
# integer, so inputs in [0, 1) collapse to 0 or 1; a smaller scale
# (e.g. 1 / 255) is needed to keep more precision.
q_x = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)

# You can also dequantize it back to float
dq_x = q_x.dequantize()
