In [390]:
import os
import random
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm.auto import tqdm

# extract text and create dataset

In [391]:
def is_comment(line):
    # Define a function to check if a line is a comment
    line = line.strip()
    if line.startswith('#') or line.startswith("'''") or line.startswith('"""'):
        return True
    return False

def extract_non_comments(source_directory, target_directory):
    # Process all .py files in the specified directory and subdirectories
    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                target_file_path = os.path.join(target_directory, file.replace('.py', '.txt'))
                with open(file_path, 'r') as source_file, open(target_file_path, 'w') as target_file:
                    non_comments = []
                    comment_block = False
                    
                    for line in source_file:
                        # Check for the start or end of a comment block
                        # if "r'''" in line and "'''" in line or 'r"""' in line and '"""' in line:
                        #     continue
                        # if "'''" in line or '"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        # if "r'''" in line or 'r"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        if line.count("'''") == 1 or line.count('"""') == 1:
                            comment_block = not comment_block
                            continue
                        # If it's not a comment or part of a comment block, save it
                        if not is_comment(line) and not comment_block:
                            non_comments.append(line)
                        # Write non-comment lines to a target .txt file
                    target_file.writelines(non_comments)

# # Define the path to the local repository (change this to the actual path of your local repo)
# # source_directory = '/path/to/your/local/pytorch/repo'
source_directory = '../examples/'
# # target_directory = '/path/to/your/output/directory'
target_directory = './dataset/raw/'


# # Create the target directory if it doesn't exist
# os.makedirs(target_directory, exist_ok=True)

# # Call the function to start extracting non-comment lines
# extract_non_comments(source_directory, target_directory)


In [392]:

def combine_files(directory, output_file, sample=False, num_files_to_sample=100, seed=111, start_token="<START>", end_token="<END>"):
    """
    Combine content from a specified number of text files in a directory into one file, 
    with start and end tokens between contents from each file.

    :param directory: Path to the directory containing text files.
    :param output_file: Name of the output file to create.
    :param num_files_to_sample: Number of files to sample and combine.
    :param start_token: The start token to be added before each file's content.
    :param end_token: The end token to be added after each file's content.
    """
    
    
    # List all text files in the directory
    all_files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    files = all_files

    if sample:
        # Sample the specified number of files
        random.seed(seed)
        files = random.sample(all_files, min(num_files_to_sample, len(all_files)))

    # Start combining the sampled files
    with open(output_file, 'w') as outfile:
        for filename in files:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as infile:
                # outfile.write(start_token + '\n')
                content = infile.read()
                content_with_tabs = content.replace('    ', '\t')
                outfile.write(content_with_tabs + '\n')
                # outfile.write(end_token + '\n\n')

    print(f"Combined file created as '{output_file}' with contents from {len(files)} files.")
  
# Example usage
combine_files('dataset/raw/', 'sample_scripts.txt')


Combined file created as 'sample_scripts.txt' with contents from 57 files.


In [393]:
# read it in to inspect it
# data_file = 'sample_scripts.txt'
# data_file = 'dataset/adamw.txt'
with open('sample_scripts.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [394]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  118465


In [395]:
print(text[:1000])

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datautils import MyTrainDataset

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os


def ddp_setup():
	init_process_group(backend="nccl")
	torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

class Trainer:
	def __init__(
		self,
		model: torch.nn.Module,
		train_data: DataLoader,
		optimizer: torch.optim.Optimizer,
		save_every: int,
		snapshot_path: str,
	) -> None:
		self.gpu_id = int(os.environ["LOCAL_RANK"])
		self.model = model.to(self.gpu_id)
		self.train_data = train_data
		self.optimizer = optimizer
		self.save_every = save_every
		self.epochs_run = 0
		self.snapshot_path = snapshot_path
		if os.path.exists(snapshot_path):
			print("Loading snapshot")
			self._load_snapshot(snapshot_path)

		se

In [396]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
''.join(chars)

'\t\n !"#$%\'()*+,-./0123456789:<=>?@ABCDEFGHIKLMNOPRSTUVWXYZ[\\]_`abcdefghijklmnopqrstuvwxyz{|}'

In [397]:
print(vocab_size)

91


# encoding and decoding for chars

In [398]:
# create a mapping from characters to integers
ch_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_ch = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ch_to_idx[ch] for ch in s] # encoder: take a string, output a list of mapping idx
decode = lambda l: ''.join([idx_to_ch[idx] for idx in l]) # decoder: take a list of index, output a string

print(encode("import torch"))
print(decode(encode("import torch")))

[70, 74, 77, 76, 79, 81, 2, 81, 76, 79, 64, 69]
import torch


In [399]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([118465]) torch.int64
tensor([70, 74, 77, 76, 79, 81,  2, 81, 76, 79, 64, 69,  1, 70, 74, 77, 76, 79,
        81,  2, 81, 76, 79, 64, 69, 15, 75, 75, 15, 67, 82, 75, 64, 81, 70, 76,
        75, 62, 73,  2, 62, 80,  2, 38,  1, 67, 79, 76, 74,  2, 81, 76, 79, 64,
        69, 15, 82, 81, 70, 73, 80, 15, 65, 62, 81, 62,  2, 70, 74, 77, 76, 79,
        81,  2, 36, 62, 81, 62, 80, 66, 81, 13,  2, 36, 62, 81, 62, 43, 76, 62,
        65, 66, 79,  1, 67, 79, 76, 74,  2, 65])


# train dev split

In [400]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [401]:
context_length = 8
x = train_data[:context_length]
y = train_data[1:context_length+1]
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([70]) the target: 74
when input is tensor([70, 74]) the target: 77
when input is tensor([70, 74, 77]) the target: 76
when input is tensor([70, 74, 77, 76]) the target: 79
when input is tensor([70, 74, 77, 76, 79]) the target: 81
when input is tensor([70, 74, 77, 76, 79, 81]) the target: 2
when input is tensor([70, 74, 77, 76, 79, 81,  2]) the target: 81
when input is tensor([70, 74, 77, 76, 79, 81,  2, 81]) the target: 76


# config

In [402]:
batch_size = 64
context_length = 256
max_iters = 5000
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_interval = 100
eval_iters = 200

num_heads = 6
emb_dim = 64 * num_heads
num_layers = 6
dropout = 0.2

# data loader

In [403]:
torch.manual_seed(111)

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    start_idxs = torch.randint(len(data) - context_length, (batch_size,))
    context_idxs = torch.stack([data[start_idx : start_idx+context_length] for start_idx in start_idxs])
    target_idxs = torch.stack([data[start_idx+1 : start_idx+context_length+1] for start_idx in start_idxs])
    
    context_idxs, target_idxs = context_idxs.to(device), target_idxs.to(device)
    
    return context_idxs, target_idxs

context_idxs, target_idxs = get_batch('train')
print('inputs:')
print(context_idxs.shape)
print(context_idxs)
print('targets:')
print(target_idxs.shape)
print(target_idxs)

print('----')

for b in range(4): # batch dimension
    for step in range(8): # context length dimension
        context = context_idxs[b, :step+1]
        target = target_idxs[b,step]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([64, 256])
tensor([[84, 76, 79,  ..., 75, 76, 74],
        [80, 66, 80,  ..., 80, 66, 73],
        [69, 15, 75,  ..., 79, 75,  2],
        ...,
        [10,  1,  1,  ..., 86, 74, 63],
        [13,  2, 45,  ..., 80, 86, 74],
        [81,  2, 75,  ..., 68,  2, 62]], device='cuda:0')
targets:
torch.Size([64, 256])
tensor([[76, 79, 65,  ..., 76, 74, 70],
        [66, 80, 60,  ..., 66, 73, 67],
        [15, 75, 75,  ..., 75,  2, 80],
        ...,
        [ 1,  1, 70,  ..., 74, 63, 76],
        [ 2, 45, 76,  ..., 86, 74, 63],
        [ 2, 75, 15,  ...,  2, 62, 80]], device='cuda:0')
----
when input is [84] the target: 76
when input is [84, 76] the target: 79
when input is [84, 76, 79] the target: 65
when input is [84, 76, 79, 65] the target: 60
when input is [84, 76, 79, 65, 60] the target: 81
when input is [84, 76, 79, 65, 60, 81] the target: 66
when input is [84, 76, 79, 65, 60, 81, 66] the target: 75
when input is [84, 76, 79, 65, 60, 81, 66, 75] the target: 80
when inp

# model

In [404]:
torch.arange(4).shape

torch.Size([4])

In [405]:
torch.manual_seed(111)


class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # works as a look up table for the probability of the next char for each current char
        self.token_embedding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(context_length, emb_dim)
        self.blocks = nn.Sequential(*[Block(emb_dim, num_heads=num_heads) for _ in range(num_layers)])
        self.ln_final = nn.LayerNorm(emb_dim) # the final layer norm before output
        self.lm_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, context_idxs, target_idxs=None):
        B, T = context_idxs.shape # num of batches; num of total steps in context_length

        # context_idxs, target_idxs are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(context_idxs) # (B, T, emb_dim)
        position_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, emb_dim)
        x = token_emb + position_emb # (B, T, emb_dim)
        x = self.blocks(x) # (B, T, head_size)
        logits = self.lm_head(x) # (B, T, vocab_size), now the feature_dim is vocab_size again
        
        if target_idxs is None:
            loss = None
        else:
            B, T, D = logits.shape # num of batches; num of total steps in context_length; num of feature dimension
            logits = logits.view(B * T, D) # now D == vocab_size == number of classes
            target_idxs = target_idxs.view(B * T)
            loss = F.cross_entropy(logits, target_idxs)

        return logits, loss
    
    def generate(self, context_idxs, max_new_tokens):
        for _ in range(max_new_tokens):
            # trim input
            input_idxs = context_idxs[:, -context_length:]
            # forward
            logits, loss = self(input_idxs)
            # focus only on the last time step
            logits = logits[:, -1, :] # (B, D) tensor for the last step
            probs = F.softmax(logits, dim=-1) # predicted_label (B, D)
            
            # sample from the distribution
            # torch.multinomial: Returns a tensor where each row contains num_samples indices 
            # sampled from the multinomial probability distribution located in the corresponding row of tensor input.
            pred_idxs = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            context_idxs = torch.cat((context_idxs, pred_idxs), dim=1) # (B, T+1)
        return context_idxs


class Head(nn.Module):
    '''
    self-attention with only one head
    '''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(emb_dim, head_size, bias=False)
        self.query = nn.Linear(emb_dim, head_size, bias=False)
        self.value = nn.Linear(emb_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, D = x.shape
        k = self.key(x)
        q = self.query(x)
        
        # attention-score
        weight = q @ k.transpose(-2,-1) * D**-0.5 # (B, T, D) @ (B, D, T) ---> (B, T, T)
        # D**-0.5: to relief the influence of large value makes the vector after softmax looks like one-hot vector.

        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T), the upper-right triangle will be -inf
        weight = F.softmax(weight, dim=-1) # (B, T, T)
        weight = self.dropout(weight)

        # weighted-aggregation of values based on the attention-score
        v = self.value(x) # (B, T, D)
        out = weight @ v # (B, T, T) @ (B, T, D) --------> (B, T, D)

        return out

class MultiHeadAttention(nn.Module):
    '''
    multiple heads fo self-attention in parallel
    '''
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.projection(out)
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    '''
    a simple linear layer with activation in decoder, + projection
    '''
    def __init__(self, emb_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim), # the inner dimension is 4 * D, based on the original paper
            nn.ReLU(),
            nn.Linear(4 * emb_dim, emb_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    '''
    a decoder block without cross-attentioin part
    '''
    def __init__(self, emb_dim, num_heads):
        super().__init__()
        self.head_size = emb_dim // num_heads
        self.attention = MultiHeadAttention(num_heads, self.head_size)
        self.ffwd = FeedForward(emb_dim)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

model = GPTLanguageModel().to(device)
logits, loss = model(context_idxs, target_idxs)
print(logits.shape)
print(loss)

# decode 5 batches of data, the initial start char is 'i'
# [decode(model.generate(context_idxs=torch.full((5, 1), 75, dtype=torch.long).to(device), max_new_tokens=100)[i].tolist()) for i in range(5)]

torch.Size([16384, 91])
tensor(4.9763, device='cuda:0', grad_fn=<NllLossBackward0>)


In [406]:
weight_test = torch.tril(torch.ones(10,10))
print(weight_test)
weight_test = weight_test.masked_fill(weight_test == 0, float('-inf'))
print(weight_test)
weight_test = F.softmax(weight_test, dim=-1)
print(weight_test)
v_test = torch.rand((10,3))
print(v_test)
out = weight_test @ v_test
print(out)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., -inf, -inf],
        [1.

# optimizer

In [407]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# training

In [408]:
@torch.no_grad()
def estimate_loss():
    res = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, y = get_batch(split)
            logits, loss = model(X, y)
            losses[k] = loss.item()
        res[split] = losses.mean()
    model.train()
    return res

In [409]:
for iter in tqdm(range(max_iters)): # increase number of steps for good results... 
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    # sample a batch of data
    context_idxs, target_idxs = get_batch('train')

    # evaluate the loss
    logits, loss = model(context_idxs, target_idxs)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


  0%|          | 0/5000 [00:00<?, ?it/s]

step 0: train loss 4.9563, val loss 4.9562
step 100: train loss 2.6412, val loss 2.7066
step 200: train loss 2.4702, val loss 2.5987
step 300: train loss 2.1111, val loss 2.3333
step 400: train loss 1.6781, val loss 1.9988
step 500: train loss 1.3750, val loss 1.8004
step 600: train loss 1.1585, val loss 1.6911
step 700: train loss 0.9687, val loss 1.5995
step 800: train loss 0.8173, val loss 1.5766
step 900: train loss 0.6938, val loss 1.5487
step 1000: train loss 0.5920, val loss 1.5371
step 1100: train loss 0.4870, val loss 1.5436
step 1200: train loss 0.4087, val loss 1.5792
step 1300: train loss 0.3409, val loss 1.6299
step 1400: train loss 0.2868, val loss 1.6485
step 1500: train loss 0.2413, val loss 1.7017
step 1600: train loss 0.2089, val loss 1.7568
step 1700: train loss 0.1775, val loss 1.8216
step 1800: train loss 0.1564, val loss 1.8644
step 1900: train loss 0.1376, val loss 1.9169
step 2000: train loss 0.1256, val loss 1.9623
step 2100: train loss 0.1172, val loss 2.0124


KeyboardInterrupt: 

In [414]:
test_string = """import torch"""
print(decode(model.generate(context_idxs = torch.tensor(encode(test_string)).view(1, len(test_string)).to(device), max_new_tokens=10000)[0].tolist()))

import torch
from PIL import init_transform=optim.LOptimizer,
		rank,
	get_local_loss = int(os.environ[0], dowtaset=dtor_wesource)
	model = None
		save_model_and_snapshot()
	interpreter.set_input_names(out_node.denatasetsor)

	graph.lint(f"Snapshor TParallel epoce"

	one = Sequeen(
		rank = 0.to(
		args.save_dir, args.num_asee_args)
	else:
		print("So EPOR: RUNX RU is not args...")
		return n

	fnode.rander()
	train_args = model.seed()
	param_rrefs.append(RResized(args)

	()
	self, g = haph.owner()
		self.rewards = []
	for for ward(path, Dataset) in self.num_thexemore_batch():
		target = target.to(self.local_rank0(object)
		net_model_path.set_run_transer()


import torch
from torch.distributed.tensor.detal()
	print(f"{data.load().sharded_drmo_("emodel", "replarelelu")


rank_log(_rank, logger, memoter_method, default=None,
			checkpoint_wrapper=int_wr,
		)
	full_osd = None

	if rank == 0:
		full_osd = torch.load(optim_state)
			if bfor k.dist_cp.open()

			Put = list(drint.tim_heckpoin

In [None]:
torch.tensor(encode('import')).view(1, 6)