In [6]:
import os
import random
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm.auto import tqdm

# extract text and create dataset

In [7]:
def is_comment(line):
    # Define a function to check if a line is a comment
    line = line.strip()
    if line.startswith('#') or line.startswith("'''") or line.startswith('"""'):
        return True
    return False

def extract_non_comments(source_directory, target_directory):
    # Process all .py files in the specified directory and subdirectories
    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                target_file_path = os.path.join(target_directory, file.replace('.py', '.txt'))
                with open(file_path, 'r') as source_file, open(target_file_path, 'w') as target_file:
                    non_comments = []
                    comment_block = False
                    
                    for line in source_file:
                        # Check for the start or end of a comment block
                        # if "r'''" in line and "'''" in line or 'r"""' in line and '"""' in line:
                        #     continue
                        # if "'''" in line or '"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        # if "r'''" in line or 'r"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        if line.count("'''") == 1 or line.count('"""') == 1:
                            comment_block = not comment_block
                            continue
                        # If it's not a comment or part of a comment block, save it
                        if not is_comment(line) and not comment_block:
                            non_comments.append(line)
                        # Write non-comment lines to a target .txt file
                    target_file.writelines(non_comments)

# # Define the path to the local repository (change this to the actual path of your local repo)
# # source_directory = '/path/to/your/local/pytorch/repo'
source_directory = '../examples/'
# # target_directory = '/path/to/your/output/directory'
target_directory = './dataset/raw/'


# # Create the target directory if it doesn't exist
# os.makedirs(target_directory, exist_ok=True)

# # Call the function to start extracting non-comment lines
# extract_non_comments(source_directory, target_directory)


In [8]:

def combine_files(directory, output_file, sample=False, num_files_to_sample=100, seed=111, start_token="<START>", end_token="<END>"):
    """
    Combine content from a specified number of text files in a directory into one file, 
    with start and end tokens between contents from each file.

    :param directory: Path to the directory containing text files.
    :param output_file: Name of the output file to create.
    :param num_files_to_sample: Number of files to sample and combine.
    :param start_token: The start token to be added before each file's content.
    :param end_token: The end token to be added after each file's content.
    """
    
    
    # List all text files in the directory
    all_files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    files = all_files

    if sample:
        # Sample the specified number of files
        random.seed(seed)
        files = random.sample(all_files, min(num_files_to_sample, len(all_files)))

    # Start combining the sampled files
    with open(output_file, 'w') as outfile:
        for filename in files:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as infile:
                # outfile.write(start_token + '\n')
                content = infile.read()
                content_with_tabs = content.replace('    ', '\t')
                outfile.write(content_with_tabs + '\n')
                # outfile.write(end_token + '\n\n')

    print(f"Combined file created as '{output_file}' with contents from {len(files)} files.")
  
# Example usage
# combine_files('dataset/raw/', 'sample_scripts.txt')


In [9]:
# read it in to inspect it
# data_file = 'sample_scripts.txt'
# data_file = 'dataset/adamw.txt'
with open('../data/sample_scripts.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [10]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  2247598


In [11]:
print(text[:1000])

from typing import Dict, Union, Iterator

import torch

from allennlp.common.registrable import Registrable
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary


TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]


class DataLoader(Registrable):

	default_implementation = "multiprocess"

	def __len__(self) -> int:
		raise TypeError

	def __iter__(self) -> Iterator[TensorDict]:
		raise NotImplementedError

	def iter_instances(self) -> Iterator[Instance]:
		raise NotImplementedError

	def index_with(self, vocab: Vocabulary) -> None:
		raise NotImplementedError

	def set_target_device(self, device: torch.device) -> None:
		raise NotImplementedError

import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from datasets import loa

In [12]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
''.join(chars)

'\t\n\x1b !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~Ġ▁🤗'

In [13]:
print(vocab_size)

101


# encoding and decoding for chars

In [14]:
# create a mapping from characters to integers
ch_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_ch = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ch_to_idx[ch] for ch in s] # encoder: take a string, output a list of mapping idx
decode = lambda l: ''.join([idx_to_ch[idx] for idx in l]) # decoder: take a list of index, output a string

print(encode("import torch"))
print(decode(encode("import torch")))

[76, 80, 83, 82, 85, 87, 3, 87, 82, 85, 70, 75]
import torch


In [15]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([2247598]) torch.int64
tensor([73, 85, 82, 80,  3, 87, 92, 83, 76, 81, 74,  3, 76, 80, 83, 82, 85, 87,
         3, 39, 76, 70, 87, 15,  3, 56, 81, 76, 82, 81, 15,  3, 44, 87, 72, 85,
        68, 87, 82, 85,  1,  1, 76, 80, 83, 82, 85, 87,  3, 87, 82, 85, 70, 75,
         1,  1, 73, 85, 82, 80,  3, 68, 79, 79, 72, 81, 81, 79, 83, 17, 70, 82,
        80, 80, 82, 81, 17, 85, 72, 74, 76, 86, 87, 85, 68, 69, 79, 72,  3, 76,
        80, 83, 82, 85, 87,  3, 53, 72, 74, 76])


# train dev split

In [16]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [17]:
context_length = 8
x = train_data[:context_length]
y = train_data[1:context_length+1]
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([73]) the target: 85
when input is tensor([73, 85]) the target: 82
when input is tensor([73, 85, 82]) the target: 80
when input is tensor([73, 85, 82, 80]) the target: 3
when input is tensor([73, 85, 82, 80,  3]) the target: 87
when input is tensor([73, 85, 82, 80,  3, 87]) the target: 92
when input is tensor([73, 85, 82, 80,  3, 87, 92]) the target: 83
when input is tensor([73, 85, 82, 80,  3, 87, 92, 83]) the target: 76


# config

In [18]:
batch_size = 64
context_length = 256
max_iters = 1000
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_interval = 100
eval_iters = 200

num_heads = 6
emb_dim = 64 * num_heads
num_layers = 6
dropout = 0.2

# data loader

In [19]:
torch.manual_seed(111)

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    start_idxs = torch.randint(len(data) - context_length, (batch_size,))
    context_idxs = torch.stack([data[start_idx : start_idx+context_length] for start_idx in start_idxs])
    target_idxs = torch.stack([data[start_idx+1 : start_idx+context_length+1] for start_idx in start_idxs])
    
    context_idxs, target_idxs = context_idxs.to(device), target_idxs.to(device)
    
    return context_idxs, target_idxs

context_idxs, target_idxs = get_batch('train')
print('inputs:')
print(context_idxs.shape)
print(context_idxs)
print('targets:')
print(target_idxs.shape)
print(target_idxs)

print('----')

for b in range(4): # batch dimension
    for step in range(8): # context length dimension
        context = context_idxs[b, :step+1]
        target = target_idxs[b,step]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([64, 256])
tensor([[74, 11,  5,  ..., 82, 71, 72],
        [32,  3, 71,  ..., 68, 80, 83],
        [72, 89, 68,  ..., 49, 82, 81],
        ...,
        [74, 72, 87,  ..., 76, 81, 74],
        [68, 76, 81,  ..., 72, 15,  3],
        [72, 81, 66,  ...,  0, 83, 68]], device='cuda:0')
targets:
torch.Size([64, 256])
tensor([[11,  5, 60,  ..., 71, 72, 79],
        [ 3, 71, 85,  ..., 80, 83, 79],
        [89, 68, 79,  ..., 82, 81, 72],
        ...,
        [72, 87, 66,  ..., 81, 74, 66],
        [76, 81, 66,  ..., 15,  3, 68],
        [81, 66, 80,  ..., 83, 68, 86]], device='cuda:0')
----
when input is [74] the target: 11
when input is [74, 11] the target: 5
when input is [74, 11, 5] the target: 60
when input is [74, 11, 5, 60] the target: 82
when input is [74, 11, 5, 60, 82] the target: 88
when input is [74, 11, 5, 60, 82, 88] the target: 3
when input is [74, 11, 5, 60, 82, 88, 3] the target: 68
when input is [74, 11, 5, 60, 82, 88, 3, 68] the target: 85
when input is [32]

# model

In [20]:
torch.arange(4).shape

torch.Size([4])

In [21]:
torch.manual_seed(111)


class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # works as a look up table for the probability of the next char for each current char
        self.token_embedding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(context_length, emb_dim)
        self.blocks = nn.Sequential(*[Block(emb_dim, num_heads=num_heads) for _ in range(num_layers)])
        self.ln_final = nn.LayerNorm(emb_dim) # the final layer norm before output
        self.lm_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, context_idxs, target_idxs=None):
        B, T = context_idxs.shape # num of batches; num of total steps in context_length

        # context_idxs, target_idxs are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(context_idxs) # (B, T, emb_dim)
        position_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, emb_dim)
        x = token_emb + position_emb # (B, T, emb_dim)
        x = self.blocks(x) # (B, T, head_size)
        logits = self.lm_head(x) # (B, T, vocab_size), now the feature_dim is vocab_size again
        
        if target_idxs is None:
            loss = None
        else:
            B, T, D = logits.shape # num of batches; num of total steps in context_length; num of feature dimension
            logits = logits.view(B * T, D) # now D == vocab_size == number of classes
            target_idxs = target_idxs.view(B * T)
            loss = F.cross_entropy(logits, target_idxs)

        return logits, loss
    
    def generate(self, context_idxs, max_new_tokens):
        for _ in range(max_new_tokens):
            # trim input
            input_idxs = context_idxs[:, -context_length:]
            # forward
            logits, loss = self(input_idxs)
            # focus only on the last time step
            logits = logits[:, -1, :] # (B, D) tensor for the last step
            probs = F.softmax(logits, dim=-1) # predicted_label (B, D)
            
            # sample from the distribution
            # torch.multinomial: Returns a tensor where each row contains num_samples indices 
            # sampled from the multinomial probability distribution located in the corresponding row of tensor input.
            pred_idxs = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            context_idxs = torch.cat((context_idxs, pred_idxs), dim=1) # (B, T+1)
        return context_idxs


class Head(nn.Module):
    '''
    self-attention with only one head
    '''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(emb_dim, head_size, bias=False)
        self.query = nn.Linear(emb_dim, head_size, bias=False)
        self.value = nn.Linear(emb_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, D = x.shape
        k = self.key(x)
        q = self.query(x)
        
        # attention-score
        weight = q @ k.transpose(-2,-1) * D**-0.5 # (B, T, D) @ (B, D, T) ---> (B, T, T)
        # D**-0.5: to relief the influence of large value makes the vector after softmax looks like one-hot vector.

        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T), the upper-right triangle will be -inf
        weight = F.softmax(weight, dim=-1) # (B, T, T)
        weight = self.dropout(weight)

        # weighted-aggregation of values based on the attention-score
        v = self.value(x) # (B, T, D)
        out = weight @ v # (B, T, T) @ (B, T, D) --------> (B, T, D)

        return out

class MultiHeadAttention(nn.Module):
    '''
    multiple heads fo self-attention in parallel
    '''
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.projection(out)
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    '''
    a simple linear layer with activation in decoder, + projection
    '''
    def __init__(self, emb_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim), # the inner dimension is 4 * D, based on the original paper
            nn.ReLU(),
            nn.Linear(4 * emb_dim, emb_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    '''
    a decoder block without cross-attentioin part
    '''
    def __init__(self, emb_dim, num_heads):
        super().__init__()
        self.head_size = emb_dim // num_heads
        self.attention = MultiHeadAttention(num_heads, self.head_size)
        self.ffwd = FeedForward(emb_dim)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

model = GPTLanguageModel().to(device)
logits, loss = model(context_idxs, target_idxs)
print(logits.shape)
print(loss)

# decode 5 batches of data, the initial start char is 'i'
# [decode(model.generate(context_idxs=torch.full((5, 1), 75, dtype=torch.long).to(device), max_new_tokens=100)[i].tolist()) for i in range(5)]

torch.Size([16384, 101])
tensor(5.0180, device='cuda:0', grad_fn=<NllLossBackward0>)


In [22]:
weight_test = torch.tril(torch.ones(10,10))
print(weight_test)
weight_test = weight_test.masked_fill(weight_test == 0, float('-inf'))
print(weight_test)
weight_test = F.softmax(weight_test, dim=-1)
print(weight_test)
v_test = torch.rand((10,3))
print(v_test)
out = weight_test @ v_test
print(out)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., -inf, -inf],
        [1.

# optimizer

In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# training

In [24]:
@torch.no_grad()
def estimate_loss():
    res = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, y = get_batch(split)
            logits, loss = model(X, y)
            losses[k] = loss.item()
        res[split] = losses.mean()
    model.train()
    return res

In [25]:
for iter in tqdm(range(max_iters)): # increase number of steps for good results... 
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    # sample a batch of data
    context_idxs, target_idxs = get_batch('train')

    # evaluate the loss
    logits, loss = model(context_idxs, target_idxs)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
print(sum(p.numel() for p in model.parameters())//1e6, 'million parameters')

10.0 million parameters


In [None]:
test_string = """import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datautils import MyTrainDataset

class Trainer:
"""
generated_text = decode(model.generate(
    context_idxs = torch.tensor(encode(test_string)).view(1, len(test_string)).to(device), 
    max_new_tokens=1000)[0].tolist()
               )

print(generated_text)

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datautils import MyTrainDataset

class Trainer:
	CatePalaclel(
	Lar,
	DDataConflwionPalGral = None
	get__fu__worldat():
		self.val_loss = ["MLOC"] fulloating_probj"] Example = squie
		elf.dps_checkpointim_apth: save_aves inint = {
	train_accuda: torat_hem': Grapplending, the='storal_ccuda: \ntrainerNe)
		train(

	os.parser = 0
	fsele N EPURLR_dar Chiswisplither
			p = torch.batch_size:
			p.sentr.izeroy_model.parames()
			self.upsion = 'optimizer':
			parint(
				'arg name, traintraing', train_trun': 000,
				    help= model, checkpdistrient1 20 (
args)

from torch.distarge import for checoin rec_tracerversing

from sys.backend psinf

from traced (torch.nn.Module):
	ref in rank rangeward(0, tra.DhibFate, = mask_end=isp_checkpoin=True,
	nlapse=CotpLis,
			ShePOfarS path training num:
			se:
			snapshot = FShard2Dict,
			)
		rprint(f"--> checkpoint {rank}")
	rank_loader = 0:
	"Soalder rank 

In [None]:
# save the output
filename = 'monkey_script.txt'
with open(filename, 'w', encoding='utf-8') as file:
    file.write(generated_text)
print(f"Generated text has been saved to {filename}")


Generated text has been saved to monkey_script.txt
