In [22]:
import os
import random
import torch
import torch.nn as nn
from torch.nn import functional as F

# extract text and create dataset

In [23]:
def is_comment(line):
    # Define a function to check if a line is a comment
    line = line.strip()
    if line.startswith('#') or line.startswith("'''") or line.startswith('"""'):
        return True
    return False

def extract_non_comments(source_directory, target_directory):
    # Process all .py files in the specified directory and subdirectories
    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                target_file_path = os.path.join(target_directory, file.replace('.py', '.txt'))
                with open(file_path, 'r') as source_file, open(target_file_path, 'w') as target_file:
                    non_comments = []
                    comment_block = False
                    
                    for line in source_file:
                        # Check for the start or end of a comment block
                        # if "r'''" in line and "'''" in line or 'r"""' in line and '"""' in line:
                        #     continue
                        # if "'''" in line or '"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        # if "r'''" in line or 'r"""' in line:
                        #     comment_block = not comment_block
                        #     continue
                        if line.count("'''") == 1 or line.count('"""') == 1:
                            comment_block = not comment_block
                            continue
                        # If it's not a comment or part of a comment block, save it
                        if not is_comment(line) and not comment_block:
                            non_comments.append(line)
                        # Write non-comment lines to a target .txt file
                    target_file.writelines(non_comments)

# # Define the path to the local repository (change this to the actual path of your local repo)
# # source_directory = '/path/to/your/local/pytorch/repo'
# source_directory = '.'
# # target_directory = '/path/to/your/output/directory'
# target_directory = './dataset/'


# # Create the target directory if it doesn't exist
# os.makedirs(target_directory, exist_ok=True)

# # Call the function to start extracting non-comment lines
# extract_non_comments(source_directory, target_directory)


In [24]:

def combine_files(directory, output_file, sample=False, num_files_to_sample=100, seed=111, start_token="<START>", end_token="<END>"):
    """
    Combine content from a specified number of text files in a directory into one file, 
    with start and end tokens between contents from each file.

    :param directory: Path to the directory containing text files.
    :param output_file: Name of the output file to create.
    :param num_files_to_sample: Number of files to sample and combine.
    :param start_token: The start token to be added before each file's content.
    :param end_token: The end token to be added after each file's content.
    """
    
    
    # List all text files in the directory
    all_files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    files = all_files

    if sample:
        # Sample the specified number of files
        random.seed(seed)
        files = random.sample(all_files, min(num_files_to_sample, len(all_files)))

    # Start combining the sampled files
    with open(output_file, 'w') as outfile:
        for filename in files:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as infile:
                outfile.write(start_token + '\n')
                content = infile.read()
                content_with_tabs = content.replace('    ', '\t')
                outfile.write(content_with_tabs + '\n')
                outfile.write(end_token + '\n\n')

    print(f"Combined file created as '{output_file}' with contents from {len(files)} files.")

# Example usage
combine_files('dataset/', 'sample_scripts.txt')


Combined file created as 'sample_scripts.txt' with contents from 1153 files.


In [25]:
# read it in to inspect it
# data_file = 'sample_scripts.txt'
data_file = 'dataset/adamw.txt'
with open('sample_scripts.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [26]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  11546152


In [27]:
print(text[:1000])

<START>

from __future__ import annotations

import dataclasses
from typing import Optional

from torch.onnx._internal.diagnostics.infra.sarif import (
	_artifact_location,
	_property_bag,
)


@dataclasses.dataclass
class VersionControlDetails(object):

	repository_uri: str = dataclasses.field(
		metadata={"schema_property_name": "repositoryUri"}
	)
	as_of_time_utc: Optional[str] = dataclasses.field(
		default=None, metadata={"schema_property_name": "asOfTimeUtc"}
	)
	branch: Optional[str] = dataclasses.field(
		default=None, metadata={"schema_property_name": "branch"}
	)
	mapped_to: Optional[_artifact_location.ArtifactLocation] = dataclasses.field(
		default=None, metadata={"schema_property_name": "mappedTo"}
	)
	properties: Optional[_property_bag.PropertyBag] = dataclasses.field(
		default=None, metadata={"schema_property_name": "properties"}
	)
	revision_id: Optional[str] = dataclasses.field(
		default=None, metadata={"schema_property_name": "revisionId"}
	)
	revision_tag: Optional[

In [28]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
''.join(chars)

'\t\n !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~±–≤⊑⊔⊳─│└├✓'

In [29]:
print(vocab_size)

108


# encoding and decoding for chars

In [30]:
# create a mapping from characters to integers
ch_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_ch = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ch_to_idx[ch] for ch in s] # encoder: take a string, output a list of mapping idx
decode = lambda l: ''.join([idx_to_ch[idx] for idx in l]) # decoder: take a list of index, output a string

print(encode("import torch"))
print(decode(encode("import torch")))

[75, 79, 82, 81, 84, 86, 2, 86, 81, 84, 69, 74]
import torch


In [31]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([11546152]) torch.int64
tensor([30, 53, 54, 35, 52, 54, 32,  1,  1, 72, 84, 81, 79,  2, 65, 65, 72, 87,
        86, 87, 84, 71, 65, 65,  2, 75, 79, 82, 81, 84, 86,  2, 67, 80, 80, 81,
        86, 67, 86, 75, 81, 80, 85,  1,  1, 75, 79, 82, 81, 84, 86,  2, 70, 67,
        86, 67, 69, 78, 67, 85, 85, 71, 85,  1, 72, 84, 81, 79,  2, 86, 91, 82,
        75, 80, 73,  2, 75, 79, 82, 81, 84, 86,  2, 49, 82, 86, 75, 81, 80, 67,
        78,  1,  1, 72, 84, 81, 79,  2, 86, 81])


# train dev split

In [32]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [33]:
context_length = 8
x = train_data[:context_length]
y = train_data[1:context_length+1]
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([30]) the target: 53
when input is tensor([30, 53]) the target: 54
when input is tensor([30, 53, 54]) the target: 35
when input is tensor([30, 53, 54, 35]) the target: 52
when input is tensor([30, 53, 54, 35, 52]) the target: 54
when input is tensor([30, 53, 54, 35, 52, 54]) the target: 32
when input is tensor([30, 53, 54, 35, 52, 54, 32]) the target: 1
when input is tensor([30, 53, 54, 35, 52, 54, 32,  1]) the target: 1


# config

In [58]:
batch_size = 32
context_length = 256
emb_dim = 32
max_iters = 5000
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_interval = 100
eval_iters = 200

# data loader

In [35]:
torch.manual_seed(111)

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    start_idxs = torch.randint(len(data) - context_length, (batch_size,))
    context_idxs = torch.stack([data[start_idx : start_idx+context_length] for start_idx in start_idxs])
    target_idxs = torch.stack([data[start_idx+1 : start_idx+context_length+1] for start_idx in start_idxs])
    
    context_idxs, target_idxs = context_idxs.to(device), target_idxs.to(device)
    
    return context_idxs, target_idxs

context_idxs, target_idxs = get_batch('train')
print('inputs:')
print(context_idxs.shape)
print(context_idxs)
print('targets:')
print(target_idxs.shape)
print(target_idxs)

print('----')

for b in range(4): # batch dimension
    for step in range(8): # context length dimension
        context = context_idxs[b, :step+1]
        target = target_idxs[b,step]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([32, 256])
tensor([[80, 10, 11,  ..., 10, 85, 71],
        [85, 86,  2,  ...,  2, 81, 80],
        [81, 84,  2,  ..., 86, 71, 10],
        ...,
        [ 0,  0, 72,  ..., 74, 67, 80],
        [53, 54, 35,  ..., 77, 65, 82],
        [28,  1,  0,  ..., 81, 80, 85]], device='cuda:0')
targets:
torch.Size([32, 256])
tensor([[10, 11,  2,  ..., 85, 71, 78],
        [86,  2, 85,  ..., 81, 80, 71],
        [84,  2, 85,  ..., 71, 10, 11],
        ...,
        [ 0, 72,  4,  ..., 67, 80, 80],
        [54, 35, 52,  ..., 65, 82, 67],
        [ 1,  0,  0,  ..., 80, 85, 86]], device='cuda:0')
----
when input is [80] the target: 10
when input is [80, 10] the target: 11
when input is [80, 10, 11] the target: 2
when input is [80, 10, 11, 2] the target: 15
when input is [80, 10, 11, 2, 15] the target: 32
when input is [80, 10, 11, 2, 15, 32] the target: 2
when input is [80, 10, 11, 2, 15, 32, 2] the target: 75
when input is [80, 10, 11, 2, 15, 32, 2, 75] the target: 80
when input is [85

# model

In [36]:
torch.arange(4).shape

torch.Size([4])

In [37]:
torch.manual_seed(111)


class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # works as a look up table for the probability of the next char for each current char
        self.token_embedding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(context_length, emb_dim)
        self.attention_head = Head(emb_dim)
        self.lm_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, context_idxs, target_idxs=None):
        B, T = context_idxs.shape # num of batches; num of total steps in context_length

        # context_idxs, target_idxs are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(context_idxs) # (B, T, emb_dim)
        position_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, emb_dim)
        x = token_emb + position_emb # (B, T, emb_dim)
        x = self.attention_head(x) # (B, T, head_size)
        logits = self.lm_head(x) # (B, T, vocab_size), now the feature_dim is vocab_size again
        
        if target_idxs is None:
            loss = None
        else:
            B, T, D = logits.shape # num of batches; num of total steps in context_length; num of feature dimension
            logits = logits.view(B * T, D) # now D == vocab_size == number of classes
            target_idxs = target_idxs.view(B * T)
            loss = F.cross_entropy(logits, target_idxs)

        return logits, loss
    
    def generate(self, context_idxs, max_new_tokens):
        for _ in range(max_new_tokens):
            # trim input
            input_idxs = context_idxs[:, -context_length:]
            # forward
            logits, loss = self(input_idxs)
            # focus only on the last time step
            logits = logits[:, -1, :] # (B, D) tensor for the last step
            probs = F.softmax(logits, dim=-1) # predicted_label (B, D)
            
            # sample from the distribution
            # torch.multinomial: Returns a tensor where each row contains num_samples indices 
            # sampled from the multinomial probability distribution located in the corresponding row of tensor input.
            pred_idxs = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            context_idxs = torch.cat((context_idxs, pred_idxs), dim=1) # (B, T+1)
        return context_idxs


class Head(nn.Module):
    '''
    self-attention with only one head
    '''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(emb_dim, head_size, bias=False)
        self.query = nn.Linear(emb_dim, head_size, bias=False)
        self.value = nn.Linear(emb_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))

    def forward(self, x):
        B, T, D = x.shape
        k = self.key(x)
        q = self.query(x)
        
        # attention-score
        weight = q @ k.transpose(-2,-1) * D**-0.5 # (B, T, D) @ (B, D, T) ---> (B, T, T)
        # D**-0.5: to relief the influence of large value makes the vector after softmax looks like one-hot vector.

        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T), the upper-right triangle will be -inf
        weight = F.softmax(weight, dim=-1) # (B, T, T)

        # weighted-aggregation of values based on the attention-score
        v = self.value(x) # (B, T, D)
        out = weight @ v # (B, T, T) @ (B, T, D) --------> (B, T, D)

        return out

model = BigramLanguageModel().to(device)
logits, loss = model(context_idxs, target_idxs)
print(logits.shape)
print(loss)

# decode 5 batches of data, the initial start char is 'i'
# [decode(model.generate(context_idxs=torch.full((5, 1), 75, dtype=torch.long).to(device), max_new_tokens=100)[i].tolist()) for i in range(5)]

torch.Size([8192, 108])
tensor(4.7038, device='cuda:0', grad_fn=<NllLossBackward0>)


In [56]:
weight_test = torch.tril(torch.ones(10,10))
print(weight_test)
weight_test = weight_test.masked_fill(weight_test == 0, float('-inf'))
print(weight_test)
weight_test = F.softmax(weight_test, dim=-1)
print(weight_test)
v_test = torch.rand((10,3))
print(v_test)
out = weight_test @ v_test
print(out)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., -inf, -inf],
        [1.

# optimizer

In [38]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# training

In [39]:
@torch.no_grad()
def estimate_loss():
    res = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, y = get_batch(split)
            logits, loss = model(X, y)
            losses[k] = loss.item()
        res[split] = losses.mean()
    model.train()
    return res

In [59]:
batch_size = 32
for iter in range(max_iters): # increase number of steps for good results... 
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    # sample a batch of data
    context_idxs, target_idxs = get_batch('train')

    # evaluate the loss
    logits, loss = model(context_idxs, target_idxs)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 2.7414, val loss 2.8635
step 100: train loss 2.7371, val loss 2.8582
step 200: train loss 2.7397, val loss 2.8560
step 300: train loss 2.7376, val loss 2.8606
step 400: train loss 2.7364, val loss 2.8652
step 500: train loss 2.7386, val loss 2.8543
step 600: train loss 2.7328, val loss 2.8528
step 700: train loss 2.7309, val loss 2.8532
step 800: train loss 2.7368, val loss 2.8542
step 900: train loss 2.7316, val loss 2.8488
step 1000: train loss 2.7242, val loss 2.8415
step 1100: train loss 2.7240, val loss 2.8383
step 1200: train loss 2.7211, val loss 2.8535
step 1300: train loss 2.7259, val loss 2.8419
step 1400: train loss 2.7242, val loss 2.8419
step 1500: train loss 2.7204, val loss 2.8426
step 1600: train loss 2.7253, val loss 2.8380
step 1700: train loss 2.7189, val loss 2.8330
step 1800: train loss 2.7234, val loss 2.8318
step 1900: train loss 2.7140, val loss 2.8307
step 2000: train loss 2.7186, val loss 2.8303
step 2100: train loss 2.7152, val loss 2.8256


In [64]:
print(decode(model.generate(context_idxs = torch.tensor(encode('import')).view(1, 6).to(device), max_new_tokens=500)[0].tolist()))

import spl wer_arnter.we
		s -> torach.crn(-` te = dergog(
	rnich, = = fut(f inves = - sitegmioconconesete = bfflitins'_imgse= be, t =nch.te s, siam bad_s = e

):
					CCk = esstrureda:

)
Dize

):)
	pesint_Urviar "'Cocachotonetrrmplfin['ct_"chteContrap6,s

	e.tirtolinise_sigefdors_pesitids,

	bl(), tede se_mecernadercham.ims,
			atMOorchef:
			aunp,

		_cor_lski(fe= ty h()  vedeniffr ",

			pet_atengsel_wenev2ram_mosenfqrestpourg_s_honarnnern(no_ceng: + inat= _ag_ort_aphh.v_gses_macunizercyuins =Kene


In [42]:
torch.tensor(encode('import')).view(1, 6)

tensor([[75, 79, 82, 81, 84, 86]])