In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset built from a text file holding one word per line.

    On construction the file is read, a sorted character vocabulary is built
    (with '.' appended as the last entry), summary statistics are printed, and
    `createFixedLengthDataSet` is run, which saves a uint16 token array to disk.

    Attributes:
        words: the stripped, non-empty lines of the input file.
        chars: the sorted unique characters of `words`, followed by '.'.
        stoi:  character -> integer id.
        itos:  integer id -> character.
    """

    def __init__(self, input_file):
        """Read `input_file`, build the vocabulary and write the token file.

        Raises:
            ValueError: if the file contains no non-empty lines.
        """
        words = self._read_words(input_file)
        if not words:
            # Without this guard max() below fails with a much less helpful ValueError.
            raise ValueError(f"no words found in {input_file}")
        chars = sorted(set(''.join(words))) # all the possible characters
        chars.append('.')
        max_word_length = max(len(w) for w in words)
        print(f"number of examples in the dataset: {len(words)}")
        print(f"max word length: {max_word_length}")
        print(f"number of unique characters in the vocabulary: {len(chars)}")
        print("vocabulary:")
        print(''.join(chars))
        self.words = words
        self.chars = chars
        # The original literal lacked the f prefix and printed "{chars}" verbatim.
        print(f"chars: {chars}")
        self.stoi = {ch:i for i,ch in enumerate(chars)}
        self.itos = {i:s for s,i in self.stoi.items()}
        #self.generate_tokens(input_file,max_word_length)
        self.createFixedLengthDataSet(input_file)

    @staticmethod
    def _read_words(input_file):
        """Return the stripped, non-empty lines of `input_file`."""
        with open(input_file, 'r') as f:
            data = f.read()
        stripped = (w.strip() for w in data.splitlines())
        return [w for w in stripped if w]

    @staticmethod
    def _save_tokens(tokens, file_name):
        """Check every id fits in uint16, cast, and save with `np.save`."""
        tokens_np = np.array(tokens)
        assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
        np.save(file_name, tokens_np.astype(np.uint16))

    def add_period_if_short(self, item, target_length=16):
        """Right-pad `item` with '.' up to `target_length` characters.

        Strings that are already at least that long are returned unchanged.
        The default of 16 matches what the original `range(17)` loop produced.
        """
        return item.ljust(target_length, '.')

    def generate_tokens(self, input_file, max_word_length):
        """Pad every word, encode it, and save the ids to `input_file + '_tokens'`.

        `max_word_length` is accepted for interface compatibility but not used.
        """
        tokens = []
        for item in self.words:
            wrd = self.add_period_if_short(item)
            tokens.extend(self.stoi[c] for c in wrd)
        self._save_tokens(tokens, input_file + '_tokens')

    def __len__(self):
        return len(self.words)

    def contains(self, word):
        """Return True if `word` is one of the dataset's words."""
        return word in self.words

    def get_vocab_size(self):
        # NOTE(review): '.' is already part of self.chars, so the +1 yields one
        # id more than stoi ever produces — confirm the model expects this.
        return len(self.chars) + 1 # all the possible characters and special 0 token

    def encode(self, word):
        """Map each character of `word` to its id; returns a long tensor."""
        ix = torch.tensor([self.stoi[w] for w in word], dtype=torch.long)
        return ix

    def decode(self, ix):
        """Map an iterable of ids back to a string.

        Each id is passed through int() so that tensors (such as the output of
        `encode`) work: 0-d tensors hash by identity and would miss the int
        keys of `itos`.
        """
        word = ''.join(self.itos[int(i)] for i in ix)
        return word

    def createFixedLengthDataSet(self, input_file, batch_size=64):
        """Cut the '.'-joined word stream into fixed-length chunks and save them.

        The words of `input_file` are joined as ".w1.w2...", split with
        `fetchNextFromDot` into chunks of `batch_size` characters, encoded with
        `stoi`, and saved to `input_file + 'train_tokens'`. Chunking stops at
        the first chunk that is unavailable or shorter than `batch_size`.
        """
        words = self._read_words(input_file)
        result = "." + ".".join(words)
        index = 0
        batches = []
        while True:
            is_available, data, index = self.fetchNextFromDot(result, index, batch_size)
            if not is_available or len(data) < batch_size:
                break
            batches.append(data)

        tokens = [self.stoi[c] for batch in batches for c in batch]
        self._save_tokens(tokens, input_file + 'train_tokens')

    def fetchNextFromDot(self, result, index, batch_size=64):
        """Return the next chunk of `result` that starts on a '.'.

        Args:
            result: the full '.'-separated character stream.
            index: position to continue from; 0 means "take the first chunk".
            batch_size: number of characters per chunk.

        Returns:
            (True, chunk, next_index) on success, or (False, None, None) when
            the window starting at `index` contains no '.'.
        """
        if index == 0:
            return True, result[:batch_size], batch_size

        data = result[index: index+batch_size]

        index_of_first_dot = data.find('.')
        if index_of_first_dot == -1:
            return False, None, None

        # Drop the characters before the first '.', then top the chunk back up
        # with the same number of characters taken from just past the window.
        data = data[index_of_first_dot:]
        data += result[index+batch_size: index+batch_size+index_of_first_dot]

        return True, data, index+batch_size+index_of_first_dot

    def __getitem__(self, idx):
        """Return the encoded word at position `idx` as a long tensor."""
        # encode() already builds a fresh long tensor; wrapping it in
        # torch.tensor() again only triggered a copy-construct UserWarning.
        return self.encode(self.words[idx])
    



In [None]:

# Earlier experiments, kept disabled:
# # wrap in dataset objects
# file_name = 'babynames_train_0'
# CharDataset('./babynames/babynames.txt')
#token_filename = file_name + '_tokens.npy'
# ptt = load_tokens('./babynames/babynames_train_0.txt_tokens.npy')
# len(ptt)
# Build the dataset for the validation file. Constructing CharDataset prints
# the vocabulary statistics and, through createFixedLengthDataSet, saves the
# token array under a name derived from the input path.
CharDataset('./babynames/babynames_val.txt')

In [None]:
# Scratch arithmetic: 6750857 divided by 32768 (= 2**15).
# NOTE(review): presumably total tokens / tokens per step — confirm.
6750857/32768

In [None]:
import pandas as pd
# Read the validation split from the Parquet file.
df = pd.read_parquet('./raw_data/names_val.parquet')
# Keep only the 'Names' column.
column_data = df['Names']
# Write one value per line, without the index column or a header row.
# NOTE(review): this writes into the working directory, while other cells read
# './babynames/babynames_val.txt' — confirm the intended location.
column_data.to_csv('babynames_val.txt', index=False, header=False)

In [None]:
import numpy as np
# Tokenise the validation names file into a flat uint16 array.
with open('./babynames/babynames_val.txt', 'r') as f:
        data = f.read()
words = data.splitlines()
words = [w.strip() for w in words] # get rid of any leading or trailing white space
words = [w for w in words if w]
tokens = []
# Vocabulary is derived from this file alone; no '.' separator is added here.
chars = sorted(list(set(''.join(words)))) 
stoi = {ch:i for i,ch in enumerate(chars)}
# Words are appended back to back, with no delimiter token between them.
for item in words:
        tokens.extend([stoi[c] for c in item]) 
# merged_tokens = torch.cat(all_tokens, dim=0)
tokens_np = np.array(tokens)
# Guard the cast below: every id must fit in an unsigned 16-bit integer.
assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
tokens_np_uint16 = tokens_np.astype(np.uint16)
# NOTE(review): this concatenation yields 'babynames_val__tokens' (double
# underscore) — confirm that is the intended file name.
file_name = 'babynames_val_' + '_tokens'
np.save(file_name, tokens_np_uint16)

In [None]:
def fetch_until_next_uppercase(s):
    """Split `s` into pieces, starting a new piece at every uppercase character.

    The first character always opens the first piece, whatever its case.
    Returns a list of strings; an empty input gives an empty list.
    """
    pieces = []
    for ch in s:
        if pieces and not ch.isupper():
            # Lowercase (or non-letter) characters extend the current piece.
            pieces[-1] += ch
        else:
            # First character overall, or an uppercase one: open a new piece.
            pieces.append(ch)
    return pieces

# Example usage: split a CamelCase string at each uppercase letter.
input_string = "HelloWorldPythonIsAwesome"
segments = fetch_until_next_uppercase(input_string)
def add_period_if_short(s, width=14):
    """Right-pad `s` with '.' until it is `width` characters long.

    Strings already at least `width` long are returned unchanged. The default
    of 14 is the length the original `range(15)` loop padded to.
    """
    return s.ljust(width, '.')

# Show the raw segments, then each one padded with '.' by add_period_if_short.
print("Segments:", segments)
for s in segments:
    print(add_period_if_short(s))


In [None]:


# Example usage
file_path = 'babynames/babynames.txt' 
# NOTE(review): the original assignment had no right-hand side (a SyntaxError).
# The first non-empty line of the file is used as the sample input here —
# confirm that this matches the intent.
with open(file_path, 'r') as f:
    input_string = next((line.strip() for line in f if line.strip()), '')
result = add_period_if_short(input_string)
print("Result:", result)


In [None]:
def count_unique_names(file_path):
    """Return the number of distinct non-empty lines in the file at `file_path`.

    Each line is stripped of surrounding whitespace before comparison; lines
    that are empty after stripping are ignored.
    """
    with open(file_path, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        distinct = {entry for entry in stripped_lines if entry}
    return len(distinct)

# Example usage: count the distinct names in the full names file.
file_path = 'babynames/babynames.txt'  # Replace with the path to your file
unique_count = count_unique_names(file_path)
print(f"Number of unique names: {unique_count}")


In [None]:
import numpy as np
import torch
# Load the saved training token array from disk.
npt = np.load('babynames/data_tokens/babynames_train_tokens.npy')
# Convert to int32 before building the tensor.
# NOTE(review): presumably needed because the file holds uint16 values — confirm.
npt = npt.astype(np.int32) # added after video
ptt = torch.tensor(npt, dtype=torch.long)


In [None]:
# Inspect the shape of the loaded token tensor.
ptt.shape

In [None]:
# Scratch check: range(16) yields 0 through 15.
for i in range(16):
    print(i)

In [None]:
def add_period_if_short1(item):
    """Right-pad `item` with '.' so that it is at least 15 characters long.

    Inputs of 15 or more characters come back unchanged.
    """
    shortfall = 15 - len(item)
    if shortfall > 0:
        item = item + '.' * shortfall
    return item

# Pad a sample name and display it together with its resulting length.
name  =add_period_if_short1('ram')
name, len(name)

In [None]:
import torch

# Load the saved checkpoint from disk (memory-mapping disabled).
checkpoint = torch.load("./log/model.pt", mmap=False)
# Initialize the model using the configuration saved in the checkpoint
# NOTE(review): `GPT` and `device` are not defined in any cell visible here —
# they must already exist in the kernel for this cell to run.
model = GPT(checkpoint['config'])

# Load the saved state dict into the model
model.load_state_dict(checkpoint['model'])
model.to(device)
use_compile = False
# Optionally, retrieve the training step to continue from where it left off
trained_step = checkpoint['step']

In [None]:
# Report whether PyTorch's MPS backend is available on this machine.
torch.backends.mps.is_available()

In [None]:
from datasets import load_dataset

# Load the "jbrazzy/baby_names" dataset through the `datasets` library.
dataset = load_dataset("jbrazzy/baby_names")

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Rough parameter-count estimate from embedding width and layer count.
emb = 32
layers = 12

# emb*emb*3 weights per layer, multiplied by the number of layers.
# NOTE(review): presumably this counts only the q/k/v projections — confirm.
total_parametrs = emb*emb*3*layers
total_parametrs

In [None]:
# Scratch arithmetic: 256 * 16 = 4096.
256*16

In [None]:
# Scratch arithmetic: 1024 * 64 * 1 = 65536.
1024*64*1
# Note kept from the original cell (it was typed as code and did not parse):
# T*B*N = .5M

In [None]:
import numpy as np
# Tokenise the validation names file into a flat uint16 array.
# (Same code as an earlier cell in this notebook.)
with open('./babynames/babynames_val.txt', 'r') as f:
        data = f.read()
words = data.splitlines()
words = [w.strip() for w in words] # get rid of any leading or trailing white space
words = [w for w in words if w]
tokens = []
# Vocabulary is derived from this file alone; no '.' separator is added here.
chars = sorted(list(set(''.join(words)))) 
stoi = {ch:i for i,ch in enumerate(chars)}
# Words are appended back to back, with no delimiter token between them.
for item in words:
        tokens.extend([stoi[c] for c in item]) 
# merged_tokens = torch.cat(all_tokens, dim=0)
tokens_np = np.array(tokens)
# Guard the cast below: every id must fit in an unsigned 16-bit integer.
assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
tokens_np_uint16 = tokens_np.astype(np.uint16)
# NOTE(review): this concatenation yields 'babynames_val__tokens' (double
# underscore) — confirm that is the intended file name.
file_name = 'babynames_val_' + '_tokens'
np.save(file_name, tokens_np_uint16)

In [None]:
import numpy as np
def fetchNextFromDot(result, index):
    """Return the next 64-character chunk of `result` that starts on a '.'.

    Args:
        result: the full '.'-separated character stream.
        index: position to continue from; 0 means "take the first chunk".

    Returns:
        (True, chunk, next_index) on success, or (False, None, None) when the
        64-character window starting at `index` contains no '.'.
    """
    batch_size = 64
    if index == 0:
        return  True, result[:batch_size], batch_size
    
    data = result[index: index+batch_size]
    
    index_of_first_dot = data.find('.')
    if index_of_first_dot == -1:
        return False, None, None
    
    # Drop the characters before the first '.', then top the chunk back up with
    # the same number of characters taken from just past the window.
    data = data[index_of_first_dot:]
    remaining = result[index+batch_size: index+batch_size+index_of_first_dot]
    data += remaining
    
    return True, data, index+batch_size+index_of_first_dot


# The helper is defined above its first use so the cell runs on a fresh kernel;
# originally the loop called it before the `def` had executed.
with open('./babynames/babynames.txt', 'r') as f:
        data = f.read()
words = data.splitlines()
words = [w.strip() for w in words] # get rid of any leading or trailing white space
words = [w for w in words if w]
# Single stream in which every name is preceded by a '.' separator.
result = "."+ ".".join(words)
# Break the result into chunks of length 64
# result_chunks = [result[i:i+64] for i in range(0, len(result), 64)]
index = 0
batches = []
while True:
    isAvailaible, data, index = fetchNextFromDot(result, index)
    # Stop when no chunk is available or the chunk is shorter than 64 characters.
    if not isAvailaible or len(data) < 64:
        break
    batches.append(data)
    

# Print first few chunks to verify


In [None]:
# Total characters across all 64-character chunks, in units of 65536 (= 1024*64).
(len(batches)*64)/65536

In [None]:
# Number of names currently held in `words`.
len(words)

In [None]:
# Peek at the first three chunks to check that each begins with '.'.
batches[:3]

In [None]:
def fetchNextFromDot(result, index):
    """Fetch the next 64-character chunk of `result`, aligned to start on a '.'.

    With `index == 0` the first 64 characters are returned as-is. Otherwise the
    64-character window at `index` is searched for its first '.'; the chunk
    starts there and is refilled from beyond the window to make up for the
    characters that were skipped.

    Returns a `(found, chunk, next_index)` triple; `(False, None, None)` means
    the window held no '.'.
    """
    width = 64
    if index == 0:
        return True, result[:width], width

    window_end = index + width
    window = result[index:window_end]
    skip = window.find('.')
    if skip == -1:
        return False, None, None

    next_index = window_end + skip
    chunk = window[skip:] + result[window_end:next_index]
    return True, chunk, next_index
    

In [None]:
# `result_chunks` is only defined in commented-out code in the cells visible
# here, so printing it raised NameError. Build the plain fixed-stride
# 64-character chunks of `result` (as in that commented-out line) first.
result_chunks = [result[i:i+64] for i in range(0, len(result), 64)]
print(result_chunks[13])

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the "jbrazzy/baby_names" dataset through the `datasets` library.
dataset = load_dataset("jbrazzy/baby_names")


In [32]:
# Take the 'Names' column of the train split as the word list.
words = dataset['train']['Names']
# words = data.splitlines()
words = [w.strip() for w in words] # get rid of any leading or trailing white space
words = [w for w in words if w] # get rid of any empty strings
chars = sorted(list(set(''.join(words)))) # all the possible characters
# '.' is appended last, so it receives the highest id in stoi.
chars.append('.')
# Computed for reference; not used by the other cells visible here.
max_word_length = max(len(w) for w in words)
# Character <-> id lookup tables.
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:s for s,i in stoi.items()} 


def fetchNextFromDot(result, index):
    """Return `(ok, chunk, next_index)` for the next dot-aligned 64-char chunk.

    `index == 0` yields the leading 64 characters unchanged. For any other
    index, the chunk begins at the first '.' found within the 64 characters
    after `index` and is extended past that window by the number of characters
    skipped. If no '.' is found, `(False, None, None)` is returned.
    """
    size = 64
    if index == 0:
        return True, result[:size], size

    stop = index + size
    segment = result[index:stop]
    dot_at = segment.find('.')
    if dot_at == -1:
        return False, None, None

    tail = result[stop:stop + dot_at]
    return True, segment[dot_at:] + tail, stop + dot_at

In [39]:
import numpy as np
import config

In [40]:
# Chunk the `words` list into dot-aligned 64-character pieces, encode them,
# and save the ids to the path given by config.train_token_file.
index = 0
batches = []
# Single stream in which every name is preceded by a '.' separator.
result = "."+ ".".join(words)
while True:
    isAvailaible, data, index = fetchNextFromDot(result, index)
    # Stop when no chunk is available or the chunk is shorter than 64 characters.
    if not isAvailaible or len(data) < 64:
        break
    batches.append(data)

# Flatten all chunks into one list of ids.
tokens = []
for batch in batches:
    tokens.extend([stoi[c] for c in batch]) 
    
tokens_np = np.array(tokens)
# Guard the cast below: every id must fit in an unsigned 16-bit integer.
assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
tokens_np_uint16 = tokens_np.astype(np.uint16)
np.save(config.train_token_file, tokens_np_uint16)

In [38]:
# Sanity check: total number of tokens and the first ten ids.
len(tokens_np_uint16), tokens_np_uint16[:10]

(7454848, array([52,  4, 38, 34, 37, 50, 52,  7, 26, 39], dtype=uint16))

In [None]:
# Pull the 'Names' column out of the train split.
data = dataset['train']['Names']

# Delete every comma and apostrophe from each name in a single pass.
_strip_punct = str.maketrans('', '', ",'")
cleaned_data = [name.translate(_strip_punct) for name in data]

# Show the first ten cleaned names.
print(cleaned_data[:10])