In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset built from a text file with one word per line.

    Lines are stripped of surrounding whitespace and empty lines are dropped.
    The vocabulary is the sorted set of characters that occur in the remaining
    words; ``stoi`` maps each character to its index in that sorted list
    (starting at 0) and ``itos`` is the inverse mapping.

    Indexing the dataset returns the ``torch.long`` tensor of character ids
    for the word at that position.
    """

    def __init__(self, input_file):
        """Read ``input_file``, build the vocabulary and print a short summary.

        Args:
            input_file: path of a text file containing one word per line.
        """
        # Decode explicitly as UTF-8 so the vocabulary (and therefore the
        # token ids) does not depend on the machine's locale.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = f.read()
        words = [w.strip() for w in data.splitlines()]  # drop leading/trailing white space
        words = [w for w in words if w]  # drop empty strings
        chars = sorted(set(''.join(words)))  # all the possible characters
        # default=0 keeps an empty input file from raising a bare ValueError.
        max_word_length = max((len(w) for w in words), default=0)
        print(f"number of examples in the dataset: {len(words)}")
        print(f"max word length: {max_word_length}")
        print(f"number of unique characters in the vocabulary: {len(chars)}")
        print("vocabulary:")
        print(''.join(chars))
        self.words = words
        self.chars = chars
        print(f"chars: {chars}")
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        # Tokens are not written automatically; call generate_tokens() explicitly.

    def generate_tokens(self, input_file):
        """Concatenate the ids of every word and save them as a uint16 array.

        The array is written with ``np.save`` to ``input_file + '_tokens'``
        (NumPy appends the ``.npy`` extension). Words are concatenated
        back-to-back with no separator token between them.

        Args:
            input_file: base path used to derive the output file name.

        Raises:
            AssertionError: if any token id does not fit in ``uint16``.
        """
        tokens = [self.stoi[c] for word in self.words for c in word]
        # Explicit integer dtype so an empty word list does not become float64.
        tokens_np = np.array(tokens, dtype=np.int64)
        assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
        tokens_np_uint16 = tokens_np.astype(np.uint16)
        file_name = input_file + '_tokens'
        np.save(file_name, tokens_np_uint16)

    def __len__(self):
        """Return the number of words in the dataset."""
        return len(self.words)

    def contains(self, word):
        """Return True if ``word`` is one of the dataset's words."""
        return word in self.words

    def get_vocab_size(self):
        """Return the number of distinct characters plus one.

        NOTE(review): the extra slot is described as a "special 0 token", but
        ``stoi`` already assigns id 0 to the first real character, so the
        additional id is the highest one and is never produced by ``encode``.
        Confirm which convention downstream code expects.
        """
        return len(self.chars) + 1  # all the possible characters and special 0 token

    def encode(self, word):
        """Map each character of ``word`` to its id; returns a ``torch.long`` tensor.

        Raises:
            KeyError: if ``word`` contains a character outside the vocabulary.
        """
        ix = torch.tensor([self.stoi[w] for w in word], dtype=torch.long)
        return ix

    def decode(self, ix):
        """Map an iterable of ids (list, array or tensor) back to a string."""
        # Iterating a tensor yields 0-d tensors, which never match the int keys
        # of ``itos``; convert every element to a plain int before the lookup.
        word = ''.join(self.itos[int(i)] for i in ix)
        return word

    def __getitem__(self, idx):
        """Return the encoded word at position ``idx`` as a ``torch.long`` tensor."""
        word = self.words[idx]
        # encode() already builds a fresh long tensor; wrapping it again in
        # torch.tensor() only made a redundant copy and raised a UserWarning.
        return self.encode(word)
    



In [16]:
def load_tokens(filename):
    """Load a ``.npy`` token array from ``filename`` as a ``torch.long`` tensor."""
    # Widen to int32 before handing the array to torch (step added after video).
    as_int32 = np.load(filename).astype(np.int32)
    return torch.tensor(as_int32, dtype=torch.long)

tensor([ 4, 38, 34,  ..., 43, 26, 39])

In [27]:

# Load the saved training tokens and display how many there are.
# NOTE(review): the .npy file was presumably produced earlier via
# CharDataset(...).generate_tokens(...) on the training text — confirm.
ptt = load_tokens(
    './babynames/babynames_train_0.txt_tokens.npy'
)
len(ptt)


6750857

In [1]:
# 6750857 is the token count shown by the previous cell (len(ptt)).
# NOTE(review): 32768 is presumably the number of tokens consumed per batch/step — confirm.
6750857/32768

206.01980590820312

In [2]:
import pandas as pd
from pathlib import Path

# Export the validation names from Parquet to a plain text file, one per line.
df = pd.read_parquet('./raw_data/names_val.parquet')
# Keep only the 'Names' column.
column_data = df['Names']
# Write into ./babynames/, which is where the tokenisation cell reads
# babynames_val.txt from; previously the file landed in the working directory.
val_txt_path = Path('./babynames/babynames_val.txt')
val_txt_path.parent.mkdir(parents=True, exist_ok=True)
column_data.to_csv(val_txt_path, index=False, header=False)

In [5]:
import numpy as np

# Tokenise the validation names: one id per character, all words concatenated
# back-to-back, saved as a uint16 .npy file next to the input text file.
input_file = './babynames/babynames_val.txt'
# Decode explicitly as UTF-8 so the vocabulary does not depend on the locale.
with open(input_file, 'r', encoding='utf-8') as f:
    data = f.read()
words = [w.strip() for w in data.splitlines()]  # drop leading/trailing white space
words = [w for w in words if w]  # drop empty strings
chars = sorted(set(''.join(words)))
# NOTE(review): this vocabulary comes from the validation words only. The ids
# match the training ids only if both splits contain exactly the same set of
# characters — verify, or build stoi from the training vocabulary instead.
stoi = {ch: i for i, ch in enumerate(chars)}
tokens = [stoi[c] for item in words for c in item]
# Explicit integer dtype so an empty word list does not become float64.
tokens_np = np.array(tokens, dtype=np.int64)
assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
tokens_np_uint16 = tokens_np.astype(np.uint16)
# Same naming scheme as CharDataset.generate_tokens (input path + '_tokens').
# The previous literal 'babynames_val_' + '_tokens' produced the accidental
# double-underscore name 'babynames_val__tokens.npy' in the working directory.
file_name = input_file + '_tokens'
np.save(file_name, tokens_np_uint16)