In [1]:
#!python process_subset.py 0

^C


In [11]:
import os
import torch
from torch.utils.data import IterableDataset

class ChunkedTextDataset(IterableDataset):
    def __init__(self, major_index_str):
        current_directory = os.getcwd()
        self.base_path = os.path.join(current_directory, 'datasets')
        self.subset_folder = os.path.join(self.base_path, f'subset_{major_index_str}')
        
        # Figure out the number of chunks by counting files in the directory
        self.num_chunks = sum(1 for file in os.listdir(self.subset_folder) if file.startswith(f'chunk_') and file.endswith('.pth'))

    def __iter__(self):
        for i in range(self.num_chunks):
            chunk = torch.load(os.path.join(self.subset_folder, f'chunk_{i}.pth'))
            x_data = chunk['data']
            y_data = chunk['labels']
            for x, y in zip(x_data, y_data):
                yield x, y

In [12]:
# Load the dataset
dataset = ChunkedTextDataset('0')  # Assuming '0' for subset 0 as an example
print(dataset.subset_folder)
print(dataset.num_chunks)

# Fetch and print the first 20 elements
for idx, (x, y) in enumerate(dataset):
    if idx >= 20:
        break
    print(f"Data {idx + 1} - Input: {x}, Target: {y}")

D:\Machine_Learning\MinGPT\datasets\subset_0
8
Data 1 - Input: tensor([   49, 16228, 50017,  ...,   481,   466,  2279], dtype=torch.int32), Target: 3306
Data 2 - Input: tensor([1194, 1755,  286,  ...,   13,  198,  198], dtype=torch.int32), Target: 30
Data 3 - Input: tensor([2437,  750,  262,  ...,   12,   57,  295], dtype=torch.int32), Target: 1042
Data 4 - Input: tensor([  670,    13, 47332,  ...,   287,   262,  3765], dtype=torch.int32), Target: 3265
Data 5 - Input: tensor([ 290,  663,  598,  ...,  257, 2041, 2597], dtype=torch.int32), Target: 355
Data 6 - Input: tensor([1022, 5878,  290,  ..., 3489, 6096,  286], dtype=torch.int32), Target: 428
Data 7 - Input: tensor([  663, 33739,  6424,  ..., 12986, 20428,    13], dtype=torch.int32), Target: 198
Data 8 - Input: tensor([5922, 1964, 3303,  ...,  287,  262, 2095], dtype=torch.int32), Target: 290
Data 9 - Input: tensor([  286,  3449, 19851,  ..., 28866,   312, 13843], dtype=torch.int32), Target: 13
Data 10 - Input: tensor([11518,   402

In [5]:
chunk = torch.load('D:/Machine_Learning/MinGPT/datasets/subset_0/chunk_2.pth')

In [25]:
x_data = chunk['data']
y_data = chunk['labels']

idx = 0
for x, y in zip(x_data, y_data):
    if idx >= 20:
        break
    print(f"Data {idx + 1} - Input: {x}, Target: {y}")
    idx += 1

Data 1 - Input: tensor([  464, 22361,  5407,   468,   587,  4953,   257,   890,   640,   329,
        22361, 40129, 19239,    11,   262,   717, 17214,   290, 29871,  7072,
          422, 22361, 40129, 13616,  4657,    11,   284,  1280,   511,  8215,
          319, 40030,   417, 12761,    13,  1318,   423,   587,  2407,   257,
         1178, 19649,   290,   866,   329,  4393, 17542,   327,   320,  2879,
          290,  5180,   360,   669,  1531,    78,   625,   262,   938,  5193,
         1933,    11,   475,  1306,  1285,   484,   481,  3443,  1280,   530,
          286, 14905,   447,   247,    82,   749, 14486, 10808,   286,  1853,
           13,   198,   198,  6385,   484,   442, 16898,   546,   511,  3352,
          284,  1280,   510,   257,  7072,   287,   257,  2211,  9905,  4471,
          286, 27574,   632,  3205,    11,   356,   447,   247,   303,  1900,
          428,   373,   691,   257,  2300,   286,   640,    13,   775, 19538,
          306,  9672,   484,   447,   247,    67

In [18]:
import os

save_directory = os.path.join(os.getcwd(), 'datasets')
subsets = [f'subset_{i}' for i in range(10)]  # Assuming you have subsets from 01 to 99

corrupted_files = []

for subset in subsets:
    subset_path = os.path.join(save_directory, subset)
    for file in os.listdir(subset_path):
        file_path = os.path.join(subset_path, file)
        try:
            data = torch.load(file_path)
        except RuntimeError:
            corrupted_files.append(file_path)

print(f"Number of Corrupted files: {len(corrupted_files)}")
print("Corrupted files:")
print("\n".join(corrupted_files))

Number of Corrupted files: 0
Corrupted files:



In [13]:
from mingpt.bpe import BPETokenizer
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from multiprocessing import Process, Queue, cpu_count, Value, Lock
from queue import Empty
import time
from filelock import FileLock
import shutil  # Added for shutil.move


def tokenize_generator(text, encoder, context_length=1024):
    try:
        tokenized_text = encoder.encode(text)
        total_length = len(tokenized_text)
        stride = context_length // 8

        # For very short texts
        if total_length <= stride:
            yield np.array(tokenized_text[:-1], dtype=np.long), np.array(tokenized_text[-1], dtype=np.long)
            
        # For texts longer than the stride but shorter than the context length
        elif total_length <= context_length:
            for end_idx in range(stride, total_length + 1, stride):
                yield np.array(tokenized_text[:end_idx-1], dtype=np.long), np.array(tokenized_text[end_idx-1], dtype=np.long)
            
            # If there are any tokens left at the end that couldn't fit a full stride window
            if (total_length - 1) % stride != 0:
                yield np.array(tokenized_text[-stride:-1], dtype=np.long), np.array(tokenized_text[-1], dtype=np.long)
                
        # For longer texts
        else:
            # Gradually increase the window size until it's the size of the context length
            for end_idx in range(stride, context_length + 1, stride):
                yield np.array(tokenized_text[:end_idx-1], dtype=np.long), np.array(tokenized_text[end_idx-1], dtype=np.long)
                
            # Now slide the window by the stride for the rest of the text
            for start_idx in range(stride, total_length - context_length + 1, stride):
                end_idx = start_idx + context_length
                yield np.array(tokenized_text[start_idx:end_idx-1], dtype=np.long), np.array(tokenized_text[end_idx-1], dtype=np.long)

            # If there are any tokens left at the end that couldn't fit a full context_length window
            if end_idx < total_length:
                yield np.array(tokenized_text[-context_length:-1], dtype=np.long), np.array(tokenized_text[-1], dtype=np.long)
                
    except Exception as e:
        print(f"Error during tokenization: {e}")



In [17]:
text = "This is a sample text for testing the tokenizer function. The minGPT library is three files: mingpt/model.py contains the actual Transformer model definition, mingpt/bpe.py contains a mildly refactored Byte Pair Encoder that translates between text and sequences of integers exactly like OpenAI did in GPT, mingpt/trainer.py is (GPT-independent) PyTorch boilerplate code that trains the model. Then there are a number of demos and projects that use the library in the projects folder:"
for i in range(30):
    sample_text += text
from mingpt.bpe import BPETokenizer
encoder = BPETokenizer().encoder
for tokens in tokenize_generator(sample_text, encoder):
    print(tokens)

(array([ 1212,   318,   257,  6291,  2420,   329,  4856,   262, 11241,
        7509,  2163,    13,  1212,   318,   257,  6291,  2420,   329,
        4856,   262, 11241,  7509,  2163,    13,  1212,   318,   257,
        6291,  2420,   329,  4856,   262, 11241,  7509,  2163,    13,
        1212,   318,   257,  6291,  2420,   329,  4856,   262, 11241,
        7509,  2163,    13,  1212,   318,   257,  6291,  2420,   329,
        4856,   262, 11241,  7509,  2163,    13,  1212,   318,   257,
        6291,  2420,   329,  4856,   262, 11241,  7509,  2163,    13,
        1212,   318,   257,  6291,  2420,   329,  4856,   262, 11241,
        7509,  2163,    13,  1212,   318,   257,  6291,  2420,   329,
        4856,   262, 11241,  7509,  2163,    13,  1212,   318,   257,
        6291,  2420,   329,  4856,   262, 11241,  7509,  2163,    13,
        1212,   318,   257,  6291,  2420,   329,  4856,   262, 11241,
        7509,  2163,    13,  1212,   318,   257,  6291,  2420,   329,
        4856]), arr

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  yield np.array(tokenized_text[:end_idx-1], dtype=np.long), np.array(tokenized_text[end_idx-1], dtype=np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  yield np.array(tokenized_text[start_idx:end_idx-1], dtype=np.long), np.array(tokenized_text[end_idx-1], dtype=np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  yield np.array(tokenized_text[-context_length:-1], dtype=np.long), np.array(tokenized_text[-1], dtype=np.long)
