In [1]:
import os
import torch
from torch.utils.data import DataLoader, IterableDataset

In [2]:
def _append_prefix_samples(tokens, x_list, y_list):
    """Append one (prefix, next-token) pair per position 1..len(tokens)-1.

    Each input is ``tokens[:i]`` and its label is ``tokens[i]``, both stored
    as ``torch.long`` tensors. Sequences shorter than two tokens add nothing.
    """
    if len(tokens) < 2:
        return
    # Convert the window once and slice the tensor, instead of rebuilding a
    # tensor from an ever-growing Python list slice for every prefix.
    token_tensor = torch.tensor(tokens, dtype=torch.long)
    for i in range(1, len(tokens)):
        # clone() gives every sample its own storage, as the saved chunks
        # had when each prefix was built with torch.tensor(...).
        x_list.append(token_tensor[:i].clone())
        y_list.append(token_tensor[i].clone())


def _save_chunk(save_path, major_index_str, chunk_count, x_list, y_list):
    """Write the accumulated samples to ``subset_<idx>_chunk_<n>.pth``."""
    dataset_chunk = {'data': x_list, 'labels': y_list}
    chunk_file = f'subset_{major_index_str}_chunk_{chunk_count}.pth'
    torch.save(dataset_chunk, os.path.join(save_path, chunk_file))


def process_subset(major_index_str, encoder, chunk_size=500000,
                   base_folder='D:/Machine_Learning/MinGPT/extracted_tar_openwebtext',
                   context_length=1024, output_root=None):
    """Tokenise one subset of text files and save next-token samples in chunks.

    Every folder in ``base_folder`` whose name starts with
    ``urlsf_subset<NN>`` (``major_index_str`` zero-padded to two digits) is
    scanned for ``.txt`` files. Each file is encoded, cut into consecutive
    non-overlapping windows of ``context_length`` tokens, and every window is
    expanded into (prefix, next-token) pairs. Tokens after the last full
    window of a long document are discarded; documents shorter than
    ``context_length`` are used as a single window.

    Samples accumulate in memory and are flushed to
    ``<output_root>/subset_<idx>/subset_<idx>_chunk_<n>.pth`` once at least
    ``chunk_size`` samples are held (checked after each file, so a chunk may
    exceed ``chunk_size``). Any remainder is written at the end.

    Args:
        major_index_str: Subset index as a string, e.g. ``'0'``.
        encoder: Object exposing ``encode(text)`` returning a list of ints.
        chunk_size: Minimum number of samples that triggers a flush to disk.
        base_folder: Directory holding the extracted ``urlsf_subset*`` folders.
        context_length: Window length in tokens; must be at least 1.
        output_root: Directory that receives the ``subset_<idx>`` folder.
            Defaults to ``<current working directory>/datasets``.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    if output_root is None:
        output_root = os.path.join(os.getcwd(), 'datasets')
    save_path = os.path.join(output_root, f'subset_{major_index_str}')
    os.makedirs(save_path, exist_ok=True)

    folder_prefix = f'urlsf_subset{major_index_str.zfill(2)}'

    x_list = []
    y_list = []
    chunk_count = 0
    completed_folders = 0

    for folder in sorted(os.listdir(base_folder)):
        if not folder.startswith(folder_prefix):
            continue

        full_path = os.path.join(base_folder, folder)

        # Sorted so the contents of each chunk do not depend on the order in
        # which the operating system happens to list the directory.
        for txt_file in sorted(os.listdir(full_path)):
            if not txt_file.endswith('.txt'):
                continue

            with open(os.path.join(full_path, txt_file), 'r', encoding='utf-8') as file:
                text = file.read()
            # The file is already closed here; tokenising does not need it.
            tokenized_text = encoder.encode(text)

            if len(tokenized_text) >= context_length:
                last_start = len(tokenized_text) - context_length
                for start_idx in range(0, last_start + 1, context_length):
                    window = tokenized_text[start_idx:start_idx + context_length]
                    _append_prefix_samples(window, x_list, y_list)
            else:
                _append_prefix_samples(tokenized_text, x_list, y_list)

            # Flush once enough samples are buffered, then start a new chunk.
            if len(x_list) >= chunk_size:
                _save_chunk(save_path, major_index_str, chunk_count, x_list, y_list)
                x_list = []
                y_list = []
                chunk_count += 1

        completed_folders += 1
        print(f"\rTotal completed: {completed_folders}     Folder name: {folder}")

    # Save any remaining data
    if x_list:
        _save_chunk(save_path, major_index_str, chunk_count, x_list, y_list)

    print(f"Chunks saved for subset {major_index_str}!")

In [33]:
class ChunkedTextDataset(IterableDataset):
    """Stream (input, target) pairs from the chunk files of one subset.

    Chunk files are expected at
    ``<base_path>/subset_<idx>/subset_<idx>_chunk_<n>.pth`` with ``n``
    running from 0 to ``num_chunks - 1``. Each file holds a dict with the
    keys ``'data'`` and ``'labels'``, which are iterated in parallel.

    Args:
        major_index_str: Subset index as a string, e.g. ``'0'``.
        base_path: Directory containing the ``subset_<idx>`` folder.
            Defaults to ``<current working directory>/datasets``.
    """

    def __init__(self, major_index_str, base_path=None):
        self.major_index_str = major_index_str
        if base_path is None:
            base_path = os.path.join(os.getcwd(), 'datasets')
        self.base_path = base_path
        self.subset_folder = os.path.join(self.base_path, f'subset_{major_index_str}')

        # Figure out the number of chunks by counting files in the directory
        chunk_prefix = f'subset_{major_index_str}_chunk_'
        self.num_chunks = sum(
            1
            for file_name in os.listdir(self.subset_folder)
            if file_name.startswith(chunk_prefix) and file_name.endswith('.pth')
        )

    def __iter__(self):
        """Yield ``(x, y)`` pairs chunk by chunk.

        In a DataLoader worker process only every ``num_workers``-th chunk is
        read, starting at the worker's id. Without this split each worker
        would iterate the full dataset and every sample would be yielded once
        per worker. Outside a worker all chunks are read in order.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            first_chunk, stride = 0, 1
        else:
            first_chunk, stride = worker_info.id, worker_info.num_workers

        for i in range(first_chunk, self.num_chunks, stride):
            chunk_file = f'subset_{self.major_index_str}_chunk_{i}.pth'
            # NOTE(review): torch.load unpickles the file; only point this at
            # chunk files you produced yourself.
            chunk = torch.load(os.path.join(self.subset_folder, chunk_file))
            for x, y in zip(chunk['data'], chunk['labels']):
                yield x, y

In [4]:
# Tokenizer that supplies the encoder handed to process_subset.
from mingpt.bpe import BPETokenizer

# process_subset only calls `.encode(text)` on this object.
encoder = BPETokenizer().encoder

In [5]:
# Tokenise subset '0' and write its chunk files under ./datasets/subset_0.
# NOTE: the recorded output below ends in KeyboardInterrupt, so this run was
# stopped early and the saved chunks may not cover the whole subset.
process_subset('0', encoder)

Total completed: 1     Folder name: urlsf_subset00-1000_data.FullName
Total completed: 2     Folder name: urlsf_subset00-100_data.FullName
Total completed: 3     Folder name: urlsf_subset00-101_data.FullName
Total completed: 4     Folder name: urlsf_subset00-102_data.FullName
Total completed: 5     Folder name: urlsf_subset00-103_data.FullName
Total completed: 6     Folder name: urlsf_subset00-104_data.FullName
Total completed: 7     Folder name: urlsf_subset00-105_data.FullName
Total completed: 8     Folder name: urlsf_subset00-106_data.FullName
Total completed: 9     Folder name: urlsf_subset00-107_data.FullName


KeyboardInterrupt: 

In [34]:
# Open subset '0' (used here only as an example) and report where it lives
# and how many chunk files were found.
dataset = ChunkedTextDataset('0')
print(dataset.subset_folder)
print(dataset.num_chunks)

# Preview at most the first 20 (input, target) pairs, numbered from 1.
for idx, (x, y) in zip(range(20), dataset):
    print(f"Data {idx + 1} - Input: {x}, Target: {y}")

D:\Machine_Learning\MinGPT\datasets\subset_0
7
Data 1 - Input: tensor([1532]), Target: 345
Data 2 - Input: tensor([1532,  345]), Target: 2107
Data 3 - Input: tensor([1532,  345, 2107]), Target: 10522
Data 4 - Input: tensor([ 1532,   345,  2107, 10522]), Target: 290
Data 5 - Input: tensor([ 1532,   345,  2107, 10522,   290]), Target: 389
Data 6 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389]), Target: 20623
Data 7 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389, 20623]), Target: 281
Data 8 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389, 20623,   281]), Target: 7283
Data 9 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389, 20623,   281,  7283]), Target: 1268
Data 10 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389, 20623,   281,  7283,  1268]), Target: 329
Data 11 - Input: tensor([ 1532,   345,  2107, 10522,   290,   389, 20623,   281,  7283,  1268,
          329]), Target: 257
Data 12 - Input: tensor([ 1532,   345,  2107, 10522,   290,   