In [None]:
import os
import random
import concurrent.futures
from tqdm import tqdm

def process_file(args):
    """Append one text file to an output file and report the characters it contains.

    The work is described by a single tuple so the function can be handed to
    ``executor.map``, which passes exactly one argument per call.

    Args:
        args: A 4-tuple ``(directory, filename, output_file, vocab)``. The file
            ``directory/filename`` is read as UTF-8 and its full text is
            appended to ``output_file``. The fourth element is unpacked but
            never used.

    Returns:
        set: The distinct characters found in the file. An empty set is
        returned, a message is printed, and nothing is appended when the file
        cannot be read.
    """
    source_dir, source_name, destination, _unused_vocab = args
    source_path = os.path.join(source_dir, source_name)

    try:
        with open(source_path, "r", encoding="utf-8") as reader:
            contents = reader.read()
    except Exception as err:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Skipping file {source_path} due to error: {err}")
        return set()
    else:
        # Append mode: successive calls accumulate in the same output file.
        with open(destination, "a", encoding="utf-8") as writer:
            writer.write(contents)
        return set(contents)

def txt_files_in_dir(directory):
    """List the names of the regular ``.txt`` files directly inside ``directory``.

    Args:
        directory: Path of the folder to scan. Sub-folders are not descended into.

    Returns:
        list[str]: Bare file names (no directory part), sorted by name. Entries
        whose name ends in ``.txt`` but that are not regular files (for example
        a directory called ``foo.txt``) are left out.
    """
    # os.listdir() yields entries in arbitrary order; sorting makes the result,
    # and any positional train/validation split built on top of it, repeatable.
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".txt") and os.path.isfile(os.path.join(directory, name))
    )

def process_files_in_parallel(files, folder_path, output_file, max_workers=4):
    """Run ``process_file`` over ``files`` in worker processes and merge the results.

    Args:
        files: Iterable of file names, relative to ``folder_path``.
        folder_path: Directory that contains ``files``.
        output_file: Path handed to every worker; ``process_file`` appends each
            file's text to it.
        max_workers: Size of the process pool. Defaults to 4, the value that
            used to be hard-coded.

    Returns:
        set: Union of the character sets returned for the individual files.

    Note:
        Each worker appends to ``output_file`` on its own, so the order in which
        texts land in that file follows worker scheduling rather than the order
        of ``files``.
    """
    vocab = set()
    # process_file expects a 4-tuple but ignores the last slot. Send an empty
    # placeholder instead of the accumulating set, so the growing vocabulary is
    # not serialized along with the tasks.
    tasks = [(folder_path, filename, output_file, set()) for filename in files]
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in task order; tqdm only needs the count.
        for characters in tqdm(executor.map(process_file, tasks), total=len(tasks)):
            vocab.update(characters)
    return vocab

# --- Configuration -----------------------------------------------------------
folder_path = "/content/TXT_FILES/"
output_file_train = "output_train.txt"
output_file_val = "output_val.txt"
vocab_file = "vocab.txt"
train_fraction = 0.9  # share of the files assigned to the training split
sample_rate = 0.01    # sample a hundredth of the files from each split
sample_seed = 42      # fixed seed so the same files are sampled on every run

# Files written by this cell. They end in ".txt" as well, so when they live in
# folder_path a re-run would otherwise pick them up as inputs.
generated_paths = {
    os.path.abspath(path) for path in (output_file_train, output_file_val, vocab_file)
}

# Sorted so the positional split below does not depend on directory order.
files = sorted(
    name
    for name in txt_files_in_dir(folder_path)
    if os.path.abspath(os.path.join(folder_path, name)) not in generated_paths
)
print(files[:10])  # preview only; the full listing can be very long
total_files = len(files)
print(total_files)

split_index = int(total_files * train_fraction)
print(split_index)
files_train = files[:split_index]
files_val = files[split_index:]

# Take at least one file per split, but never more than the split contains:
# random.sample raises ValueError when asked for more items than are available,
# which used to happen whenever a split was empty.
sampler = random.Random(sample_seed)
n_train = min(len(files_train), max(1, int(len(files_train) * sample_rate)))
n_val = min(len(files_val), max(1, int(len(files_val) * sample_rate)))
files_train_sampled = sampler.sample(files_train, n_train)
files_val_sampled = sampler.sample(files_val, n_val)

# Ensure output files are empty before the workers start appending to them
for path in (output_file_train, output_file_val):
    with open(path, "w", encoding="utf-8"):
        pass

# Process the sampled training files
vocab_train = process_files_in_parallel(files_train_sampled, folder_path, output_file_train)

# Process the sampled validation files
vocab_val = process_files_in_parallel(files_val_sampled, folder_path, output_file_val)

# Combine both vocabularies and write one character per line to vocab.txt
vocab = vocab_train.union(vocab_val)
with open(vocab_file, "w", encoding="utf-8") as vfile:
    for char in sorted(vocab):
        vfile.write(char + '\n')


['output_train.txt', 'output_val.txt']
2
1


100%|██████████| 1/1 [00:00<00:00, 123.71it/s]
100%|██████████| 1/1 [00:00<00:00, 264.74it/s]


In [None]:
# Download the corpus to the exact path the next cell reads from
# (/content/Data/dataset.txt); create the folder first in case it is missing.
!mkdir -p /content/Data
!wget -O /content/Data/dataset.txt https://raw.githubusercontent.com/Infatoshi/fcc-intro-to-llms/refs/heads/main/wizard_of_oz.txt

--2025-01-01 10:08:38--  https://raw.githubusercontent.com/Infatoshi/fcc-intro-to-llms/refs/heads/main/wizard_of_oz.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 237733 (232K) [text/plain]
Saving to: ‘dataset.txt’


2025-01-01 10:08:38 (10.2 MB/s) - ‘dataset.txt’ saved [237733/237733]



In [None]:
import os

# Source corpus and the two destinations for the split
file_path = "/content/Data/dataset.txt"
output_file_train = "output_train.txt"
output_file_val = "output_val.txt"

# Load the whole corpus into memory as a single string
with open(file_path, "r", encoding="utf-8") as source:
    text = source.read()

# Character offset where the validation tail starts: the first 90% of the
# characters go to training, the remaining 10% to validation.
split_index = int(len(text) * 0.9)
train_text, val_text = text[:split_index], text[split_index:]

# Write the training part first, then the validation part
for destination, part in ((output_file_train, train_text), (output_file_val, val_text)):
    with open(destination, "w", encoding="utf-8") as sink:
        sink.write(part)

# Report where each part went and how many characters it holds
print(f"Training data saved to {output_file_train}, size: {len(train_text)} characters")
print(f"Validation data saved to {output_file_val}, size: {len(val_text)} characters")


Training data saved to output_train.txt, size: 209078 characters
Validation data saved to output_val.txt, size: 23231 characters
