In [5]:
from datasets import load_dataset

# Load WikiText-103
try:
    dataset = load_dataset("wikitext", "wikitext-103-v1")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit(1)

# List available splits
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [7]:
# Access the training split
train_dataset = dataset['train']

# Print the number of samples
print(f"Number of training samples: {len(train_dataset)}")

# Print the first 3 samples
for i in range(20):
    sample = train_dataset[i]
    print(f"Sample {i}: {sample}")

Number of training samples: 1801350
Sample 0: {'text': ''}
Sample 1: {'text': ' = Valkyria Chronicles III = \n'}
Sample 2: {'text': ''}
Sample 3: {'text': ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n'}
Sample 4: {'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it 

In [8]:
validation_data = dataset['validation']
test_data = dataset['test']

In [9]:
from datasets import load_dataset
from tqdm import tqdm
import re

# Load WikiText-103
dataset = load_dataset("wikitext", "wikitext-103-v1")
train_dataset = dataset['train']

# Function to clean text
def clean_text(text):
    # Remove non-ASCII characters
    text = text.encode('ascii', errors='ignore').decode()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters (retain basic punctuation)
    text = re.sub(r'[^A-Za-z0-9.,;:!?\'\"()\[\]{}\- ]+', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Filter out empty texts and clean the remaining
cleaned_texts = []
for sample in tqdm(train_dataset, desc="Cleaning and filtering data"):
    text = sample['text']
    if text:  # Ensure text is not empty
        cleaned = clean_text(text)
        if cleaned:  # Ensure cleaned text is not empty
            cleaned_texts.append(cleaned)

print(f"Number of cleaned samples: {len(cleaned_texts)}")

Cleaning and filtering data: 100%|██████████| 1801350/1801350 [00:31<00:00, 56677.25it/s]

Number of cleaned samples: 1164953





In [10]:
output_file = "wikitext103_train_cleaned.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for text in tqdm(cleaned_texts, desc="Saving cleaned data"):
        f.write(text + "\n\n")  # Double newline as a separator

print(f"Cleaned training data saved to {output_file}")

Saving cleaned data: 100%|██████████| 1164953/1164953 [00:00<00:00, 1190187.75it/s]

Cleaned training data saved to wikitext103_train_cleaned.txt





In [77]:
import random

def split_dataset(file_path, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=42):
    # Check that the ratios add up to 1
    assert train_ratio + valid_ratio + test_ratio == 1.0, "Ratios must sum to 1"
    
    # Read the full text file
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    # Shuffle the data
    random.seed(seed)
    random.shuffle(lines)
    
    # Determine split indices
    total_lines = len(lines)
    train_end = int(total_lines * train_ratio)
    valid_end = train_end + int(total_lines * valid_ratio)
    
    # Split the data
    train_lines = lines[:train_end]
    valid_lines = lines[train_end:valid_end]
    test_lines = lines[valid_end:]
    
    # Write to separate files
    with open('wiki.train.txt', 'w') as f:
        f.writelines(train_lines)
    with open('wiki.valid.txt', 'w') as f:
        f.writelines(valid_lines)
    with open('wiki.test.txt', 'w') as f:
        f.writelines(test_lines)
    
    print(f"Data split into {len(train_lines)} training, {len(valid_lines)} validation, and {len(test_lines)} test lines.")

# Example usage
split_dataset('data.txt')

Data split into 1863924 training, 232990 validation, and 232992 test lines.
