# Load a text file and convert it to a csv file for model evaluation

# In [14]:
import re
import pandas as pd

# Step 1: Load the text file
file_path = "../data/lotr.txt"  # Path to your text file
with open(file_path, mode='r', encoding='utf-8') as file:
    # Read the whole file, then keep only the first 100000 characters.
    text = file.read()[:100000]

print(f"Number of characters in text: {len(text)}")

# Step 2: Preprocess the text
def clean_text(text):
    """
    Collapse every run of whitespace into one space and trim both ends.
    """
    # Any whitespace run (spaces, tabs, line breaks) becomes a single space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Drop the space that may remain at the start or end.
    return collapsed.strip()

# Normalize whitespace in the loaded text before it is split into words.
cleaned_text = clean_text(text)

# Step 3: Split the text into segments
def create_segments(text, segment_length=100, overlap=50):
    """
    Split text into fixed-size word segments with optional overlap.

    Args:
        text: Input string; it is split into words with ``str.split()``.
        segment_length: Number of words in every segment (must be positive).
        overlap: Number of words shared by consecutive segments. Must be
            smaller than ``segment_length``; a negative value leaves a gap
            between segments instead of an overlap.

    Returns:
        A list of strings, each holding exactly ``segment_length`` words
        joined by single spaces. Trailing words that do not fill a complete
        segment are dropped, so text with fewer than ``segment_length`` words
        yields an empty list.

    Raises:
        ValueError: If ``segment_length`` is not positive or ``overlap`` is
            not smaller than ``segment_length``.
    """
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive integer")
    if overlap >= segment_length:
        raise ValueError("overlap must be smaller than segment_length")

    words = text.split()
    step = segment_length - overlap
    # The last full window starts at len(words) - segment_length; the +1 makes
    # that start index inclusive so the final complete segment is not lost.
    last_start = len(words) - segment_length
    return [
        " ".join(words[start:start + segment_length])
        for start in range(0, last_start + 1, step)
    ]

# 100-word segments; consecutive segments start 50 words apart (segment_length - overlap).
segments = create_segments(cleaned_text, segment_length=100, overlap=50)

# Step 4: Create prompts and references
def create_dataset(segments, prompt_length=50):
    """
    Turn each segment into a prompt/reference pair.

    The first ``prompt_length`` words of a segment form the prompt and the
    remaining words form the reference. Returns a list of dicts with the
    keys "prompt" and "reference", one per segment, in input order.
    """
    def _to_pair(segment):
        tokens = segment.split()
        head = tokens[:prompt_length]
        tail = tokens[prompt_length:]
        return {"prompt": " ".join(head), "reference": " ".join(tail)}

    return [_to_pair(segment) for segment in segments]

# Build the prompt/reference pairs; the first 50 words of each segment form the prompt.
dataset = create_dataset(segments, prompt_length=50)

# Step 5: Save the dataset to a file
output_path = "lotr_dataset.csv"
df = pd.DataFrame(dataset)
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(output_path, index=False)

print(f"Dataset saved to {output_path}")


# Output:
# Number of characters in text: 100000
# Dataset saved to lotr_dataset.csv
