# Scene Beat to Prose

Train the LLM to write paragraphs in my writing style, based on scene beats.

In [8]:
from mlx_lm import load, generate
import json
import os
from tqdm import tqdm

def separate_chapters(text):
    """Split the raw manuscript text into chapters.

    A chapter boundary is placed in front of every line whose *following*
    line begins with a German weekday name, so the line directly above the
    weekday line opens the new chapter (presumably the chapter heading --
    confirm against the manuscript layout).

    Args:
        text: The full text, with lines separated by '\n'.

    Returns:
        A list of chapter strings, each with its lines re-joined by '\n'.
    """
    # str.startswith accepts a tuple and matches if any prefix matches.
    weekday_prefixes = (
        'Montag', 'Dienstag', 'Mittwoch', 'Donnerstag',
        'Freitag', 'Samstag', 'Sonntag',
    )
    rows = text.split('\n')
    last_index = len(rows) - 1

    result = []
    buffer = []
    for idx, row in enumerate(rows):
        next_is_weekday = idx < last_index and rows[idx + 1].startswith(weekday_prefixes)
        # Close the running chapter before the line that precedes a weekday line.
        if next_is_weekday and buffer:
            result.append('\n'.join(buffer))
            buffer = []
        buffer.append(row)

    # Whatever is left over forms the final chapter.
    if buffer:
        result.append('\n'.join(buffer))

    return result

def chunk_chapters(chapters, max_words=250):
    """Cut every chapter into chunks of at most ``max_words`` words.

    Lines are never split: a chunk is closed as soon as adding the next line
    would push it over the limit, so a single line longer than ``max_words``
    ends up as its own (oversized) chunk. Chunks never span two chapters.

    Args:
        chapters: Iterable of chapter strings with '\n'-separated lines.
        max_words: Word budget per chunk (words are whitespace-separated).

    Returns:
        A flat list of chunk strings, in reading order.
    """
    pieces = []

    def _flush(buffered_rows):
        # Only non-empty buffers become chunks.
        if buffered_rows:
            pieces.append('\n'.join(buffered_rows))

    for chapter in chapters:
        buffered_rows, running_total = [], 0

        for row in chapter.split('\n'):
            n_words = len(row.split())
            if running_total + n_words <= max_words:
                buffered_rows.append(row)
                running_total += n_words
                continue
            # The row does not fit: close the chunk and start a new one with it.
            _flush(buffered_rows)
            buffered_rows, running_total = [row], n_words

        # The remainder of a chapter is emitted before the next chapter starts.
        _flush(buffered_rows)

    return pieces

def paragraph_to_scene_beats(chunk, chunk_id, model, tokenizer):
    """Ask the model to reconstruct the scene beat(s) behind one prose chunk.

    Args:
        chunk: The prose text to summarise into scene beats.
        chunk_id: Identifier of the chunk; only used to label error output.
        model: Model object handed through to ``generate``.
        tokenizer: Tokenizer providing ``apply_chat_template``; also handed
            through to ``generate``.

    Returns:
        The generated scene-beat text. If generation raises, the error is
        printed and the string ``"EXCEPTION: <error>"`` is returned instead,
        so callers must check for that prefix before using the result.
    """
    # The German instruction text (including its indentation) is part of the
    # model input and is kept exactly as is.
    instruction = f"""
    Du bist ein professioneller Autor und arbeitest an einer Fantasy-Szene. Vor dir liegt ein Textparagraph, und deine Aufgabe ist es, die zugrundeliegenden Scene Beats zu rekonstruieren. Jeder Beat sollte die Motivationen der Charaktere, die äusseren Umstände und das Ziel der Szene zusammenfassen, aber auf einfache und knappe Weise.

    Hier ist ein Beispiel für einen Scene Beat, wie du ihn schreiben sollst:

    - Die Krieger bereiten sich auf die letzte Schlacht vor. Der Himmel ist düster und bedrohlich. Die Krieger sind angespannt, hören nur ihre eigene Ausrüstung und Atmung. Auf das Kommando ihres Königs stürmen sie entschlossen voran.

    Der Paragraph:

    "{chunk}"

    Antworte nur mit den Scene Beat(s), ohne weitere Einleitung oder Erklärung.
    """

    # Wrap the instruction in the model's chat format as a single user turn.
    messages = [{"role": "user", "content": instruction}]
    chat_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    try:
        # NOTE(review): `temp=` is accepted by the mlx_lm version this notebook
        # was run with; other releases may expect a sampler object instead --
        # confirm against the installed version, because a TypeError here would
        # be caught below and reported for every chunk.
        return generate(model, tokenizer, prompt=chat_prompt, verbose=False, temp=0.7, max_tokens=1000)
    except Exception as e:
        # Best effort: report which chunk failed and hand back a marker string
        # instead of aborting the whole batch.
        print(f"Error generating scene beats for chunk {chunk_id}: {e}")
        return f"EXCEPTION: {e}"

# Function to generate scene beats for chunks
def generate_scene_beats(input_file, output_file):
    """Generate scene beats for every chunk in ``input_file`` (resumable).

    Each line of ``input_file`` is a JSON object with ``id`` and ``chunk``.
    For every chunk without a usable result yet, the notebook-level
    ``model`` / ``tokenizer`` are used to produce scene beats, and a record
    ``{"id", "chunk", "scene_beats"}`` is appended to ``output_file`` as one
    JSON line, flushed immediately so an interrupted run can be resumed.

    Failed generations (``paragraph_to_scene_beats`` returns a string starting
    with ``"EXCEPTION:"``) are not written, and failure records left in
    ``output_file`` by earlier runs do not count as processed, so those chunks
    are retried on the next run instead of ending up in the training data.
    A stale failure line from an earlier run is not removed from the file.

    Args:
        input_file: Path of the JSONL file with the chunks.
        output_file: Path of the JSONL file results are appended to.
    """
    failure_marker = "EXCEPTION:"

    # Only the ids are needed to decide what to skip, so keep a set of ids
    # rather than every full record.
    done_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                record = json.loads(line)
                beats = record.get('scene_beats', '')
                if isinstance(beats, str) and beats.startswith(failure_marker):
                    continue  # earlier failure: leave the chunk eligible for retry
                done_ids.add(record['id'])

    # Process chunks and generate scene beats
    with open(input_file, 'r', encoding='utf-8') as in_file, \
            open(output_file, 'a', encoding='utf-8') as out_file:
        for line in tqdm(in_file):
            if not line.strip():
                continue
            data = json.loads(line)
            chunk_id = data['id']

            # Skip if already processed
            if chunk_id in done_ids:
                continue

            scene_beats = paragraph_to_scene_beats(data['chunk'], chunk_id, model, tokenizer)

            # Do not persist failures; the helper has already printed the error.
            if isinstance(scene_beats, str) and scene_beats.startswith(failure_marker):
                continue

            result = {
                "id": chunk_id,
                "chunk": data['chunk'],
                "scene_beats": scene_beats
            }

            json.dump(result, out_file)
            out_file.write('\n')
            out_file.flush()  # Ensure data is written immediately
            done_ids.add(chunk_id)

# Load model and tokenizer
model, tokenizer = load("models/frdm-Llama-3.1-8B-Write")

# Read and process the text
# A context manager closes the file handle deterministically. The encoding is
# pinned because the manuscript is German text with umlauts
# (assumes the file is stored as UTF-8 -- TODO confirm).
with open('data/ansturm.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chapters = separate_chapters(text)
chunked_chapters = chunk_chapters(chapters)

# Save chunked chapters to a JSONL file
# The running index is the chunk id that generate_scene_beats uses to skip
# chunks that already have a result.
chunked_file = 'data/prose_paragraphs.jsonl'
with open(chunked_file, 'w', encoding='utf-8') as f:
    for i, chunk in enumerate(chunked_chapters):
        json.dump({"id": i, "chunk": chunk}, f)
        f.write('\n')

# Generate scene beats
output_file = 'data/prose_paragraphs_with_beats.jsonl'
generate_scene_beats(chunked_file, output_file)

print("Scene beats generation completed.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
238it [22:26,  5.66s/it]

Scene beats generation completed.





In [1]:
import json

def create_dataset_entry(scene_beats, paragraph):
    """Serialise one chat-format training example as a JSON string.

    The example is a three-turn conversation: a fixed system prompt, the
    scene beats as the user turn and the original prose as the assistant turn.

    Args:
        scene_beats: Text used as the user message.
        paragraph: Text used as the assistant message.

    Returns:
        A single-line JSON string of the form ``{"messages": [...]}``.
    """
    system_prompt = "You are the fantasy author Yvo K. Each time I prompt you with a scene beat, write the full scene based on the idea. Do not conclude the scene on your own, follow the beat instructions closely. Do not end with foreshadowing."
    turns = (
        ("system", system_prompt),
        ("user", scene_beats),
        ("assistant", paragraph),
    )
    payload = {
        "messages": [{"role": speaker, "content": content} for speaker, content in turns]
    }
    return json.dumps(payload)

def create_dataset(input_file, output_file):
    """Convert beat-annotated chunks into chat-format training records.

    Reads ``input_file`` line by line (one JSON object with ``scene_beats``
    and ``chunk`` per line) and writes one ``create_dataset_entry`` JSON line
    per usable record to ``output_file``, overwriting any existing file.

    Records whose ``scene_beats`` starts with ``"EXCEPTION:"`` are skipped:
    that is the marker ``paragraph_to_scene_beats`` returns when generation
    failed, and an error message must not become a training prompt.
    Blank lines are ignored.

    Args:
        input_file: Path of the JSONL file with ``scene_beats`` and ``chunk``.
        output_file: Path of the JSONL dataset to write.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue
            data = json.loads(line)
            scene_beats = data['scene_beats']
            # Drop failed generations instead of training on the error text.
            if isinstance(scene_beats, str) and scene_beats.startswith("EXCEPTION:"):
                continue
            dataset_entry = create_dataset_entry(scene_beats, data['chunk'])
            outfile.write(dataset_entry + '\n')

# Create the dataset
# Reads the beat-annotated chunks and writes one chat-format training record
# per line. NOTE: this rebinds the notebook-level name `output_file`, which
# the scene-beat cell also assigns.
input_file = 'data/prose_paragraphs_with_beats.jsonl'
output_file = 'data/scene_beat_to_prose_dataset.jsonl'
create_dataset(input_file, output_file)

print("Dataset creation completed.")

Dataset creation completed.


In [2]:
import json
import os
import random

def create_train_valid_split(input_file, output_folder, train_ratio=0.8, seed=42):
    """Shuffle a JSONL dataset and split it into train and validation files.

    Writes ``train.jsonl`` and ``valid.jsonl`` into ``output_folder`` (created
    if missing) and prints the resulting sample counts.

    Args:
        input_file: Path of the JSONL file to split, one sample per line.
        output_folder: Directory that receives the two output files.
        train_ratio: Fraction of the samples that goes into the train file;
            the count is ``int(n_samples * train_ratio)``.
        seed: Seed for the shuffle, making the split reproducible.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Read all samples. Every line is normalised to end in exactly one '\n'
    # so that a final line without a trailing newline cannot be glued to the
    # following record once the order is shuffled; blank lines are not samples.
    with open(input_file, 'r', encoding='utf-8') as infile:
        lines = [line.rstrip('\n') + '\n' for line in infile if line.strip()]

    # Shuffle with a private generator: it yields the same permutation as
    # seeding the module-level generator with `seed`, but leaves the global
    # random state of the notebook untouched.
    random.Random(seed).shuffle(lines)

    # Calculate the split index
    split_index = int(len(lines) * train_ratio)

    # Split the data
    train_data = lines[:split_index]
    valid_data = lines[split_index:]

    # Write train and validation data
    for file_name, subset in (('train.jsonl', train_data), ('valid.jsonl', valid_data)):
        with open(os.path.join(output_folder, file_name), 'w', encoding='utf-8') as outfile:
            outfile.writelines(subset)

    print(f"Train-validation split created in {output_folder}")
    print(f"Train samples: {len(train_data)}")
    print(f"Validation samples: {len(valid_data)}")

# Usage
# The dataset name is used as the output sub-folder under data/.
dataset_name = "frdm-Llama-3.1-8B-Write-Beat-to-Prose-v1"

# Split the chat-format dataset using the function's default ratio and seed.
input_file = 'data/scene_beat_to_prose_dataset.jsonl'
output_folder = f'data/{dataset_name}'
create_train_valid_split(input_file, output_folder)

Train-validation split created in data/frdm-Llama-3.1-8B-Write-Beat-to-Prose-v1
Train samples: 190
Validation samples: 48
