In [None]:
# Necessary downloads to run the code
!pip install transformers torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# import all packages
import transformers
from transformers import AutoTokenizer, PhiForCausalLM
import re
from tqdm import tqdm
import time
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the model and tokenizer for the Fietje-2 model.
# Both objects are bound at notebook level; simplify_paragraph() reads
# `model` and `tokenizer` as globals, so this cell must run before it is called.
model_name = "BramVanroy/fietje-2"  # identifier passed to both from_pretrained calls
model = PhiForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Function for reading the file
def read_file(filename):
    """Return the complete contents of a text file as one string.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, which matches the encoding simplify_letter uses when it
    writes results and keeps accented characters intact on every platform.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

# Function for splitting the texts into sections and paragraphs
def split_text_into_sections(text):
    """Parse a letter into a list of sections.

    Sections are separated by two blank lines (three consecutive newlines) and
    paragraphs inside a section by one blank line. The first line of a section
    is its heading; whatever follows that line inside the first paragraph
    becomes the section's first paragraph (an empty string if nothing follows).

    Args:
        text: The raw letter text.

    Returns:
        A list of dicts, each with a 'heading' string and a 'paragraphs' list
        of strings, in document order.
    """
    sections = []
    for block in text.split('\n\n\n'):
        # str.split always yields at least one element, so the unpacking is safe.
        opening, *remaining = block.split('\n\n')
        # Everything before the first newline is the heading, the rest is body text.
        heading, _, body = opening.partition('\n')
        sections.append({'heading': heading, 'paragraphs': [body, *remaining]})
    return sections

# Function for simplifying text with the few-shot prompting approach
def simplify_paragraph(text, max_length=500):
    """Simplify one paragraph with the globally loaded model using a few-shot prompt.

    The paragraph is inserted into a Dutch few-shot prompt, the module-level
    `model` generates a continuation, and only the newly generated tokens are
    decoded. Anything from a following "Complex:" marker onwards is discarded.

    Args:
        text: The paragraph to simplify.
        max_length: Accepted for backward compatibility and not used. The
            generation budget is derived from the paragraph itself: the model
            may produce as many new tokens as the paragraph has tokens.

    Returns:
        An empty string when `text` is empty or whitespace-only; `text`
        unchanged when generation raises ValueError; otherwise the stripped
        simplified text.
    """
    if not text.strip():
        return ""

    prompt = f"""
        Ik wil dat je mijn complexe zin vervangt met een simpele zin.
        De betekenis van de zin moet hetzelfde blijven, maar maak het makkelijker.
        Complex: Ten aanzien van de naleving van de nieuwe wetgeving is het van belang dat alle betrokken partijen op de hoogte zijn van de specifieke verplichtingen en verantwoordelijkheden die hieruit voortvloeien.
        Simpel: Voor de naleving van de nieuwe wet is het belangrijk dat iedereen weet welke verplichtingen en verantwoordelijkheden er zijn.
        Complex: De overheid heeft besloten dat, in het kader van de nieuwe subsidieregeling, organisaties die in aanmerking willen komen voor financiële ondersteuning, moeten voldoen aan een reeks strengere criteria, welke zijn opgesteld om de efficiëntie en effectiviteit van de verstrekte subsidies te waarborgen.
        Simpel: De overheid heeft nieuwe regels voor subsidies. Organisaties die subsidies willen, moeten nu aan strengere criteria voldoen. Deze criteria zorgen ervoor dat subsidies beter worden gebruikt.
        Complex: {text}
        Simpel:
        """
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    text_tokenized = tokenizer.encode(text, return_tensors="pt")

    # Number of prompt tokens; the generated sequence starts with exactly these.
    input_length = len(inputs[0])
    # Allow as many new tokens as the paragraph itself has. This is the same
    # budget as max_length = prompt tokens + paragraph tokens.
    new_token_budget = len(text_tokenized[0])

    # Generate the output if possible, otherwise return original paragraph
    try:
        outputs = model.generate(
            inputs,
            max_new_tokens=new_token_budget,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
    except ValueError as e:
        print(f"Error processing paragraph: {text}")
        print(f"Error: {e}")
        return text

    # Drop the prompt at token level. Removing it from the decoded string with
    # str.replace only works if decoding reproduces the prompt byte-for-byte;
    # when it does not, the instruction text would leak into the result.
    generated_tokens = outputs[0][input_length:]
    simplified_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # The model may continue the few-shot pattern; keep only the first answer.
    if "Complex:" in simplified_text:
        simplified_text = simplified_text.split("Complex:")[0].strip()

    return simplified_text

# Build the new text
def rebuild_text(sections):
    """Assemble the output document from parsed sections.

    Every heading is copied verbatim. Until the heading
    "Met vriendelijke groet," is reached, each paragraph is emitted twice:
    once labelled as the original and once as the output of
    simplify_paragraph. From that heading onwards paragraphs are copied
    unchanged. Progress is shown per section and the elapsed wall-clock time
    is printed at the end.

    Args:
        sections: List of dicts with 'heading' and 'paragraphs' keys.

    Returns:
        The assembled text with leading and trailing whitespace stripped.
    """
    chunks = []
    still_simplifying = True
    started_at = time.time()

    with tqdm(total=len(sections), desc="Processing Sections") as progress:
        for section in sections:
            heading = section['heading']
            chunks.append(heading + '\n\n')

            # The closing salutation marks the end of the part worth simplifying.
            if heading == "Met vriendelijke groet,":
                still_simplifying = False

            for paragraph in section['paragraphs']:
                if not still_simplifying:
                    chunks.append(paragraph + '\n')
                    continue
                chunks.append('Original: \n' + paragraph + '\n\n')
                chunks.append('Simplified: \n' + simplify_paragraph(paragraph) + '\n\n')

            progress.update(1)

    elapsed_time = time.time() - started_at
    print(f"Processing completed in {elapsed_time:.2f} seconds.")

    return ''.join(chunks).strip()

# Function for simplifying a letter and writing it to a file
def simplify_letter(text, output_file):
    """Simplify a whole letter and write the result to `output_file` as UTF-8.

    The letter is split with split_text_into_sections, rebuilt with
    rebuild_text, and the resulting text is written to disk.

    Args:
        text: The raw letter text.
        output_file: Path of the file to write. Missing parent directories
            are created.

    Raises:
        OSError: If the output directory or file cannot be created.
    """
    # Create the destination directory before the slow model run, so a missing
    # folder cannot throw away minutes of generation when the file is opened.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    sections = split_text_into_sections(text)
    new_text = rebuild_text(sections)
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(new_text)

# Function for generating the filename for the output
def generate_filename(input_filename):
    """Derive the output path for a given input file name.

    The last extension is removed with os.path.splitext and the result is
    placed under "simplified/" with the suffix "_raw_simplified.txt".
    Unlike splitting on the last '.', splitext leaves dotfiles such as
    ".hidden" intact (they would otherwise collapse to an empty stem) and
    never cuts inside a dotted directory component.

    Args:
        input_filename: Name of the input file, e.g. "letter.txt".

    Returns:
        The output path, e.g. "simplified/letter_raw_simplified.txt".
    """
    filename_without_extension = os.path.splitext(input_filename)[0]
    return "simplified/" + filename_without_extension + '_raw_simplified.txt'

# Function to loop through all files in the folder
def process_all_files_in_folder(folder_path):
    """Simplify every regular file directly inside `folder_path`.

    Directory entries that are not regular files are skipped. The files are
    filtered before the progress bar is created so its total reflects the
    real amount of work, and they are processed in sorted name order because
    os.listdir returns entries in arbitrary order.

    Args:
        folder_path: Directory containing the input letters.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
    """
    file_names = sorted(
        name
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )

    for file in tqdm(file_names, desc="Processing files"):
        full_file_path = os.path.join(folder_path, file)
        text = read_file(full_file_path)
        output_file = generate_filename(file)
        simplify_letter(text, output_file)

In [None]:
# Process all files in the specified folder.
# Output paths come from generate_filename, which places them under "simplified/".
# This cell is slow: the recorded run below shows about 16 minutes in total.
path = 'data_formatted_part9/'
process_all_files_in_folder(path)

Processing files:   0%|          | 0/5 [00:00<?, ?it/s]
Processing Sections:   0%|          | 0/8 [00:00<?, ?it/s][A
Processing Sections:  25%|██▌       | 2/8 [00:18<00:55,  9.32s/it][A
Processing Sections:  38%|███▊      | 3/8 [00:52<01:36, 19.36s/it][A
Processing Sections:  62%|██████▎   | 5/8 [01:46<01:11, 23.71s/it][A
Processing Sections:  75%|███████▌  | 6/8 [02:26<00:56, 28.26s/it][A
Processing Sections: 100%|██████████| 8/8 [02:41<00:00, 20.13s/it][A
Processing files:  40%|████      | 2/5 [02:41<04:01, 80.51s/it]

Processing completed in 161.03 seconds.



Processing Sections:   0%|          | 0/9 [00:00<?, ?it/s][A
Processing Sections:  22%|██▏       | 2/9 [00:17<00:59,  8.57s/it][A
Processing Sections:  33%|███▎      | 3/9 [01:28<03:29, 34.86s/it][A
Processing Sections:  44%|████▍     | 4/9 [02:32<03:48, 45.76s/it][A
Processing Sections:  56%|█████▌    | 5/9 [03:15<02:58, 44.70s/it][A
Processing Sections:  67%|██████▋   | 6/9 [03:32<01:46, 35.63s/it][A
Processing Sections:  78%|███████▊  | 7/9 [04:09<01:11, 35.96s/it][A
Processing Sections: 100%|██████████| 9/9 [05:21<00:00, 35.77s/it][A
Processing files:  60%|██████    | 3/5 [08:02<06:02, 181.12s/it]

Processing completed in 321.96 seconds.



Processing Sections:   0%|          | 0/6 [00:00<?, ?it/s][A
Processing Sections:  17%|█▋        | 1/6 [00:17<01:26, 17.25s/it][A
Processing Sections:  33%|███▎      | 2/6 [00:52<01:50, 27.67s/it][A
Processing Sections:  50%|█████     | 3/6 [01:41<01:52, 37.54s/it][A
Processing Sections:  67%|██████▋   | 4/6 [01:54<00:55, 27.90s/it][A
Processing Sections: 100%|██████████| 6/6 [02:18<00:00, 23.05s/it][A
Processing files:  80%|████████  | 4/5 [10:21<02:45, 165.14s/it]

Processing completed in 138.31 seconds.



Processing Sections:   0%|          | 0/10 [00:00<?, ?it/s][A
Processing Sections:  20%|██        | 2/10 [00:26<01:47, 13.41s/it][A
Processing Sections:  30%|███       | 3/10 [02:14<06:08, 52.71s/it][A
Processing Sections:  40%|████      | 4/10 [03:49<06:51, 68.51s/it][A
Processing Sections:  50%|█████     | 5/10 [04:20<04:36, 55.40s/it][A
Processing Sections:  60%|██████    | 6/10 [04:36<02:49, 42.25s/it][A
Processing Sections:  70%|███████   | 7/10 [05:11<02:00, 40.14s/it][A
Processing Sections:  80%|████████  | 8/10 [05:27<01:04, 32.37s/it][A
Processing Sections: 100%|██████████| 10/10 [05:37<00:00, 33.80s/it][A
Processing files: 100%|██████████| 5/5 [15:59<00:00, 191.85s/it]

Processing completed in 337.96 seconds.



