In [302]:
pip install openai pymupdf


Note: you may need to restart the kernel to use updated packages.


**Install `python-dotenv` to be able to load the `.env` file that holds the OpenAI API key:**


In [304]:
pip install python-dotenv 


Note: you may need to restart the kernel to use updated packages.


In [305]:
import os
import random
import time
from pathlib import Path

import fitz  # PyMuPDF
from dotenv import load_dotenv
from openai import OpenAI

# Load OPENAI_API_KEY from the local .env file into the process environment.
dotenv_path = Path("D:/Python/Samples_VS/pythonLearning/OpenAI_key.env")
load_dotenv(dotenv_path=dotenv_path)

# Only the `OpenAI` class is imported in this notebook, so assigning to
# `openai.api_key` would raise NameError on a fresh kernel. The client is built
# later from the OPENAI_API_KEY environment variable, so all that is needed
# here is to fail early, with a clear message, when the key did not load.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; check that {dotenv_path} exists and defines it."
    )


In [306]:
# === CONFIGURATION ===
# Source PDF that main() opens and walks page by page
# (raw string so the Windows backslashes are taken literally).
pdf_path = r"D:\Python\Samples_VS\pythonLearning\C1_german\wortschatz.pdf"
# Folder that receives the per-page word lists and the generated texts.
output_dir = r"D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts"
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# === FUNCTION: Extract text from a page ===
def extract_text_from_page(pdf_doc, page_number):
    """Return the text of one page of an open PDF document.

    Args:
        pdf_doc: Document object exposing ``load_page(page_number)``.
        page_number: Index of the page, passed straight to ``load_page``.

    Returns:
        Whatever ``get_text()`` yields for the loaded page.
    """
    return pdf_doc.load_page(page_number).get_text()

# === FUNCTION: Extract words from a page (basic split) ===
def extract_words_from_text(text):
    """Turn raw page text into a list of vocabulary entries, one per kept line.

    The last four lines of the page are always dropped (presumably a page
    footer -- confirm against the PDF layout). Of the remaining lines, one is
    kept, stripped of surrounding whitespace, only if it is non-blank, shorter
    than 40 characters after stripping, and does not contain ' S. '.

    Args:
        text: Full text of a single page.

    Returns:
        List of stripped lines in their original order; empty when the page
        has four lines or fewer.
    """
    words = []
    for raw_line in text.splitlines()[:-4]:
        candidate = raw_line.strip()
        if not candidate:
            continue  # blank or whitespace-only line
        if len(candidate) >= 40:
            continue  # too long to be a single vocabulary entry
        if ' S. ' in raw_line:
            continue  # checked on the unstripped line, as before
        words.append(candidate)
    return words


# === FUNCTION: Ask GPT for a story ===
def ask_gpt_for_story(page_num, story_num, words, text_style):
    """Request one learner text from the chat model and write it to a file.

    Builds a prompt that embeds ``text_style`` and the comma-joined ``words``,
    sends it as a single user message to the ``gpt-4o`` chat model, and saves
    the stripped reply as ``<page_num+1>_<story_num+1>_text.txt`` inside the
    module-level ``output_dir``.

    Args:
        page_num: Zero-based page index; used (plus one) in the file name and messages.
        story_num: Zero-based story index; used (plus one) in the file name and messages.
        words: Iterable of strings to be woven into the text.
        text_style: Short description of the kind of text to ask for.

    Returns:
        None. Any exception raised by the request or the file write is
        reported with ``print`` and followed by a 10-second pause; it is not
        re-raised and the request is not retried.
    """
    joined_words = ", ".join(words)
    prompt = f"""
Imagine that you are a teacher of German who wants adult learners to learn German advanced words, C1 level,
using inductive language acquisition. Write me a pedogogical and engagig {text_style} (around 400 words) 
in simple German, in which you use these words:

{joined_words}. The point is that the reader learns these new words passively as he reads the text.

Before the text, list the words under "Neue Wörter in diesem Text:" with the equevalent English word in parenthesis.
Inside the text, 
Importantly, make the meaning of each word easy to guess, using techniques like:
– breaking compound words to the elements, and taking advantage of ethymological similarity with English, e.g., Er war Rast.los (rest.less),
- breakoing down coumpund words and make the meaning clearer: beschleunigung --> be.schenell.igung
– using examples or oter German synonyms right after a word between commas.
– if the word is hard to be guessed, like abstract concepts, then adding English equivalents in parentheses is 
also absolutely ok.
try to write the text in a way that apart from the 'new words', other expressions would be easy to understand.
The goal is not the perfect text or story, but language learning. Avoid complicated grammar. Use the words naturally, 
but clearly. prevent from using phrases, like, it means, ... , or this word means, ..., just say what it means, 
especially with examples or synonyms. In the text, when a 'new word' from the list is used, print it in bold.

"""

    # The client is created outside the try block, so a failure to construct
    # it propagates to the caller instead of being printed.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    chat_messages = [{"role": "user", "content": prompt}]

    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.7,
            max_tokens=1000,
        )
        story = completion.choices[0].message.content.strip()
        filename = os.path.join(output_dir, f"{page_num+1}_{story_num+1}_text.txt")
        with open(filename, "w", encoding="utf-8") as out_file:
            out_file.write(story)
        print(f"✔ Saved story to {filename}")
    except Exception as err:
        print(f"❌ Error on page {page_num+1}, story {story_num+1}: {err}")
        time.sleep(10)  # Back off before the caller moves on to the next request





In [307]:
# === MAIN LOOP ===
def main(stories_per_page=10, max_words_per_story=15, pause_seconds=5):
    """Generate practice texts for every page of the configured PDF.

    For each page of ``pdf_path``: extract the candidate words, save them to
    ``<page>_words.txt`` in ``output_dir``, then request ``stories_per_page``
    texts, each built from a random sample of the page's words and a randomly
    chosen text style. Pages that yield no words are skipped.

    Args:
        stories_per_page: Number of texts requested per page.
        max_words_per_story: Upper bound on the number of words sampled for
            one text; pages with fewer words use all of them.
        pause_seconds: Seconds to sleep after each request, to stay under
            the API rate limits.

    Returns:
        None. Progress is reported with ``print``.
    """
    # The possible styles do not depend on the page, so build the list once.
    styles = [
        "realistic dialogue",
        "creative fictional or realistic story",
        "non-fiction documentation-style report",
    ]

    pdf_doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_doc)):
            print(f"📄 Processing page {page_num+1}")
            text = extract_text_from_page(pdf_doc, page_num)
            words = extract_words_from_text(text)

            if not words:
                print(f"⚠ No words found on page {page_num+1}, skipping.")
                continue

            # Save extracted words to file
            word_file = os.path.join(output_dir, f"{page_num+1}_words.txt")
            with open(word_file, "w", encoding="utf-8") as f:
                f.write("\n".join(words))
            print(f"📁 Saved words to {word_file}")

            # random.sample raises if asked for more items than exist, hence the cap.
            subset_size = min(len(words), max_words_per_story)

            # Generate the stories with random subsets of words
            for story_num in range(stories_per_page):
                selected_words = random.sample(words, subset_size)
                # Randomly choose a style (with equal weights)
                style_instruction = random.choice(styles)
                ask_gpt_for_story(page_num, story_num, selected_words, style_instruction)
                time.sleep(pause_seconds)  # To avoid hitting rate limits
    finally:
        # Release the PDF handle even when a page fails part-way through.
        pdf_doc.close()

    print("✅ All done.")

if __name__ == "__main__":
    main()


📄 Processing page 1
📁 Saved words to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_words.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_1_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_2_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_3_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_4_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_5_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_6_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_7_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_8_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_german\generated_texts\1_9_text.txt
✔ Saved story to D:\Python\Samples_VS\pythonLearning\C1_germa