In [None]:
# This is already installed on Kaggle
# !pip install -U -q google-generativeai

In [None]:
import pandas as pd
import numpy as np
import os

import json
import time
import re

import google.generativeai as genai


In [None]:
from kaggle_secrets import UserSecretsClient

# Read the Gemini API key from the Kaggle secrets client so that it is not
# hard-coded in the notebook.
user_secrets = UserSecretsClient()
GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")

In [None]:
# Number of story rows the Phase 1 loop attempts to create.
NUM_ROWS_TO_CREATE = 280

# Set to True to print results while the generation loops run
# (the loops only print details for iterations 0-10).
PRINT_OUTPUT = True

## Create a list of moral values

In [None]:
# Each entry has the form "<value name> – <short description>". The full string
# is passed to run_story_prompt_agent as the moral value for a story.
value_list = [
"Kindness – Showing care and consideration for others, especially those in need.",
"Honesty – Valuing truth and integrity in words and actions.",
"Courage – Encouraging bravery to face fears and challenges.",
"Responsibility – Teaching accountability and the importance of fulfilling commitments.",
"Respect – Valuing other people’s rights, opinions, and differences.",
"Perseverance – Promoting resilience and the determination to keep going despite challenges.",
"Forgiveness – Encouraging letting go of grudges and offering second chances.",
"Gratitude – Emphasizing thankfulness for what one has and the kindness of others.",
"Teamwork – Highlighting the importance of working together and supporting one another.",
"Fairness – Teaching equity and justice, where everyone is treated equally.",
"Generosity – Encouraging giving to others, particularly those less fortunate.",
"Patience – Fostering calmness and the ability to wait without frustration.",
"Empathy – Helping children understand and care about the feelings of others.",
"Humility – Promoting modesty and recognizing the strengths and contributions of others.",
"Self-discipline – Teaching control over impulses and making responsible choices.",
"Gender Equality – Promoting equal rights and opportunities for all genders.",
"Religious Tolerance – Encouraging understanding and acceptance of different faiths and beliefs.",
"Strength in Diversity – Celebrating the value of different cultures, backgrounds, and perspectives.",
"Racial Equality – Fostering respect and equality among all races, promoting fairness and inclusion.",
"Environmental Stewardship – Encouraging responsibility for protecting the planet and its resources.",
"Compassion for Animals – Promoting kindness and care for all living creatures.",
"Peacefulness – Teaching peaceful conflict resolution and the importance of non-violence.",
"Gratitude for Simple Things – Encouraging appreciation of everyday moments and small acts of kindness.",
"Sharing – Teaching the value of sharing what we have with others.",
"Independence and Self-reliance – Encouraging confidence in one’s abilities and the importance of taking initiative.",
"Speaking Out Against Injustice – Instilling the courage to stand up against unfairness, discrimination, or wrongdoing. This value teaches children that they have the power and responsibility to challenge injustice, whether it is directed at them or others, and to advocate for fairness and equality in their communities.",
"Being On Time – Teaching the value of punctuality and respect for others’ time, reinforcing discipline and reliability.",
"Striving for Excellence – Instilling a mindset of doing one’s best and aiming for high standards in all pursuits, whether in school, hobbies, or personal goals.",
]

# Display the number of moral values in the list
len(value_list)

28

## Set up the agents

In [None]:
# Phase 1 Agents

def run_story_prompt_agent(prompt_language, moral_value):
    """Ask the LLM to write a story-generation prompt for one moral value.

    Args:
        prompt_language: Language the generated prompt must be written in.
        moral_value: Moral value the story prompt has to include.

    Returns:
        A tuple ``(meta_prompt, generated_prompt)``: the instructions that
        were sent to the model and the text of the model's reply.
    """

    meta_prompt = f"""
    You are an expert at creating prompts that LLMs can use to generate
    South African children's stories that teach children moral values
    and life lessons. You will be given a language and a moral value.
    Write a prompt in the specified language. The story should always be in South African Afrikaans.
    The prompt should include the moral value. Use only a max of three sentences.
    The prompt should emphasize that the title and text should be written in natural South African Afrikaans.
    Germanisms and Dutchisms should not be used.

    ### Story Language:
    South African Afrikaans

    ### Prompt Language:
    {prompt_language}

    ### Moral value:
    {moral_value}

    ### Prompt:
    """

    # The reply text is itself the prompt used later to write the story.
    # `model` is the notebook-level Gemini model object.
    generated_prompt = model.generate_content(meta_prompt).text

    return meta_prompt, generated_prompt


def run_first_draft_agent(prompt):
    """Generate the first draft of a story from a story prompt.

    Args:
        prompt: The story prompt to send to the model.

    Returns:
        The object returned by ``model.generate_content``; the caller reads
        its ``.text`` attribute to get the story.
    """
    # `model` is the notebook-level Gemini model object.
    return model.generate_content(prompt)

In [None]:
# Phase 2 Agents

def _extract_json_text(raw_text):
    """Isolate the JSON object inside a raw LLM reply.

    Markdown code fences are stripped first. The text is then trimmed to the
    span from the first '{' to the last '}', so that a differently tagged
    fence (for example ```JSON) or a sentence before or after the object does
    not break parsing. When no such span exists the stripped text is returned
    unchanged and ``json.loads`` reports the problem.
    """
    cleaned = raw_text.replace('```json', "").replace('```', "").strip()

    start = cleaned.find('{')
    end = cleaned.rfind('}')

    if start != -1 and end > start:
        return cleaned[start:end + 1]

    return cleaned


def run_final_draft_agent(story):
    """Have the LLM edit a first draft and report the changes it made.

    Args:
        story: The first draft of the story (title and text).

    Returns:
        A tuple ``(final_draft, changes_made)`` holding the values of the
        ``revised_story`` and ``changes_made`` keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: The reply does not contain parseable JSON.
        KeyError: The JSON reply lacks one of the two expected keys.
        Any error raised by the model call propagates unchanged.
    """

    prompt = f"""
    You are an expert South African Afrikaans children's story editor.
    You will be given the first draft of a children's story.
    Your task is to improve the story. Correct any errors or missing elements in the title and text.
    The title and text should be written in natural South African Afrikaans.
    Germanisms and Dutchisms should not be used.
    You should also explain what changes you made to improve the story. Use English when explaining.
    Output your response as JSON with two keys: revised_story, changes_made

    ### Story Language:
    South African Afrikaans

    ### Story:
    {story}

    ### Revised story:
    """

    # Ask the model for the revised story (`model` is the notebook-level
    # Gemini model object)
    response = model.generate_content(prompt)

    # Models often wrap JSON in code fences or add a remark around it, so
    # keep only the JSON object before parsing
    json_text = _extract_json_text(response.text)

    # Convert JSON string to Python dictionary
    json_response = json.loads(json_text)

    # Get the final draft of the story and the editor's explanation
    final_draft = json_response['revised_story']
    changes_made = json_response['changes_made']

    return final_draft, changes_made


def run_english_translation_agent(story):
    """Translate an Afrikaans story (title and text) into South African English.

    Args:
        story: The Afrikaans story to translate.

    Returns:
        The text of the model's reply, i.e. the English translation.
    """

    # The section header below used to read "Tranlation Language"; the
    # spelling is fixed because this text is sent to the model verbatim.
    prompt = f"""
    You are an expert at translating South African Afrikaans stories into South African English.
    You will be given a story.
    Your task is to translate the title and the story into English.

    ### Story Language:
    South African Afrikaans

    ### Translation Language:
    South African English

    ### Story:
    {story}

    ### English translation:
    """

    # Translate the story (`model` is the notebook-level Gemini model object)
    response = model.generate_content(prompt)

    # Extract the text
    translation = response.text

    return translation


## Phase 1
- Use an LLM to generate a story prompt
- Use the prompt to generate a first draft
- Store everything in a dataframe

In [None]:
# Initialize genai
import google.generativeai as genai

genai.configure(api_key=GOOGLE_API_KEY)

# The model object that the agent functions read as the global `model`
model = genai.GenerativeModel("gemini-1.5-flash-002")

# Required size of the dataset
num_stories = NUM_ROWS_TO_CREATE

# Languages in which the story prompts are written
language_list = ['British English', 'Afrikaans']

# j indexes language_list and k indexes value_list; both wrap around to 0.
j = 0
k = 0
num_requests = 0

# One entry is appended to each of these lists per loop iteration
input_for_prompt_gen_list = []
prompt_list = []
prompt_lang_list = []
moral_value_list = []
first_draft_list = []

for i in range(num_stories):

    print(f"--{i}--")

    num_requests += 1

    # Gemini Flash free limit is 15 API requests per minute and every loop
    # makes 2 API requests. Once the loop counter exceeds 3, pause for
    # approx. 1 minute (67 seconds) and start counting again from zero.
    if num_requests > 3:
        print("---waiting--")
        time.sleep(67)
        num_requests = 0

    # API requests can fail, therefore include exception handling
    try:
        # Pick the prompt language and moral value for this story
        prompt_language = language_list[j]
        moral_value = value_list[k]

        # Request 1: create the story prompt
        input_for_prompt_gen, first_draft_prompt = run_story_prompt_agent(prompt_language, moral_value)

        # Request 2: create the first draft of the story
        first_draft = run_first_draft_agent(first_draft_prompt).text

        # Move on to the next language/value only after both requests
        # succeeded; the modulo restarts each index at the beginning.
        j = (j + 1) % len(language_list)
        k = (k + 1) % len(value_list)

    except Exception as e:
        print('--Exception error--')
        print(e)

        # Mark every field of this row as failed
        input_for_prompt_gen = first_draft_prompt = prompt_language = moral_value = first_draft = "Error"

    # Save
    input_for_prompt_gen_list.append(input_for_prompt_gen)
    prompt_list.append(first_draft_prompt)
    prompt_lang_list.append(prompt_language)
    moral_value_list.append(moral_value)
    first_draft_list.append(first_draft)

    # Only the first iterations are printed to keep the output short
    if PRINT_OUTPUT == True and i <= 10:
        print(f"Prompt language: {prompt_language}")
        print(f"Moral_value: {moral_value}")
        print(f"Prompt: {first_draft_prompt}")
        print()
        print(f"Story: {first_draft}")
        print()

--0--
Prompt language: British English
Moral_value: Kindness – Showing care and consideration for others, especially those in need.
Prompt: Write a South African Afrikaans children's story titled "[Insert a suitable title here]" that teaches children about the importance of kindness. The story must feature natural South African Afrikaans, avoiding Germanisms and Dutchisms, and showcase how showing care and consideration for others, especially those in need, can make a positive difference.


Story: ## Klein Karoo se Groot Hart

Klein Karoo was 'n klein jakkals met 'n baie groot neus en 'n nog groter hart. Hy't in die Karoo gewoon, tussen die dorings en die rooibruin berge.  Hy was nie soos die ander jakkalse nie. Terwyl hulle altyd op soek was na maklike prooi, het Karoo eerder gehelp waar hy kon.

Een dag, toe hy langs die droë rivier loop, sien hy 'n ou, swak olifantjie wat amper nie kon staan nie.  Die olifantjie se poot was seer, en hy't swaar gehoes. Sy oë was vol trane.

Die ander

In [None]:
# Collect the Phase 1 results, one column per list
data = dict(
    input_for_prompt_gen=input_for_prompt_gen_list,
    prompt=prompt_list,
    prompt_language=prompt_lang_list,
    moral_value=moral_value_list,
    first_draft=first_draft_list,
)

df_story = pd.DataFrame(data)

# Write the first drafts to disk
path = "afrikaans_childrens_stories.csv"
df_story.to_csv(path, index=False)

print(df_story.shape)

df_story.head()

(280, 5)


Unnamed: 0,input_for_prompt_gen,prompt,prompt_language,moral_value,first_draft
0,\n You are an expert at creating prompts th...,Write a South African Afrikaans children's sto...,British English,Kindness – Showing care and consideration for ...,## Klein Karoo se Groot Hart\n\nKlein Karoo wa...
1,\n You are an expert at creating prompts th...,Skep 'n Suid-Afrikaanse kinderverhaal in natuu...,Afrikaans,Honesty – Valuing truth and integrity in words...,"## Klein Karoo se Groot Geheim\n\nKlein Karoo,..."
2,\n You are an expert at creating prompts th...,Write a South African children's story in Afri...,British English,Courage – Encouraging bravery to face fears an...,## Klein Klein Leeutjie se Groot Moed\n\nKlein...
3,\n You are an expert at creating prompts th...,Skryf 'n Suid-Afrikaanse kinderverhaal in natu...,Afrikaans,Responsibility – Teaching accountability and t...,## Klein Katjie se Groot Belofte\n\nKlein Katj...
4,\n You are an expert at creating prompts th...,Write a South African children's story in Afri...,British English,"Respect – Valuing other people’s rights, opini...","## Respek vir almal, 'n Rykdom vir almal!\n\n..."


## Phase 2
- Use an LLM to review the first draft
- Generate a final draft
- Generate a report in English detailing the changes made to the first draft
- Translate the final draft into English

In [None]:
# One entry is appended to each of these lists per row of df_story
final_draft_list = []
changes_made_list = []
translation_list = []

for i in range(len(df_story)):

    print(f"--{i}--")

    # API requests can fail, therefore include exception handling
    try:
        # Get the first draft
        first_draft = df_story.loc[i, 'first_draft']

        if first_draft != "Error":
            # Write a final draft
            final_draft, changes_made = run_final_draft_agent(first_draft)

            # Translate the final draft into English
            translation = run_english_translation_agent(final_draft)
        else:
            # Phase 1 already failed for this row, so there is nothing to edit
            final_draft = changes_made = translation = "Error"

    except Exception as e:
        print('--Exception error--')
        print(e)

        # Mark every Phase 2 field of this row as failed
        final_draft = changes_made = translation = "Error"

    # Save
    final_draft_list.append(final_draft)
    changes_made_list.append(changes_made)
    translation_list.append(translation)

    # Only the first iterations are printed to keep the output short
    if PRINT_OUTPUT == True and i <= 10:
        print("===Final draft===")
        print(final_draft)
        print()
        print("===Changes made in final draft===")
        print(changes_made)
        print()
        print("===English translation===")
        print(translation)

--0--
===Final draft===
## Klein Karoo se Groot Hart

Klein Karoo was 'n klein jakkals met 'n groot neus en 'n nog groter hart. Hy het in die Karoo gewoon, tussen die dorings en die rooibruin berge. Hy was nie soos die ander jakkalse nie.  Terwyl hulle altyd op soek was na maklike prooi, het Karoo eerder gehelp waar hy kon.

Een dag, terwyl hy langs die droë rivier geloop het, sien hy 'n ou, swak olifantjie wat amper nie kon staan nie. Die olifantjie se poot was seer, en hy het swaar gehoes. Sy oë was vol trane.

Die ander jakkalse sou weggehardloop het, bang vir die groot dier. Maar nie Karoo nie. Hy het nader gekruip, sy groot neus effens getrek.

"Wat's fout, ou maatjie?" het hy saggies gevra.

Die olifantjie het 'n swak "Grrr..." gekreun. Hy kon skaars praat.

Karoo het gesien die olifantjie se poot was geswel en vol dorings. Hy het met sy klein bek begin om die dorings versigtig uit te trek. Dit het seergemaak, maar die olifantjie het stil gebly, vertrouend op Karoo se sagte hande

In [None]:
# Attach the Phase 2 results to the dataframe as new columns
phase2_columns = {
    'final_draft': final_draft_list,
    'changes_made': changes_made_list,
    'translation': translation_list,
}
for column_name, column_values in phase2_columns.items():
    df_story[column_name] = column_values

# Write the unfiltered results to disk
path = "raw_afrikaans_childrens_stories.csv"
df_story.to_csv(path, index=False)

print(df_story.shape)

df_story.head(3)

(280, 8)


Unnamed: 0,input_for_prompt_gen,prompt,prompt_language,moral_value,first_draft,final_draft,changes_made,translation
0,\n You are an expert at creating prompts th...,Write a South African Afrikaans children's sto...,British English,Kindness – Showing care and consideration for ...,## Klein Karoo se Groot Hart\n\nKlein Karoo wa...,## Klein Karoo se Groot Hart\n\nKlein Karoo wa...,"The original story was already quite good, but...",## Little Karoo's Big Heart\n\nLittle Karoo wa...
1,\n You are an expert at creating prompts th...,Skep 'n Suid-Afrikaanse kinderverhaal in natuu...,Afrikaans,Honesty – Valuing truth and integrity in words...,"## Klein Karoo se Groot Geheim\n\nKlein Karoo,...",## Klein Karoo se Groot Bessie-Geheim\n\nKlein...,The following changes were made to improve the...,## Little Karoo's Big Berry Secret\n\nLittle K...
2,\n You are an expert at creating prompts th...,Write a South African children's story in Afri...,British English,Courage – Encouraging bravery to face fears an...,## Klein Klein Leeutjie se Groot Moed\n\nKlein...,"## Leo se Groot Moed\n\nKlein Leo, of Leon soo...",The following changes were made to improve the...,"## Leo's Great Courage\n\nLittle Leo, or Leon ..."


## Phase 3 - Clean the data
- Remove "Error" rows
- Remove markdown formatting
- Remove swear words
- Remove incomplete stories
- Remove duplicate rows


### 1- Remove "Error" rows

In [None]:
# Flag every row in which at least one cell contains this value: "Error"
has_error = df_story.isin(['Error']).any(axis=1)

# Keep the remaining rows and renumber them from zero
df_story = df_story.loc[~has_error].reset_index(drop=True)

df_story.shape

(270, 8)

### 2- Remove markdown formatting

In [None]:
# Remove markup formatting symbols from the final draft

def remove_markdown(text):
    """Strip markdown formatting symbols from a piece of text.

    Every ``*``, ``_``, ``~``, backtick and ``#`` character is deleted, then
    leading and trailing whitespace is removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    without_symbols = re.sub(r'[*_~`#]', '', text)
    return without_symbols.strip()

# Clean the final drafts; only the 'final_draft' column is rewritten here
df_story['final_draft'] = df_story['final_draft'].apply(remove_markdown)


### 3- Replace inappropriate character names

During manual review I noticed that the model was using inappropriate names for some story characters.

Informally, "bliksem" is often used as a curse word, roughly translating to "damn" in English. It can also be used to describe someone as troublesome or aggressive, like calling someone a "little rascal" or "troublemaker."

In Afrikaans, "moffie" is a derogatory term historically used to refer to an effeminate man or someone who is gay. It has been used in a pejorative and offensive manner, carrying strong homophobic connotations.

In [None]:
def replace_swear_words(text):
    """Replace inappropriate character names and swear words in a story.

    The capitalised forms are treated as character names and swapped for
    neutral names; the lowercase forms are masked with ``***``. Matching is
    case-sensitive and works on substrings.

    Args:
        text: The story text.

    Returns:
        The text with all replacements applied.
    """
    substitutions = (
        # Character names
        ("Bliksem", "Weerlig"),
        ("Moffie", "Alex"),
        # Swear words
        ("bliksem", "***"),
        ("moffie", "***"),
    )

    for unwanted, replacement in substitutions:
        text = text.replace(unwanted, replacement)

    return text

# Apply the replacements to the final drafts; only the 'final_draft' column is rewritten here
df_story['final_draft'] = df_story['final_draft'].apply(replace_swear_words)

### 4- Remove incomplete stories

In [None]:
def remove_incomplete_stories(text):
    """Flag stories that contain fewer than five sentences.

    Sentences are counted by splitting the text on runs of '.', '!' or '?'
    and ignoring pieces that are empty or only whitespace.

    Args:
        text: The story text.

    Returns:
        The unchanged text when it has at least five sentences, otherwise
        the marker string "too_short".
    """
    fragments = re.split(r'[.!?]+', text)

    # Count only the fragments that contain something besides whitespace
    sentence_count = sum(1 for fragment in fragments if fragment.strip())

    return text if sentence_count >= 5 else "too_short"


# Swap every story that is too short for the "too_short" marker
df_story['final_draft'] = df_story['final_draft'].apply(remove_incomplete_stories)

# Drop the marked rows and renumber the remaining rows from zero
is_long_enough = df_story['final_draft'] != "too_short"
df_story = df_story[is_long_enough].reset_index(drop=True)


### 5- Remove duplicate rows

Based on manual review I found that the model is not creating duplicate stories. However, just in case, we will add code that removes any duplicate stories. Duplicated stories will slow down fine tuning without adding any value to the model.

In [None]:
# Keep the first occurrence of each final draft, then renumber the rows
df_story = (
    df_story
    .drop_duplicates(subset=['final_draft'], keep="first")
    .reset_index(drop=True)
)


### Save the cleaned data as a csv file

In [None]:
## Save the cleaned data

# Write the cleaned dataframe to a CSV file without the index column
path = "cleaned_afrikaans_childrens_stories.csv"
df_story.to_csv(path, index=False)

In [None]:
!ls

__notebook__.ipynb		 cleaned_afrikaans_childrens_stories.csv
afrikaans_childrens_stories.csv  raw_afrikaans_childrens_stories.csv


In [None]:
# Show the size of the cleaned dataset, then preview its first rows
print(df_story.shape)

df_story.head()

(270, 8)


Unnamed: 0,input_for_prompt_gen,prompt,prompt_language,moral_value,first_draft,final_draft,changes_made,translation
0,\n You are an expert at creating prompts th...,Write a South African Afrikaans children's sto...,British English,Kindness – Showing care and consideration for ...,## Klein Karoo se Groot Hart\n\nKlein Karoo wa...,Klein Karoo se Groot Hart\n\nKlein Karoo was '...,"The original story was already quite good, but...",## Little Karoo's Big Heart\n\nLittle Karoo wa...
1,\n You are an expert at creating prompts th...,Skep 'n Suid-Afrikaanse kinderverhaal in natuu...,Afrikaans,Honesty – Valuing truth and integrity in words...,"## Klein Karoo se Groot Geheim\n\nKlein Karoo,...",Klein Karoo se Groot Bessie-Geheim\n\nKlein Ka...,The following changes were made to improve the...,## Little Karoo's Big Berry Secret\n\nLittle K...
2,\n You are an expert at creating prompts th...,Write a South African children's story in Afri...,British English,Courage – Encouraging bravery to face fears an...,## Klein Klein Leeutjie se Groot Moed\n\nKlein...,"Leo se Groot Moed\n\nKlein Leo, of Leon soos s...",The following changes were made to improve the...,"## Leo's Great Courage\n\nLittle Leo, or Leon ..."
3,\n You are an expert at creating prompts th...,Skryf 'n Suid-Afrikaanse kinderverhaal in natu...,Afrikaans,Responsibility – Teaching accountability and t...,## Klein Katjie se Groot Belofte\n\nKlein Katj...,Katjie se Roosbos-Belofte\n\nKlein Katjie was ...,The following changes were made to improve the...,## Katjie's Rosebush Promise\n\nLittle Katjie ...
4,\n You are an expert at creating prompts th...,Write a South African children's story in Afri...,British English,"Respect – Valuing other people’s rights, opini...","## Respek vir almal, 'n Rykdom vir almal!\n\n...",Respek: 'n Rykdom vir almal!\n\nKarlien en die...,The title was changed to be more concise and i...,## Respect: A Richness for Everyone!\n\n**Karl...
