In [1]:
from google import genai
from google.genai import types
import base64
import json
import os
from dotenv import load_dotenv

from pydantic import BaseModel, Field

import unicodedata
import re

In [2]:
# Load environment variables via python-dotenv (by default from a .env file).
load_dotenv()
# Project id later passed to genai.Client(project=...); None if PROJECT_NAME is unset.
project_name = os.getenv('PROJECT_NAME')

Story IDs run from 1 to 13, one for each of the 13 Ainu Kamuy Yukar translated by Chiri Yukie. Yukar IDs start at 1; ID 0 is Chiri's Preface.

In [3]:
# Inclusive range of story ids handled by the batch loop
# (it iterates range(start_at, end_at + 1)).
start_at = 1
end_at = 13

In [4]:
# Structured-output schema: this class is passed as `response_schema` in the
# generation config, and the batch loop reads the 'translation' and 'comment'
# keys from the parsed reply.
# NOTE(review): the docstring and Field descriptions below may be forwarded to
# the model as schema descriptions - treat them as prompt text, not as comments.
class Translation(BaseModel):
    """The translation, and the comments"""
    # Translated text returned by the model.
    translation: str = Field(description="The translation")
    # Model's remarks about the translation.
    comment: str = Field(description="Comments on the translation")

In [5]:
# Translator persona, sent as the system instruction of every request.
system_instruction_prompt = """You are a professional translator. You know Japanese, English and Chinese. You can translate Japanese into either Chinese or English. You can also translate Chinese into English, and English into Chinese."""

# Vertex AI client bound to the project id read from the environment.
client = genai.Client(vertexai=True, project=project_name, location="us-central1")

model = "gemini-2.0-flash-001"

# Shared request configuration:
# - temperature / top_p are both 0 (presumably to keep output as repeatable
#   as possible - confirm),
# - the reply is requested as JSON matching the Translation schema,
# - each listed harm category has its threshold set to "OFF".
generate_content_config = types.GenerateContentConfig(
    temperature=0,
    top_p=0,
    max_output_tokens=8192,
    response_mime_type='application/json',
    response_schema=Translation,
    safety_settings=[
        types.SafetySetting(category=harm_category, threshold="OFF")
        for harm_category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ],
    system_instruction=[types.Part.from_text(text=system_instruction_prompt)],
)

In [6]:
# Task prompt: generate() places it before the input text (separated by a
# blank line); the batch loop uses it for every story's English footnotes.
translation_prompt = """Translate the following text from English to Chinese. The original text is translation from Japanese which is footnotes of Ainu chant. 
Keep the original meaning. Use modern Chinese. Display in Traditional Chinese. Only allow Chinese and English characters.
If a term cannot be translated, keep the original language."""

# Disabled alternative prompt (Japanese -> Chinese); not referenced by active code.
#descriptive_translation_prompt = "Translate the following text from Japanese to Chinese. Keep the original meanings. Display in Traditional Chinese."

In [7]:
# Load the contents page of the Japanese translation (NFKC-normalised).
# The Japanese titles are the individual lines of the second
# blank-line-separated section of that page.
with open("Chiri_Japanese_Translation/content.txt", "r", encoding="utf8") as f:
    japanese_content = unicodedata.normalize('NFKC', f.read())

s = japanese_content.split('\n\n')
japanese_titles = s[1].split('\n')

In [8]:
# Load the contents page of the original Ainu text (NFKC-normalised).
# The Ainu titles are the individual lines of the second
# blank-line-separated section of that page.
with open("original_Ainu_text/content.txt", "r", encoding="utf8") as f:
    ainu_content = unicodedata.normalize('NFKC', f.read())

s = ainu_content.split('\n\n')
ainu_titles = s[1].split('\n')

In [9]:
# Load the Markdown template (NFKC-normalised) that is later filled in with
# str.format() to produce the per-story Chinese translation report.
with open("templates/raw_output_md_template_cross_lingual", "r", encoding="utf8") as f:
    md_template = unicodedata.normalize('NFKC', f.read())

In [10]:
def generate(client: genai.Client,generate_content_config :types.GenerateContentConfig,model :str, /,input_text :str, prompt :str) -> str:
    """Stream one model reply for ``prompt`` + ``input_text`` and return its text.

    The prompt and the input text are joined with a blank line and sent as a
    single user message. Each streamed piece of text is echoed to stdout as it
    arrives and the concatenation of all pieces is returned.

    Args:
        client: google-genai client used to issue the request (positional-only).
        generate_content_config: request configuration (positional-only).
        model: model name (positional-only).
        input_text: the text to be processed by the model.
        prompt: the task instruction placed before ``input_text``.

    Returns:
        The full reply text; an empty string if no chunk carried any text.
    """
    user_part = types.Part.from_text(text=f"{prompt}\n\n{input_text}")
    contents = [types.Content(role="user", parts=[user_part])]

    pieces = []
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # chunk.text is None when a streamed chunk has no text part (for
        # example a trailing metadata-only chunk). Skipping it avoids printing
        # the literal "None" and a TypeError when concatenating.
        text = chunk.text
        if not text:
            continue
        print(text, end="")
        pieces.append(text)

    return "".join(pieces)

def get_output_file_name_key(title :str):
    """Build the file-name key ``<first word>_<quoted word>`` from a title.

    The title is split on whitespace. The first word forms the first half of
    the key. The second half is the last word that starts with a left curly
    quote, with both curly quote characters removed; it is empty when no word
    starts with that quote. An empty or whitespace-only title raises IndexError.
    """
    words = title.split()

    # Only words that open with a left curly quote are candidates; the last one wins.
    quoted_words = [word for word in words if word.startswith('“')]
    if quoted_words:
        second_half = quoted_words[-1].replace('“', '').replace('”', '')
    else:
        second_half = ""

    return words[0] + "_" + second_half



In [11]:
# Process the translations in batch mode.
# For every story id in [start_at, end_at]: load the English footnote JSON,
# ask the model for a Chinese translation, then write an enriched JSON file
# and a Markdown report.

english_json_dir = "Initial_LLM_prompts_and_translations_footnotes/English_Translation_JSON"
chinese_json_dir = "cross_lingual_LLM_prompts_and_translations_footnotes/Chinese_Translation_JSON"
chinese_md_dir = "cross_lingual_LLM_prompts_and_translations_footnotes/Chinese_Translation"

# Create the output folders up front so a missing directory does not surface
# only after a model call has already been made.
os.makedirs(chinese_json_dir, exist_ok=True)
os.makedirs(chinese_md_dir, exist_ok=True)

for song_no in range(start_at, end_at + 1):

    # Story ids start at 1 while ainu_titles is indexed from 0.
    md_name_part = get_output_file_name_key(ainu_titles[song_no - 1])

    # Read the input and release the file handle before the slow network call.
    with open(f"{english_json_dir}/{song_no}_{md_name_part}.json", "r", encoding="utf8") as f:
        data_dict = json.load(f)

    poetic_translation = generate(client,generate_content_config,model,input_text=data_dict['english_translation'],prompt=translation_prompt)

    # The reply is requested as JSON; report which story failed if it is not
    # parseable instead of surfacing a bare decode error.
    try:
        poetic_translation_dict = json.loads(poetic_translation)
    except json.JSONDecodeError as err:
        raise ValueError(
            f"Story {song_no} ({md_name_part}): model reply is not valid JSON (possibly truncated)"
        ) from err

    # NOTE(review): japanese_titles is indexed with song_no while ainu_titles
    # uses song_no - 1 - presumably the Japanese contents list holds the
    # Preface at index 0; confirm against the content files.
    md_output = md_template.format(translated_language="Chinese", original_language="English", ainu_title=ainu_titles[song_no - 1],
                            poetic_prompt=translation_prompt,
                            japanese_title=japanese_titles[song_no],
                            input_text = data_dict['english_translation'],
                            output_poetic=poetic_translation_dict['translation'])

    # Extend the loaded record with the model output before saving it.
    data_dict['chinese_translation'] = poetic_translation_dict['translation']
    data_dict['comment'] = poetic_translation_dict['comment']

    with open(f"{chinese_json_dir}/{song_no}_{md_name_part}.json", "w", encoding="utf8") as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=4)

    with open(f"{chinese_md_dir}/{song_no}_{md_name_part}_to_Chinese.md", "w", encoding="utf8") as f:
        f.write(md_output)



{
  "comment": "Translation from English to Traditional Chinese, preserving original meaning and using modern Chinese. Terms that cannot be translated are kept in the original language.",
  "translation": "1. 在過去，當男孩稍微長大一點，他們會製作並給他一個小弓箭。孩子會通過射擊樹木和鳥類來玩耍，不知不覺地變得擅長射箭。\nak...... archery（射箭）, shinot 是玩耍, ponai 是小箭。\n\n2. shiktumorke...... 眼神。\n據說，如果你想了解一個人的真實本性，最好的方法是看他們的眼睛，如果他們不安地四處張望，就會受到責罵。\n\n3. achikara...... 意思是「骯髒」。\n\n4. 據說鳥類和動物被人類射下來，是因為它們想要人類製作的箭，並且它們正在拿走箭。\n\n5. kotankorkamui...... 擁有國家或村莊的神。\n在山裡，有 nupurikorkamui...... 擁有山的神（熊）和 nupuripakorkamui...... 擁有山東邊的神（狼），貓頭鷹的地位排在熊和狼之後。\nkotankorkamui 不像山神或山東邊的神那樣粗魯和倉促。因此，他們通常很平靜，總是閉著眼睛，據說只有在非常嚴重的事情發生時才會睜開眼睛。\n\n6. eharkiso...... 左座。\n\n7. eshiso...... 右座。\n在房子的中央是一個爐灶，東側有※(「窗/心」)的那一側是上座。從上座看，右邊是 eshiso，左邊是 harkiso。只有男人才能坐在上座。地位低於戶主的客人會避免坐在上座。房子的主人和女主人習慣並排坐在右座。左座在右座旁邊，西側（朝向門口）的座位是最低的座位。\n\n8. hayokpe 頭盔。\n據說，當鳥類和動物在山裡時，人類的眼睛是看不見它們的，但它們每個都像人類一樣有一個房子，它們都以與人類相同的形式生活，當它們來到人類的村莊時，它們會戴上頭盔。據說鳥類和動物的屍體是頭盔，主體是看不見的，但在屍體的耳朵之間。\n\n9. otuipe...... 一個屁股