In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import datasurfer as ds
from pathlib import Path
from datasurfer.lib_llm.llm_agents import LLMAgent
from datasurfer.datautils import xml_valid_df

In [3]:
# Chapter number and revision tag that identify this translation run.
chapter = 3
version = 'R1V3'

# Folder holding the source workbook; all generated files are written here too.
root = Path(r'D:\02_Translation\01_Künstliche_Intelligenz_für_Ingenieure\03_KII_Chapter_03')

# Source workbook that is read into `df_original`.
frcs = root.joinpath(f'KII_Chapter_{chapter:02}_Original.xlsx')
# Final exports (Excel / Word) of the finished translation.
fdst1 = root.joinpath(f'KII_Chapter_{chapter:02}_Translation_{version}.xlsx')
fdst2 = root.joinpath(f'KII_Chapter_{chapter:02}_Translation_{version}.docx')
# CSV checkpoint that the translation loop reads and rewrites on every pass.
fbak = root.joinpath(f'KII_Chapter_{chapter:02}_Translation_{version}.csv')

In [4]:
# Text handed to each agent at construction time
# (presumably its system prompt / persona -- confirm against LLMAgent).
translator_persona = 'You are a Chinese linguist, you translate German to Chinese.'
reviewer_persona = 'You are a Chinese linguist, you also know German.'

# Linda is asked for the draft translation, Robin is asked to revise it.
Linda = LLMAgent('Linda', translator_persona)
Robin = LLMAgent('Robin', reviewer_persona)

In [5]:
# Open the source workbook through datasurfer and keep its `.df` attribute
# as the untouched reference table for the whole notebook.
source_object = ds.AutoObject(frcs)
df_original = source_object.df

df_original

Unnamed: 0,Page,Original,Translation,Review
0,69,Graphensuche,,4
1,69,Dieses Kapitel behandelt die Grundidee von Suc...,,4
2,69,3.1 Grundbegriffe der Graphentheorie,,4
3,69,3.1.1 Vorgehensweise,,4
4,69,Die Suche nach der Lösung eines Wissensverarbe...,,4
...,...,...,...,...
660,115,Der Begriff des Zustands wird für die Darstell...,,4
661,116,Literaturhinweise,,4
662,116,Graphensuchalgorithmen sind in vielen Büchern ...,,4
663,116,"Der Dijkstra-Algorithmus, der heute ein Standa...",,4


In [6]:
# Prompt template for the translator; `{original}` is filled with the source text.
pattern_Linda = (
    'Translating "{original}" to Chinese, '
    'return only the translation, do not include any other words.'
)

# Prompt template for the reviewer; one source line per prompt line.
# Placeholders: `{original}` (source text) and `{translation}` (draft to revise).
pattern_Robin = (
    '根据德语原文\n'
    '"{original}"，\n'
    '将以下中文翻译改进到语义通顺, 修改其中的错误并去除不必要的句子：\n'
    '"{translation}"\n'
    '只返回修改过的句子.'
)
print(pattern_Robin)

根据德语原文
"{original}"，
将以下中文翻译改进到语义通顺, 修改其中的错误并去除不必要的句子：
"{translation}"
只返回修改过的句子.


In [7]:
def transview_text(original, memory_length=100):
    """
    Translate a text with Linda, then have Robin revise the draft.

    Linda is prompted with ``pattern_Linda``; the request is repeated for as
    long as her reply contains the substring ``'Instruction'``. The accepted
    draft is then sent to Robin together with the source text via
    ``pattern_Robin``, and Robin's reply is returned as the final translation.

    Args:
        original (str): The source text to be translated and reviewed.
        memory_length (int): Forwarded as ``memory_length`` to Linda's call;
            Robin's call receives twice this value.

    Returns:
        list: ``[original, reviewed]`` -- the source text and Robin's reply.
    """
    draft_prompt = pattern_Linda.format(original=original)

    # NOTE(review): this retry has no upper bound. If the agent keeps returning
    # a reply containing 'Instruction' (e.g. the same cached answer, since
    # use_cache=True), the loop never ends -- confirm LLMAgent.told's caching.
    while True:
        translation = Linda.told(draft_prompt, use_cache=True, memory_length=memory_length)
        if 'Instruction' in translation:
            continue
        break

    review_prompt = pattern_Robin.format(original=original, translation=translation)
    reviewed = Robin.told(review_prompt, use_cache=True, memory_length=2 * memory_length)
    return [original, reviewed]

In [8]:

def start_translation(df, nrows):
    """
    Walk through the rows of ``df`` and build the list of translated passages.

    Every row must unpack into exactly four values, in this order:
    page, original text, existing translation, review code. The review code
    selects what happens to the row:

    * ``4`` -- translate and review the row on its own.
    * ``2`` -- buffer the text; consecutive buffered texts are joined with a
      single space and translated as one passage once a row with another code
      is reached, or when the end of the frame is reached.
    * ``3`` -- split the text on ``'@'`` and translate each part separately.
    * ``1`` -- skip the row (nothing is emitted).
    * ``0`` -- the row is already translated: replay the prompt/answer pair
      into both agents' histories and carry the existing translation over.

    Args:
        df (pandas.DataFrame): Rows to process (see column order above).
        nrows (int): Total row count, used only for the progress message.

    Returns:
        list: One ``[page, original, translation]`` list per emitted passage.

    Raises:
        ValueError: If a row carries a review code other than 0, 1, 2, 3 or 4.
    """
    out = []
    buffer = []  # texts of consecutive ``review == 2`` rows awaiting a merge

    def flush_buffer(page):
        """Translate the buffered texts as one merged passage and empty the buffer."""
        if buffer:
            merged = ' '.join(buffer)
            buffer.clear()
            out.append([page, *transview_text(merged)])

    page = None
    for idx, (page, original, trans, review) in df.iterrows():
        print(f'Processing {idx+1}/{nrows} ({(idx+1)/nrows*100:0.2f}%)...\n')
        Linda.print_message(f'{original}', 100, role='User')

        # Any row that is not itself part of a merge run closes the pending run.
        # The merged passage is recorded with the page of this closing row.
        if review != 2:
            flush_buffer(page)

        if review == 4:
            out.append([page, *transview_text(original)])

        elif review == 2:
            buffer.append(original)

        elif review == 1:
            continue

        elif review == 3:
            for txt in original.split('@'):
                out.append([page, *transview_text(txt)])

        elif review == 0:
            # Rebuild the conversation history from the stored translation so
            # later prompts see the earlier text as context.
            Linda.append_history(pattern_Linda.format(original=original), role='User')
            Linda.append_history(trans)
            Robin.append_history(pattern_Robin.format(original=original, translation=trans), role='User')
            Robin.append_history(trans)
            out.append([page, original, trans])
        else:
            raise ValueError(f'Invalid review value: {review}')

    # A merge run that reaches the end of the frame has no closing row; without
    # this final flush its text would be dropped silently. `page` is then the
    # page of the last buffered row.
    flush_buffer(page)

    return out

In [9]:
# Incremental translation with a CSV checkpoint: every pass reloads what has
# been saved so far, appends the next source rows, runs the whole working
# frame through start_translation and rewrites the checkpoint.
while True:
    if fbak.is_file():
        dfbak = pd.read_csv(fbak)
    else:
        dfbak = pd.DataFrame()

    # NOTE(review): the resume position is the number of checkpoint rows. That
    # equals the next source index only while every source row produces exactly
    # one output row; review codes 1, 2 and 3 in start_translation emit zero,
    # merged or several rows -- verify before using those codes with this loop.
    idx_start = 0 if dfbak.empty else dfbak.index.max() + 1
    if idx_start >= len(df_original):
        break

    # `.loc` slices by label and includes both end points.
    idx_stop = min(idx_start + 1, len(df_original))
    new_rows = df_original.loc[idx_start:idx_stop]
    df_working = pd.concat([dfbak.copy(), new_rows])

    out = start_translation(df_working, len(df_original))

    pages, raw_text, translation = zip(*out)
    dfout = pd.DataFrame({'Page': pages, 'Original': raw_text, 'Translation': translation, 'Review': 0})
    # Strip a chat-style closing remark that sometimes ends up in the replies.
    dfout['Translation'] = dfout['Translation'].str.replace('Let me know if you have more text to translate!', '')

    print('Saving...')
    dfbak = dfout
    dfbak.to_csv(fbak, index=False)


Processing 1/665 (0.15%)...

[96mUser[0m:

Graphensuche

[92m----------------------------------------------------------------------------------------------------[0m

Processing 2/665 (0.30%)...

[96mUser[0m:

Dieses Kapitel behandelt die Grundidee von Suchalgorithmen am Beispiel der Bestimmung von Pfaden in
Graphen und erläutert die gemeinsame Struktur der behandelten Algorithmen. Heuristische
Erweiterungen der Suchalgorithmen führen auf denA∗-Algorithmus, dessen Wirkungsweise am Beispiel der
Bahnplanung von Roboternerläutert wird.

[92m----------------------------------------------------------------------------------------------------[0m

Processing 3/665 (0.45%)...

[96mUser[0m:

3.1 Grundbegriffe der Graphentheorie

[92m----------------------------------------------------------------------------------------------------[0m

Processing 4/665 (0.60%)...

[96mUser[0m:

3.1.1 Vorgehensweise

[92m------------------------------------------------------------------------------

In [130]:
# Write the last checkpoint frame to the Excel target, without the row index.
dfbak.to_excel(excel_writer=fdst1, index=False)


In [118]:
# Only the source text and its translation are handed to the DOCX writer.
docx_columns = ['Original', 'Translation']
docx_frame = dfbak[docx_columns]

obj = ds.DOCXObject(docx_frame, name='Translation')
obj.save_df(fdst2)



<DOCXObject@Translation>