In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import pandas as pd
import numpy as np
import datasurfer as ds
import time
from pathlib import Path
from datasurfer.lib_llm.llm_agents import LLMAgent
from datasurfer.datautils import xml_valid_df

In [26]:
# Run configuration: which chapter / revision is processed and where its files live.
chapter = 5
version = 'R1V1'
root = Path(r'D:\02_Translation\01_Künstliche_Intelligenz_für_Ingenieure\05_KII_Chapter_05')

# Source workbook for this chapter.
frcs = root.joinpath(f'KII_Chapter_{chapter:02}_Original.xlsx')

# Output artefacts share one file stem and differ only in their extension:
# Excel export, Word export and the CSV used as an intermediate backup.
fdst1 = root.joinpath(f'KII_Chapter_{chapter:02}_Translation_{version}.xlsx')
fdst2 = fdst1.with_suffix('.docx')
fbak = fdst1.with_suffix('.csv')

In [27]:
# The two agents used by transview_text: Linda is asked for the first-pass
# translation, Robin is asked to revise Linda's result.
# NOTE(review): the second constructor argument is presumably the agent's
# system/role prompt - confirm against LLMAgent.
Linda = LLMAgent('Linda', 'You are a Chinese linguist, you translate German to Chinese.')
Robin = LLMAgent('Robin', 'You are a Chinese linguist, you also know German.')

In [28]:
# Load the chapter's source workbook and keep its tabular content.
# NOTE(review): `.df` is presumably a pandas DataFrame with the columns
# Page / Original / Translation / Review shown in the cell output - confirm.
df_original = ds.AutoObject(frcs).df

# Bare expression: shown as the cell's output.
df_original

Unnamed: 0,Page,Original,Translation,Review
0,157,Wissensverarbeitung mit strukturiertenObjekten,,4
1,157,Durch die Repräsentation von Wissen als strukt...,,4
2,157,5.1 Begriffsbildung und strukturierte Objekte,,4
3,157,Begriffe und Methoden existieren im menschlich...,,4
4,157,In diesem Kapitel werden mit semantischen Netz...,,4
...,...,...,...,...
269,174,Stabilitätsanalyse: Für gegebene Systemmatrix ...,,4
270,174,"Modell des Regelkreises: Für das Modell (A, B)...",,4
271,174,Stellen Sie diese Informationen über die Modul...,,4
272,174,Literaturhinweise,,4


In [29]:
# Prompt template sent to Linda; `{original}` is replaced by the source text.
pattern_Linda = 'Translating "{original}" to Chinese, return only the translation, do not include any other words.'
# Prompt template sent to Robin (written in Chinese). Rough English rendering:
#   Based on the German original "{original}", improve the following Chinese
#   translation so that it reads fluently, correct its errors and remove
#   unnecessary sentences: "{translation}". Return only the revised sentence,
#   without quotation marks.
pattern_Robin = '根据德语原文\n"{original}"，\n将以下中文翻译改进到语义通顺, 修改其中的错误并去除不必要的句子：\n"{translation}"\n只返回修改过的不加引号的句子.'
# Print the template so its embedded line breaks are rendered.
print(pattern_Robin)

根据德语原文
"{original}"，
将以下中文翻译改进到语义通顺, 修改其中的错误并去除不必要的句子：
"{translation}"
只返回修改过的不加引号的句子.


In [30]:
def transview_text(original, memory_length=100, max_retries=None):
    """
    Translate a text with Linda, then have Robin review the translation.

    Linda is prompted with ``pattern_Linda``. A reply that contains the word
    'Instruction' is treated as a failed attempt and the request is repeated.
    The accepted translation is then passed, together with the original text,
    to Robin via ``pattern_Robin``.

    Args:
        original (str): The source text to be translated and reviewed.
        memory_length (int): Forwarded to ``Linda.told`` as ``memory_length``;
            Robin receives twice this value.
        max_retries (int | None): Maximum number of repeated requests to Linda
            after the first one. ``None`` (the default) retries without limit.

    Returns:
        list: ``[original, reviewed]`` - the unchanged source text and Robin's
        reviewed translation.

    Raises:
        RuntimeError: If ``max_retries`` is set and Linda still returns a
            rejected reply after that many retries.
    """
    # The prompt does not change between attempts, so build it once.
    prompt = pattern_Linda.format(original=original)
    failures = 0

    while True:
        # use_cache=False: presumably so that a retry is not answered from a
        # cached copy of the rejected reply - confirm against LLMAgent.told.
        translation = Linda.told(prompt, use_cache=False, memory_length=memory_length)

        if 'Instruction' not in translation:
            break

        failures += 1
        if max_retries is not None and failures > max_retries:
            raise RuntimeError(
                f'Linda returned no usable translation after {failures} attempts.'
            )

    reviewed = Robin.told(
        pattern_Robin.format(original=original, translation=translation),
        use_cache=True,
        memory_length=memory_length * 2,
    )
    return [original, reviewed]

In [31]:

def start_translation(df, nrows):
    """
    Process the rows of ``df`` according to their review code.

    Every row must unpack into ``(page, original, trans, review)``. The review
    code selects what happens to the row:

        0  already translated: the row is replayed into the histories of both
           agents and copied to the output unchanged.
        1  skipped: nothing is written to the output.
        2  fragment: the text is buffered; consecutive fragments are joined
           with a single space and translated as one text when a row with a
           different code is reached or when the frame ends.
        3  the text is split on '@' and every piece is translated separately.
        4  the text is translated as it is.

    Progress and an estimate of the remaining time are printed for every row
    whose code is not 0 (rows with code 1 print the progress line only).

    Args:
        df: Frame whose rows are iterated with ``iterrows``; the index value
            is used (as ``idx + 1``) in the progress message.
        nrows (int): Total number of rows, used for the progress percentage and
            the remaining-time estimate.

    Returns:
        list: One ``[page, original, translation]`` entry per produced text.

    Raises:
        ValueError: If a row carries a review code other than 0-4.
    """
    out = []
    buffer = []

    for idx, (page, original, trans, review) in df.iterrows():
        start = time.time()
        if review != 0:
            print(f'Processing {idx+1}/{nrows} ({(idx+1)/nrows*100:0.2f}%)...\n')
            Linda.print_message(f'{original}', 80, role='User')

        # A row that is not a fragment ends the current run of fragments:
        # translate the merged text before handling the row itself. The merged
        # entry is recorded under the page of the row that ended the run.
        if review != 2:
            if len(buffer):
                merged = ' '.join(buffer)
                buffer = []
                out.append([page, *transview_text(merged)])

        if review == 4:
            out.append([page, *transview_text(original)])

        elif review == 2:
            buffer.append(original)

        elif review == 1:
            continue

        elif review == 3:
            txts = original.split('@')

            for txt in txts:
                out.append([page, *transview_text(txt)])

        elif review == 0:
            # Feed the finished pair into both agents so later requests see it
            # as prior conversation.
            Linda.append_history(pattern_Linda.format(original=original), role='User')
            Linda.append_history(trans)
            Robin.append_history(pattern_Robin.format(original=original, translation=trans), role='User')
            Robin.append_history(trans)
            out.append([page, original, trans])
        else:
            raise ValueError(f'Invalid review value: {review}')

        if review != 0:
            duration = time.time() - start
            # Estimate: remaining rows times the duration of the row just processed.
            tremain = int((nrows-idx-1)*duration)
            print(f'Completed in {duration:0.2f}s, remain {tremain//3600:0.0f}h{tremain%3600//60:0.0f}m{tremain%60}s\n')

    # Fragments still buffered when the frame ends were previously dropped
    # without being translated. `page` is the page of the last row, which is
    # itself the last buffered fragment.
    if buffer:
        out.append([page, *transview_text(' '.join(buffer))])

    return out

In [32]:
# Number of source rows handed to start_translation per batch; the backup CSV
# is rewritten after every batch.
CHUNK_SIZE = 6

# Pick up an earlier run if a backup exists. All rows in the backup carry
# Review == 0, so start_translation replays them into the agents' histories
# instead of translating them again.
dfbak = pd.read_csv(fbak) if fbak.is_file() else pd.DataFrame()

# NOTE(review): when resuming, the position in df_original is inferred from the
# number of rows in the backup. That only matches the true source position if
# no processed row was skipped (1), merged (2) or split (3) - confirm before
# resuming a chapter that uses those codes.
idx_start = dfbak.index.max() + 1 if not dfbak.empty else 0

while idx_start < len(df_original):
    idx_stop = min(idx_start + CHUNK_SIZE - 1, len(df_original) - 1)

    # Finished rows first, then the next batch of untranslated source rows.
    # `.loc` slicing is label based and includes idx_stop.
    df_working = pd.concat([dfbak, df_original.loc[idx_start:idx_stop]]).fillna('')

    out = start_translation(df_working, len(df_original))

    # Advance by the source rows consumed, not by the output rows produced:
    # the two differ as soon as a row is skipped, merged or split, and deriving
    # the cursor from the output could repeat or skip source rows.
    idx_start = idx_stop + 1

    # Nothing produced (e.g. a first batch consisting only of skipped rows):
    # there is nothing to unpack or save, move on to the next batch.
    if not out:
        continue

    pages, raw_text, translation = zip(*out)
    dfout = pd.DataFrame({'Page': pages, 'Original': raw_text, 'Translation': translation, 'Review': 0})
    # Strip a chat-style closing remark that the model sometimes appends.
    dfout.Translation = dfout.Translation.str.replace('Let me know if you have more text to translate!', '', regex=False)
    print('Saving...')
    dfout.to_csv(fbak, index=False)

    # Re-read the file so the in-memory frame is identical to what a resumed
    # run would load.
    dfbak = pd.read_csv(fbak)


  df_working.fillna('', inplace=True)


Processing 1/274 (0.36%)...

[96mUser[0m:

Wissensverarbeitung mit strukturiertenObjekten

[92m--------------------------------------------------------------------------------[0m

[96mLinda[0m:

知识处理与结构化对象
[92m--------------------------------------------------------------------------------[0m

[96mRobin[0m:

知识处理与结构化对象 



[92m--------------------------------------------------------------------------------[0m

Completed in 7.01s, remain 0h31m53s

Processing 2/274 (0.73%)...

[96mUser[0m:

Durch die Repräsentation von Wissen als strukturierte Objekte wird der
Zusammenhang zwischen den Wissenselementen in den Mittelpunkt der Verarbeitung
gestellt.Dieses Kapitel erläutert mit den semantischen Netzen und den Frames die
klassischenFormen, die im objektorientierten Programmierstil eine breite
Anwendung gefundenhaben.

[92m--------------------------------------------------------------------------------[0m

[96mLinda[0m:

通过将知识表示为结构化对象，知识元素之间的关系成为处理的重点。本章将通过语义网络和帧来解释经典形式，这些形式

In [76]:
# Write the translated table to the Excel output file, without the index column.
dfbak.to_excel(fdst1, index=False)


In [77]:
# Hand only the Original / Translation columns to the DOCX writer and save
# them to the Word output file.
# NOTE(review): how save_df lays out the frame in the document is defined in
# datasurfer and not visible here - confirm.
obj = ds.DOCXObject(dfbak[['Original', 'Translation']], name='Translation')
obj.save_df(fdst2)



<DOCXObject@Translation>