In [72]:
import openai
import pandas as pd
from tqdm.notebook import tqdm
from config import CONFIG

# Credentials for the OpenAI client are read from the project configuration
# (section translation -> openai) rather than being written in the notebook.
_openai_settings = CONFIG["translation"]["openai"]
openai.organization = _openai_settings["organization"]
openai.api_key = _openai_settings["api_key"]

In [85]:
# Load the existing text database (csv1) and the dump of a new playthrough
# (csv2). Both files are ';'-separated.
csv1 = pd.read_csv("data/texts/database_text_v2.csv", sep=";")
csv2 = pd.read_csv("data/texts/Ayako - Macgregor Playthrough 1/database_text_v2.csv", sep=";")
csv3_output_name = "data/texts/database_text_1.csv"

# Rows whose Japanese text occurs in only one of the two files.
csv1_new = csv1[~csv1["Japanese text"].isin(csv2["Japanese text"])]
csv2_new = csv2[~csv2["Japanese text"].isin(csv1["Japanese text"])]

# Drop lines containing an ASCII '?'. The match is literal (regex=False), so no
# backslash escape is needed. The explicit .copy() makes the following column
# assignment act on an independent frame instead of a filtered view.
csv2_new = csv2_new[~csv2_new["Japanese text"].str.contains("?", regex=False)].copy()

# Strip double quotes from the English text.
csv2_new["English text"] = csv2_new["English text"].str.replace('"', "", regex=False)

# One row per (character, Japanese text) pair; rows without a character name
# are discarded.
csv2_new = (
    csv2_new
    .drop_duplicates(subset=["Character name", "Japanese text"], keep="first")
    .dropna(subset=["Character name"])
)
csv2_new

Unnamed: 0,Character name,Japanese text,English text
3716,<PNAME>,『<DATE>。,<DATE>.
3717,<PNAME>,『Ｂ型。,Type B.
3718,none,データセーブ中です,Data saving in progress.
3721,<PNAME>,『えーと、アイスつてどういう意味かなあ。,"Let's see, what does ice cream mean?"
3722,Himoo,「それはね、インサーキットエミュレータといつて開発するターゲットを実環境に近い状態でリアルタ...,"It is an in-circuit emulator, which is a devic..."
...,...,...,...
4162,none,『この絵、片桐さんに似てない。』『この絵、奇麗だと思わない？』『あの人、奇麗だな⋯。』,This picture doesn't look like Katagiri-san. D...
4163,none,大学に試験を受けに行くやめて、就職試験を受けに行く,Stop going to college to take exams and go tak...
4164,none,―流大学を受験する二流大学を受験する三流大学を受験する,-Take a second-rate universityTake a third-rat...
4166,Yoshio,「実は、未緒と付き合うことになつたんだ。,"Actually, I'm going out with Mio."


In [86]:
def line_is_incomplete(row):
    """Tell whether ``row`` is only the beginning of another known line.

    Reads the notebook-level frames ``csv1`` and ``csv2_new``.

    Args:
        row: A row (e.g. as passed by ``DataFrame.apply(..., axis=1)``) with
            the fields ``'Character name'`` and ``'Japanese text'``.

    Returns:
        bool: True when, for the same character name, at least one
        ``csv1`` text starts with the row's Japanese text, or more than one
        ``csv2_new`` text does. The threshold for ``csv2_new`` is 1 rather than
        0 because the row is expected to match itself there.
    """
    speaker = row['Character name']
    prefix = row['Japanese text']

    def _count_texts_starting_with_prefix(frame):
        texts = frame.loc[frame['Character name'] == speaker, 'Japanese text']
        # na=False: a missing text counts as "no match". Without it the result
        # contains NaN, the total becomes NaN, and every comparison against it
        # is False even when real matches exist.
        return int(texts.str.startswith(prefix, na=False).sum())

    return (_count_texts_starting_with_prefix(csv1) > 0
            or _count_texts_starting_with_prefix(csv2_new) > 1)

# Flag every row that line_is_incomplete() reports as the start of another
# line, then keep only the unflagged rows.
_is_truncated = csv2_new.apply(line_is_incomplete, axis=1)
csv2_new = csv2_new[~_is_truncated]
csv2_new

Unnamed: 0,Character name,Japanese text,English text
3716,<PNAME>,『<DATE>。,<DATE>.
3717,<PNAME>,『Ｂ型。,Type B.
3721,<PNAME>,『えーと、アイスつてどういう意味かなあ。,"Let's see, what does ice cream mean?"
3722,Himoo,「それはね、インサーキットエミュレータといつて開発するターゲットを実環境に近い状態でリアルタ...,"It is an in-circuit emulator, which is a devic..."
3723,<PNAME>,『あ、ありがとう。えーと⋯。,"Oh, thank you. Well..."
...,...,...,...
4162,none,『この絵、片桐さんに似てない。』『この絵、奇麗だと思わない？』『あの人、奇麗だな⋯。』,This picture doesn't look like Katagiri-san. D...
4163,none,大学に試験を受けに行くやめて、就職試験を受けに行く,Stop going to college to take exams and go tak...
4164,none,―流大学を受験する二流大学を受験する三流大学を受験する,-Take a second-rate universityTake a third-rat...
4166,Yoshio,「実は、未緒と付き合うことになつたんだ。,"Actually, I'm going out with Mio."


In [78]:
def translate_openai(desc, with_character=True):
    """Ask the chat model to translate a batch of lines and return its reply.

    Args:
        desc: Text sent as the user message; one line per entry, in the
            ';'-separated layout described by the system prompt.
        with_character: When True the prompt describes (and exemplifies) a
            leading character-name column; when False that column is left out.

    Returns:
        The ``content`` of the first choice in the completion, unmodified.

    Side effects:
        Prints the total token count reported for the request.
    """
    # Optional leading column: once for the format description, once for the
    # worked example.
    name_column = 'CHARACTER NAME;' if with_character else ''
    example_speaker = '<PNAME>;' if with_character else ''

    instructions = f"""You are a japanese translator. The user will give you a CSV in the format:
{name_column}JAPANESE TEXT
Your job is to reply with the following:
{name_column}JAPANESE TEXT;ENGLISH TEXT
For example, the user gives you
{example_speaker}『サンキュー。
You reply with
{example_speaker}『サンキュー。;Thank you."""

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": desc},
    ]
    response = openai.ChatCompletion.create(model="gpt-4o", messages=conversation)

    print(f"Tokens used: {response.usage.total_tokens}")
    first_choice = response.choices[0]
    return first_choice.message['content']

def process_batch_id(df, id, batch_size=50):
    """Translate one batch of rows and return the model's raw reply.

    Args:
        df: Frame with a ``"Japanese text"`` column and, optionally, a
            ``"Character name"`` column.
        id: Positional offset of the first row of the batch.
        batch_size: Maximum number of rows taken starting at ``id``.

    Returns:
        Whatever ``translate_openai`` returns for the batch.
    """
    batch = df.iloc[id:].head(batch_size)
    has_names = "Character name" in batch.columns

    # One request line per row: "name;text" when names are available,
    # otherwise just the text.
    if has_names:
        request_lines = batch["Character name"] + ";" + batch["Japanese text"]
    else:
        request_lines = batch["Japanese text"]

    return translate_openai('\n'.join(request_lines.values), with_character=has_names)

def translate_dataframe(df, batch_size=50, max_tries=5):
    """Translate ``df`` batch by batch and collect the parsed replies.

    Each batch is sent through ``process_batch_id``; the reply is split into
    lines and every line into fields on ';'.

    Args:
        df: Frame to translate (see ``process_batch_id`` for the columns used).
        batch_size: Number of rows per request. Defaults to 50.
        max_tries: How many times a batch is attempted before it is given up.
            Defaults to 5.

    Returns:
        pandas.DataFrame: The per-batch frames concatenated in order. Columns
        are labelled with integers and each batch keeps its own 0-based row
        index. An empty frame is returned when ``df`` has no rows or when no
        batch could be translated.

    Notes:
        A batch that still fails after ``max_tries`` attempts is skipped
        (best effort) and the last error is printed. Only ``Exception``
        subclasses are retried, so Ctrl-C / ``KeyboardInterrupt`` stops the
        run immediately instead of being swallowed.
    """
    translated_batches = []

    for batch_id in tqdm(range(0, len(df), batch_size)):
        last_error = None

        for _attempt in range(max_tries):
            try:
                translation = process_batch_id(df, batch_id, batch_size)
                rows = [line.split(";") for line in translation.split("\n")]
                translated_batches.append(pd.DataFrame(rows))
            except Exception as exc:
                # Remember why the attempt failed so it can be reported.
                last_error = exc
            else:
                print(f"Batch {batch_id}: {translation}")
                break
        else:
            # Every attempt failed: report it rather than silently losing rows.
            print(f"Batch {batch_id}: FAILED after {max_tries} tries, skipped ({last_error!r})")

    # pd.concat raises on an empty list; return an empty frame instead.
    if not translated_batches:
        return pd.DataFrame()
    return pd.concat(translated_batches)

# Translate the remaining new rows, then bring the parsed reply into the same
# layout as csv2_new: keep as many leading columns as csv2_new has and reuse
# its column names.
_raw_translation = translate_dataframe(csv2_new)
_expected_width = len(csv2_new.columns)
csv2_new_translated = _raw_translation.iloc[:, :_expected_width]
csv2_new_translated.columns = csv2_new.columns
csv2_new_translated

  0%|          | 0/10 [00:00<?, ?it/s]

Tokens used: 2783
Batch 0: <PNAME>;『いや、別に用はないんだけどなんとなくね。;No, I don't really have anything to do, it's just for some reason.
<PNAME>;（ふつ、俺としたことが、少し迷つてしまつたぜ。）;Geez, I hesitated a bit for someone like me.
Katagiri;「<PNICKNAME>。私に、何か用？;Do you need something from me, <PNICKNAME>?
<PNAME>;『‥‥‥。;......
Kisaragi;「もう、すつかり春ですね。;It's already completely spring, isn't it?
none;￣―――――――――――――――――;――――――――――――――――――
Kagami;「あら、<PSURNAME>君今帰りかしら？;Oh, are you going home now, <PSURNAME>?
Asahina;「<PNAME>君？何？どうしたの？;Hey, <PNAME>? What's up? What's wrong?
none;朝日奈;Asahina
Asahina;「じゃあ、帰ろ。<PNAME>君。;Alright then, let's go home, <PNAME>.
Kagami;「<PSURNAME>君？今日は、電話ばつかり⋯。もてすぎるのも、困り物よね。で、何の用かしら？;Hey, <PSURNAME>? You've been on the phone all day... Being too popular is troublesome, right? So, what do you need?
Kagami;「<PSURNAME>君？今日は、電話ばつかり⋯。もてすぎるのも、困り物よねで、何の用かしら？;Hey, <PSURNAME>? You've been on the phone all day... Being too popular is troublesome, right? So, what do you need?
Kagami;「ねぇ、私に似合うネックレス、探して下さる？;Hey, 

Unnamed: 0,Character name,Japanese text,English text
0,<PNAME>,『いや、別に用はないんだけどなんとなくね。,"No, I don't really have anything to do, it's j..."
1,<PNAME>,（ふつ、俺としたことが、少し迷つてしまつたぜ。）,"Geez, I hesitated a bit for someone like me."
2,Katagiri,「<PNICKNAME>。私に、何か用？,"Do you need something from me, <PNICKNAME>?"
3,<PNAME>,『‥‥‥。,......
4,Kisaragi,「もう、すつかり春ですね。,"It's already completely spring, isn't it?"
...,...,...,...
3,Koshiki,「そう言えば、私、滑れないのでした⋯。,"Speaking of which, I can't skate..."
4,Koshiki,「そうですか。それでは、よろしくお願い致します。,"I see. Well then, please take care of me."
5,Kiyokawa,「何か、オリに閉じこめられて、可哀想だな。,They seem kind of pitiful being trapped in a c...
6,Kiyokawa,「何か、オリに閉じこめられて可哀想だな。,They seem kind of pitiful being trapped in a c...


In [79]:
# Append the freshly translated rows to the existing database. For rows that
# are identical on every csv2_new column the earliest occurrence is kept
# (csv1 rows come first in the concatenation).
csv3 = pd.concat((csv1, csv2_new_translated))
csv3 = csv3.drop_duplicates(subset=csv2_new.columns.values, keep='first')

# Clean-ups of the English text, applied in this order. All of them are literal
# substitutions: regex=False is passed explicitly because the default of
# Series.str.replace differs between pandas versions, and as a regex the '.' in
# "Mr. " / "Ms. " would match any character.
_english_fixes = [
    ("\"", ""), ("“", ""), ("”", ""),          # straight and curly double quotes
    # NOTE(review): presumably mistranslated character names -- confirm.
    ("Cordao", "Himoo"), ("Stringo", "Himoo"), ("String-o", "Himoo"),
    ("Furushiki", "Koshiki"),
    ("Mr. ", ""), ("Ms. ", ""),               # drop honorific prefixes
    ("; ", ""),                               # "; " would clash with the ';' separator
]
_english = csv3['English text']
for _old, _new in _english_fixes:
    _english = _english.str.replace(_old, _new, regex=False)
csv3['English text'] = _english.str.strip()

# Same column layout as csv1, and only fully populated rows.
csv3 = csv3[csv1.columns]
csv3 = csv3.dropna()

# NOTE: timestamp is not used within this cell.
timestamp = pd.Timestamp.now().strftime("[%Y-%m-%d]")
csv3.to_csv(csv3_output_name, sep=";", index=False)
csv3

Unnamed: 0,Character name,Japanese text,English text
0,<PNAME>,（今日は、伊集院さんとデートだ）,(I have a date with Ijuin-san today.)
1,<PNAME>,（今日は、伊集院さんとデートだ）,"(Today, I have a date with Ijuin-san.)"
2,<PNAME>,『伊集院さんのことについて知りたいんだけど。,I would like to know about Ijuin-san.
3,<PNAME>,『よし、次は、伊集院とだ。,"(Okay, next time, with Ijuin-san.)"
4,<PNAME>,『しばらく伊集院さんと話し込んだ,(I talked with Ijuin-san for a while.)
...,...,...,...
3,Koshiki,「そう言えば、私、滑れないのでした⋯。,"Speaking of which, I can't skate..."
4,Koshiki,「そうですか。それでは、よろしくお願い致します。,"I see. Well then, please take care of me."
5,Kiyokawa,「何か、オリに閉じこめられて、可哀想だな。,They seem kind of pitiful being trapped in a c...
6,Kiyokawa,「何か、オリに閉じこめられて可哀想だな。,They seem kind of pitiful being trapped in a c...
