In [None]:
import os
import time
from pathlib import Path
from urllib.parse import quote

import deepl
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Load environment variables (the DeepL API key is read from them below) from the dotenv file.
# An explicit check is used instead of `assert`: under `python -O` an assert statement,
# including the call inside it, is skipped entirely.
if not load_dotenv():
    raise RuntimeError("load_dotenv() did not set any variable; check that the .env file exists and is not empty")

In [None]:
# Label file of the car evaluation split (Chinese captions, judging by the file name).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path = '/media/ts/SSD_ubuntu/datasets/AudioCaptionCarHospital/Car/Car_Label/car_zh_eval.json'
df = pd.read_json(path)
# Show the loaded frame as the cell output.
df

In [None]:
# Total caption length in characters, summed over all rows.
number_characters = df['caption'].apply(len).sum()
number_characters

## Translate with DeepL

In [None]:
# The DeepL API key is taken from the DEEPL environment variable (set in the dotenv file).
translator = deepl.Translator(os.environ.get('DEEPL'))

In [None]:
# Translate every caption from Chinese to American English with DeepL, one request per row.
# Translations are collected in row order, so the column assignment below lines up with `df`
# whatever its index looks like (index labels are not used as list positions).
eng_translation = []
for caption in tqdm(df['caption'], total=df.shape[0]):
    result = translator.translate_text(text=caption,
                                       split_sentences=deepl.SplitSentences.NO_NEWLINES,
                                       source_lang=deepl.Language.CHINESE,
                                       target_lang=deepl.Language.ENGLISH_AMERICAN)
    eng_translation.append(result.text)
    time.sleep(0.1)  # short pause between requests
df['caption_eng'] = eng_translation
# The content is CSV, so it is written under a .csv name rather than .json.
df.to_csv('/media/ts/SSD_ubuntu/datasets/AudioCaptionCarHospital/Car/Car_Label/car_eng_eval.csv', index=False)
df

## Translate with Google Translate (hacky and slow; used because of DeepL's rate limit)

In [None]:
# Headless Chrome session driven through a locally downloaded chromedriver binary.
chromedriver_path = '/home/ts/Downloads/chromedriver_linux64/chromedriver'
chrome_options = Options()
chrome_options.add_argument("--headless")
# NOTE(review): passing the driver path positionally is deprecated in Selenium 4 —
# confirm against the installed Selenium version.
driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [None]:
def get_translation(text: str):
    """Translate `text` (zh-CN -> en) by loading Google Translate in the Selenium `driver`.

    The caption is percent-encoded before it is placed in the URL, so characters such as
    `&`, `#`, `%` or `+` cannot truncate or corrupt the `text` query parameter.

    Args:
        text: Caption to translate.

    Returns:
        A `(success, payload)` tuple: `(True, translation)` when the page could be read,
        or `(False, error_message)` if anything raised along the way.
    """
    try:
        link = f"https://translate.google.com/?hl=de&sl=zh-CN&tl=en&text={quote(text, safe='')}&op=translate"
        driver.get(link)
        time.sleep(3)  # fixed pause after loading the page

        # Accept the cookie banner if it is shown (the page is requested with hl=de, hence German text).
        cookie_accept = driver.find_elements(by=By.XPATH, value='//span[contains(text(), "Alle akzeptieren")]')
        if cookie_accept:
            cookie_accept[0].click()
            time.sleep(2)

        # The translation is read from the element with CSS class "Q4iAWc".
        return True, driver.find_element(by=By.CLASS_NAME, value="Q4iAWc").text
    except Exception as e:
        # Best effort: hand the error back to the caller instead of aborting a long run.
        print("Failed with", e)
        return False, str(e)

In [None]:
# Translate every caption through Google Translate.
# `result` holds the translations in row order (None where a caption failed);
# `fail` maps the row position of each failed caption to its error message.
# Row positions, not index labels, are used so this also works when the index is not 0..n-1.
result = []
fail = {}
for pos, caption in enumerate(tqdm(df['caption'], total=df.shape[0])):
    success, translation = get_translation(caption)
    if success:
        result.append(translation)
    else:
        result.append(None)
        fail[pos] = translation
df['caption_eng'] = result
driver.quit()

# Postprocessing

## Hospital

In [None]:
# Root of the hospital dataset and its translated dev / eval label tables.
hroot = Path('/media/ts/SSD_ubuntu/datasets/AudioCapsHospital/')
hdf_dev = pd.read_csv(hroot.joinpath('labels/hospital_eng_dev.csv'))
hdf_eval = pd.read_csv(hroot.joinpath('labels/hospital_eng_eval.csv'))

In [None]:
# Rename files to match the labels: numeric stems are zero-padded to five digits
# (e.g. "12.mp4" -> "00012.mp4"). The directory listing is materialised first so
# the renames cannot interfere with the iteration.
for file in sorted((hroot / 'data').iterdir()):
    try:
        number = int(file.stem)
    except ValueError:
        # Not a numbered file; leave it untouched instead of aborting half-way.
        continue
    new_file = file.with_name(f'{number:05d}{file.suffix}')
    if new_file == file:
        continue  # already zero-padded
    if new_file.exists():
        # Renaming would silently replace the existing file; keep both and report it.
        print(f'Skipping {file.name}: {new_file.name} already exists')
        continue
    file.rename(new_file)

In [None]:
def process_df(df):
    """Collapse a label table to one row per file with all of its English captions.

    The columns `duration`, `caption`, `tokens` and `caption_index` are dropped, the
    `caption_eng` values are gathered into a list per `filename` (returned in a column
    named `captions`), and every filename is replaced by its stem plus `.mp4`.

    Args:
        df: Label table containing the columns listed above plus `filename` and `caption_eng`.

    Returns:
        A new DataFrame with the columns `filename` and `captions`; `df` itself is not modified.
    """
    def _as_mp4_name(f):
        return Path(f).stem + '.mp4'

    unused_columns = ['duration', 'caption', 'tokens', 'caption_index']
    captions_per_file = (
        df.drop(columns=unused_columns)
        .groupby(by='filename')['caption_eng']
        .apply(list)
        .reset_index()
        .rename(columns={'caption_eng': 'captions'})
    )
    captions_per_file.filename = captions_per_file.filename.apply(_as_mp4_name)
    return captions_per_file

# Collapse both hospital label tables to one row per file with a list of English captions.
# NOTE: the names are rebound to the processed frames, so re-running this cell without
# reloading the CSVs fails (process_df drops columns that are no longer present).
hdf_dev = process_df(hdf_dev)
hdf_eval = process_df(hdf_eval)

In [None]:
# Move every labelled dev file from data/ into train/ and drop the label rows that have no file.
# Safe to re-run: the directory may already exist, and a file that is already in train/
# keeps its label row instead of being reported as missing.
train_dir = hroot / 'train'  # not called `dir`, which would shadow the builtin
train_dir.mkdir(exist_ok=True)
not_exists = []
for i, filename in hdf_dev['filename'].items():
    name = Path(filename).name  # base name only, in case a directory prefix was already added
    source = hroot / 'data' / name
    if source.exists():
        source.rename(train_dir / name)
    elif not (train_dir / name).exists():
        not_exists.append(i)
hdf_dev = hdf_dev.drop(index=not_exists)

In [None]:
# The check below looks for files in val/, which was previously created by renaming data/ by hand
# (a commented-out one-off step). Do the rename here when val/ does not exist yet, so that a
# fresh run does not drop every eval row; it is a no-op once val/ is in place.
data_dir = hroot / 'data'
val_dir = hroot / 'val'
if data_dir.exists() and not val_dir.exists():
    data_dir.rename(val_dir)

# Drop the eval label rows whose file is not present in val/.
not_exists = [
    i for i, filename in hdf_eval['filename'].items()
    if not (val_dir / Path(filename).name).exists()
]
hdf_eval = hdf_eval.drop(index=not_exists)

In [None]:
# Delete every file in val/ that has no eval label.
# The labelled names are collected once into a set instead of rebuilding a list per file.
# Only base names are compared, so a 'val/' prefix added to the filenames by a later cell
# cannot make every file look unlabelled (and get deleted) when this cell is re-run.
labelled_names = {Path(name).name for name in hdf_eval.filename}
for file in list((hroot / 'val').iterdir()):
    if file.name not in labelled_names:
        file.unlink()

In [None]:
# Store the file paths relative to the dataset root ('train/<name>' and 'val/<name>').
# Only the base name is prefixed, so re-running the cell does not produce 'train/train/...'.
hdf_dev.filename = hdf_dev.filename.apply(lambda f: str(Path('train') / Path(f).name))
hdf_eval.filename = hdf_eval.filename.apply(lambda f: str(Path('val') / Path(f).name))

In [None]:
# Write both hospital annotation tables as parquet files under the dataset root.
for split_df, parquet_name in ((hdf_dev, 'annot_train.parquet'), (hdf_eval, 'annot_val.parquet')):
    split_df.to_parquet(hroot / parquet_name, index=False)

## Car

In [None]:
# Root of the car dataset; both label tables are reduced with process_df right after loading.
croot = Path('/media/ts/SSD_ubuntu/datasets/AudioCapsCar/')
cdf_dev = process_df(pd.read_csv(croot.joinpath('labels', 'car_eng_dev.csv')))
cdf_eval = process_df(pd.read_csv(croot.joinpath('labels', 'car_eng_eval.csv')))

In [None]:
def _move_labeled_files(labels, source_dir, target_dir):
    """Move the files named in `labels.filename` from `source_dir` into `target_dir`.

    Args:
        labels: DataFrame with a `filename` column.
        source_dir: Directory the files are moved out of.
        target_dir: Directory the files are moved into; created if it does not exist.

    Returns:
        `labels` without the rows whose file exists in neither directory. Rows whose file
        is already in `target_dir` are kept, so the call can be repeated safely.
    """
    target_dir.mkdir(exist_ok=True)
    missing = []
    for idx, filename in labels['filename'].items():
        name = Path(filename).name  # base name only, in case a directory prefix was already added
        source = source_dir / name
        target = target_dir / name
        if source.exists():
            source.rename(target)
        elif not target.exists():
            missing.append(idx)
    return labels.drop(index=missing)


# Split the car files into train/ (dev labels) and val/ (eval labels); label rows
# without a file are dropped from the respective table.
train_data_dir = croot / 'train'
val_data_dir = croot / 'val'

cdf_dev = _move_labeled_files(cdf_dev, croot / 'data', train_data_dir)
cdf_eval = _move_labeled_files(cdf_eval, croot / 'data', val_data_dir)

In [None]:
# Store the file paths relative to the dataset root ('train/<name>' and 'val/<name>').
# Only the base name is prefixed, so re-running the cell does not produce 'train/train/...'.
cdf_dev.filename = cdf_dev.filename.apply(lambda f: str(Path('train') / Path(f).name))
cdf_eval.filename = cdf_eval.filename.apply(lambda f: str(Path('val') / Path(f).name))

In [None]:
# Write both car annotation tables as parquet files under the dataset root.
for split_df, parquet_name in ((cdf_dev, 'annot_train.parquet'), (cdf_eval, 'annot_val.parquet')):
    split_df.to_parquet(croot / parquet_name, index=False)