**Installing Required Libraries**

In [2]:
#Installing Required Libraries
!pip install transformers sentencepiece --upgrade
!pip install torch
!pip install pandas

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
[31mERROR: Operation cancelled by user[0m[31m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-n

**Use this code if the raw data is too big**

In [None]:
# Use this code if the raw data is too big
import pandas as pd
!pip install scikit-learn
from sklearn.model_selection import train_test_split

# file path of your raw data
file_path = "/root/data/train+test+valid.csv"

df = pd.read_csv(file_path)

# first split : 5: (3+2)
part_1, part_tmp = train_test_split(df, test_size=0.5, random_state=42, shuffle=True)

# second split : part_tmp → 3:2
part_2, part_3 = train_test_split(part_tmp, test_size=0.4, random_state=42, shuffle=True)
# (0.4 = 2 / (3+2))

# saving
part_1.to_csv("/root/data/50.csv", index=False)
part_2.to_csv("/root/data/30.csv", index=False)
part_3.to_csv("/root/data/20.csv", index=False)

print("Done spliting and saving !")
print("1.csv rows:", len(part_1))
print("2.csv rows:", len(part_2))
print("3.csv rows:", len(part_3))


**Creating Translationese Column to Our Raw Data**

In [None]:
# Creating Translationese Column to Our Raw Data
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd
from tqdm import tqdm

# Choose the compute device, then load the tokenizer and the seq2seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "NHNDQ/nllb-finetuned-en2ko"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# FP16 weights are used only when a GPU is available; on CPU the model
# keeps the precision it was loaded with. Either way it is moved to `device`.
use_half_precision = device.type == "cuda"
model = (model.half() if use_half_precision else model).to(device)

# Fast batch translation function
def translate_batch_fast(text_list, batch_size=1024):
    """Translate a list of texts in fixed-size batches.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The
    tokenizer source language is set to ``eng_Latn`` and generation is
    forced to start with the ``kor_Hang`` token.

    Args:
        text_list: Texts to translate; sliced into consecutive batches.
        batch_size: Number of texts sent through the model per step.

    Returns:
        A list with one decoded string per input text, in input order.
    """
    tokenizer.src_lang = "eng_Latn"
    target_lang_id = tokenizer.convert_tokens_to_ids("kor_Hang")

    generation_options = dict(
        forced_bos_token_id=target_lang_id,
        max_length=512,
        num_beams=1,      # greedy decoding for speed
        use_cache=True,
    )

    translations = []
    for start in tqdm(range(0, len(text_list), batch_size)):
        chunk = text_list[start:start + batch_size]

        # Pad to the longest text in the chunk; inputs longer than 512 tokens are cut.
        model_inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)

        # Inference only: no gradients are tracked during generation.
        with torch.no_grad():
            generated = model.generate(**model_inputs, **generation_options)

        translations += tokenizer.batch_decode(generated, skip_special_tokens=True)

    return translations

# Load full dataset
# split_name selects which split file is translated; input and output paths
# are both derived from it so they cannot drift apart.
# NOTE: the data-filtering cell reads "/root/data/50_translated.csv" —
# set split_name = "50" and re-run this cell before filtering that split.
split_name = "20"
input_path = f"/root/data/{split_name}.csv"
output_path = f"/root/data/{split_name}_translated.csv"

df = pd.read_csv(input_path)
# Missing texts become empty strings; astype(str) guarantees the tokenizer
# only ever receives str values.
text_list = df["en"].fillna("").astype(str).tolist()

# Run full translation
translated_ko = translate_batch_fast(text_list, batch_size=128)

# Append translation results and save
df["translated_ko"] = translated_ko
df.to_csv(output_path, index=False)

print("Translation completed! Saved to:", output_path)

**Data Filtering**

In [None]:
# Data Filtering
import pandas as pd
import re
from bert_score import score
from tqdm import tqdm

# ====== 1. Load the CSV file ======
file_path = "/root/data/50_translated.csv"
df = pd.read_csv(file_path)
print(f" Original number of rows: {len(df)}")

# ====== 2. Filter by minimum sentence length ======
# Rows whose "ko" text is shorter than the reference sentence are dropped.
min_len = len("내시경하는 기분이야.")  # 11 characters
before = len(df)
ko_lengths = df["ko"].astype(str).map(len)
df = df[ko_lengths >= min_len]
after = len(df)
print(f" Length filtering: {before - after} rows removed → {after} rows remaining")

# ====== 3. Filter by allowed characters ======
# Allowed characters: Korean, English, numbers, whitespace, and selected symbols
allowed_pattern = re.compile(r'^[\uAC00-\uD7A3a-zA-Z0-9\s!@#$%^&*()_\-+={}\[\]|\\:;"\'<>,.?/~`]+$')

def is_valid(text):
    """Return True when ``text`` consists only of allowed characters.

    The value is converted to ``str`` and stripped before matching, so
    surrounding whitespace is ignored. Empty text is invalid because the
    pattern requires at least one character.

    Missing values (``None``, ``NaN``, ``pd.NA``) are rejected explicitly.
    Without this check they would be stringified to "None"/"nan" and pass
    the filter; an empty cell read back from CSV arrives here as NaN.
    """
    if pd.isna(text):
        return False
    return bool(allowed_pattern.fullmatch(str(text).strip()))

before = len(df)
# Check every cell of the three text columns; a row is kept only if all
# three pass. Series.map is applied per column because DataFrame.applymap
# is deprecated in recent pandas, while this form works on old and new versions.
text_columns = df[["ko", "en", "translated_ko"]]
mask_valid = text_columns.apply(lambda col: col.map(is_valid)).all(axis=1)
df = df[mask_valid].reset_index(drop=True)
after = len(df)
print(f" Character filtering: {before - after} rows removed → {after} rows remaining")

# ====== 4. Compute BERTScore (F1) ======
# Local import: only `score` is imported at the top of this cell.
from bert_score import BERTScorer

ko_texts = df["ko"].astype(str).tolist()
translated_texts = df["translated_ko"].astype(str).tolist()
batch_size = 1024
f1_scores = []

# Build the scorer once and reuse it for every chunk, instead of calling the
# module-level score() per chunk, which sets the model up again on each call.
scorer = BERTScorer(lang='ko', device='cuda')

print(" Calculating BERTScore F1 (using GPU + batching)...")
for i in tqdm(range(0, len(df), batch_size), desc="Progress", unit="batch"):
    batch_ko = ko_texts[i:i+batch_size]
    batch_trans = translated_texts[i:i+batch_size]
    try:
        # Candidates are the machine translations, references the original Korean.
        _, _, F1 = scorer.score(batch_trans, batch_ko, verbose=False)
    except RuntimeError as e:
        # Only show the memory hint for real out-of-memory failures; other
        # runtime errors propagate with their own message.
        if "out of memory" in str(e).lower():
            print(f"\n GPU memory error! Try reducing batch_size from {batch_size}.")
        raise
    f1_scores.extend(F1.tolist())

# ====== 5. Filter by F1 score threshold ======
# Rows are kept only when their BERTScore F1 is strictly above this value.
f1_threshold = 0.9
df["f1"] = f1_scores
before = len(df)
df_filtered = df[df["f1"] > f1_threshold].drop(columns=["f1"])
after = len(df_filtered)
print(f" BERTScore filtering (F1 ≤ {f1_threshold}): {before - after} rows removed → {after} rows remaining")

# ====== 6. Save the filtered results ======
output_path = "/root/data/50_translated_filtered.csv"
df_filtered.to_csv(output_path, index=False)

# ====== 7. Summary output ======
# `df` here is the frame left after the length and character filters, so the
# percentage is relative to that, not to the originally loaded file.
# Guard the division: df can be empty if the earlier filters removed every row.
kept_pct = len(df_filtered) / len(df) * 100 if len(df) else 0.0
print("\n Final results saved!")
print(f" {len(df_filtered)} out of {len(df)} rows remaining ({kept_pct:.2f}%)")
print(f" Saved to: {output_path}")

