In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Candidate judge checkpoints. Only `model_id` is actually loaded below; the two
# Llama-3 ids are kept for reference (a separate load of `model_id_zh` for the
# Chinese judge used to live here as commented-out code).
model_id_en = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id_zh = "shenzhi-wang/Llama3-8B-Chinese-Chat"
model_id = "google/gemma-2-27b-it"

# Tokenizer pads on the left; the model is loaded in bfloat16 with every module
# mapped to device 0.
tokenizer_en = AutoTokenizer.from_pretrained(model_id, padding_side='left')
model_en = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={'': 0})

# The "zh" judge is an alias of the very same tokenizer/model objects.
tokenizer_zh, model_zh = tokenizer_en, model_en

# Switch to inference mode; as the cell's last expression this also displays the model.
model_en.eval()


Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 4608, padding_idx=0)
    (layers): ModuleList(
      (0-45): 46 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=4608, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4608, out_features=2048, bias=False)
          (v_proj): Linear(in_features=4608, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4608, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=4608, out_features=36864, bias=False)
          (up_proj): Linear(in_features=4608, out_features=36864, bias=False)
          (down_proj): Linear(in_features=36864, out_features=4608, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm()
        (post_attention_layernorm): Gemma2RMSNorm()
        (pre_feedforward_l

In [2]:
# Reuse each model's configured EOS token id as its tokenizer's padding token id.
for _tokenizer, _model in ((tokenizer_en, model_en), (tokenizer_zh, model_zh)):
    _tokenizer.pad_token_id = _model.config.eos_token_id

In [3]:
from pathlib import Path
import json
import re
from tqdm import tqdm
from torch.utils.data import Dataset

# Resolve the data locations: the settings file supplies the root directory of the
# preprocessed data, while the source and dataset names are taken from the last two
# components of the current working directory.
settings = json.loads(Path('../../../settings.json').read_text())
preprocessed_data_path = Path(settings['preprocessed_data_path'])
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'
splits = ['train', 'valid']
# The position of a task in this list is the index used for its output file later on.
tasks = ['LM(한국어)', 'LM(영어)', 'LM(중국어)', '번역(한-영)', '번역(한-중)', '번역(영-한)', '번역(중-한)']
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(exist_ok=True)
# One output directory per task.
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(exist_ok=True)

#### prepare for task preprocess
# Sentences already written to each LM task, used for de-duplication.
sentence_set_kr = set()
sentence_set_en = set()
sentence_set_zh = set()
# Matches markers of the form &word& (lower-case letters between ampersands).
regex_pattern = re.compile(r'&[a-z]*&')
# Few-shot prompts for the translation judge; {input} is the source sentence and
# {output} the candidate translation. Both prompts demonstrate Yes/No answers.
input_prompt_text_en = '''You are a translation classifier. Given two sentences, judge whether the translation was successful with yes or no. Here are some samples:
Source: 인터넷에서 검색하니 지금 대원을 모집하고 있어요.
Target: When I search on the Internet, it's recruiting members.
Judge: Yes
    
Source: 음, 나는 바르셀로나는 무조건 추천한다.
Target: Okay, now shall we add the eclair cream in advance?
Judge: No

Source: {input}
Target: {output}
Judge:'''

input_prompt_text_zh = '''你是一个翻译分类器。给定两个句子,判断翻译是否成功,用"是"或"否"来回答。 以下是一些示例。:
Source: 인터넷에서 검색하니 지금 대원을 모집하고 있어요.
Target: 我在网上查了一下，他们现在正在招募会员。
Judge: Yes
    
Source: 음, 나는 바르셀로나는 무조건 추천한다.
Target: 好的，现在我们要提前添加闪电泡芙奶油吗？
Judge: No

Source: {input}
Target: {output}
Judge:'''

#### prepare for task preprocess end

def _write_unique_sentences(utterances, seen_sentences, out_file):
    """Write every not-yet-seen utterance text to `out_file` as one JSON line.

    Utterances without a 'text' key are skipped. Each written line has the form
    {"text": <utterance text>}; the text is added to `seen_sentences` so that the
    same sentence is never written twice for that set.
    """
    for utt in utterances:
        if 'text' not in utt:
            continue
        text = utt['text']
        if text in seen_sentences:
            continue
        seen_sentences.add(text)
        out_file.write(json.dumps({'text': text}, ensure_ascii=False)+'\n')


# Number of leading utterance pairs shown to the judge per dialog, and the number
# of "No" verdicts that must be exceeded for the whole dialog to be rejected.
# NOTE(review): dialogs with 8 or fewer pairs can never exceed the threshold — confirm intent.
JUDGE_SAMPLE_SIZE = 16
JUDGE_MAX_NO = 8

# Dialog language -> (de-duplication set, index of the LM task in `tasks`).
lm_targets = {
    '한국어': (sentence_set_kr, 0),
    '영어': (sentence_set_en, 1),
    '중국어': (sentence_set_zh, 2),
}

# topic key -> {dialog language: utterances}; an entry is removed as soon as both
# language versions of the dialog have been seen.
topic_dialog = {}
#### task preprocess
for split in splits:
    source_data_dir_split = source_data_dir/split
    task_files = [(task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8') for task in tasks]
    try:
        # Order the source files by the number following 'L' in the file name
        # (e.g. TL2.jsonl -> 2). Only the file name is parsed, so an 'L' in a
        # parent directory cannot disturb the ordering.
        source_paths = sorted(source_data_dir_split.iterdir(),
                              key=lambda x: int(x.name.split('L')[1].split('.')[0]))
        for source_path in tqdm(source_paths, desc=split):
            with source_path.open() as source_data:
                for line in source_data:
                    line = json.loads(line)
                    dialog_language = line['typeInfo']['language']

                    #### LM tasks (Korean / English / Chinese)
                    if dialog_language in lm_targets:
                        seen_sentences, lm_task_id = lm_targets[dialog_language]
                        _write_unique_sentences(line['dialogs'], seen_sentences, task_files[lm_task_id])
                    #### LM tasks end

                    topic = line['typeInfo']['topic'] + ' ' + line['typeInfo']['language_pair']
                    if topic not in topic_dialog:
                        topic_dialog[topic] = {}

                    # Keep only utterances that carry a non-empty text.
                    dialogs = [utt for utt in line['dialogs'] if 'text' in utt and utt['text'] != '']
                    topic_dialog[topic][dialog_language] = dialogs

                    # Translation pairs need both language versions of the dialog.
                    if len(topic_dialog[topic]) != 2:
                        continue

                    # NOTE(review): if neither language is present, the settings of the
                    # previous dialog are silently reused — confirm this cannot happen.
                    if '영어' in topic_dialog[topic]:
                        task_ids = [3,5]
                        prompt = input_prompt_text_en
                        judge_model = model_en
                        judge_tokenizer = tokenizer_en
                    elif '중국어' in topic_dialog[topic]:
                        task_ids = [4,6]
                        prompt = input_prompt_text_zh
                        judge_model = model_zh
                        judge_tokenizer = tokenizer_zh

                    # Sorting by language label puts '영어'/'중국어' before '한국어',
                    # so the first entry is the foreign side and the second the Korean side.
                    (_, foreign_utts), (_, kr_utts) = sorted(topic_dialog[topic].items())
                    topic_dialog.pop(topic)

                    # Utterances are paired by position, so both sides must align.
                    if len(foreign_utts) != len(kr_utts):
                        continue

                    sample_datas = [{'input': src['text'], 'output': tgt['text']}
                                    for src, tgt in zip(kr_utts[:JUDGE_SAMPLE_SIZE], foreign_utts[:JUDGE_SAMPLE_SIZE])]
                    # Nothing to judge and nothing to write for an empty dialog.
                    if not sample_datas:
                        continue

                    # Build the judge inputs from the prompt selected for this language pair.
                    input_samples = [prompt.format(**data) for data in sample_datas]
                    # NOTE(review): the whole sample is sent as one padded batch; the CUDA
                    # out-of-memory failure seen with this cell is presumably triggered by
                    # very long utterances — consider a smaller batch or truncation.
                    input_ids = judge_tokenizer(input_samples, return_tensors='pt', padding=True).to(judge_model.device)

                    with torch.no_grad():
                        outputs = judge_model.generate(**input_ids,
                                                       max_new_tokens=1,
                                                       pad_token_id=judge_tokenizer.eos_token_id)

                    # The verdict is whatever follows the last 'Judge:' in the decoded text.
                    judges = judge_tokenizer.batch_decode(outputs, skip_special_tokens=True)
                    judges_only = [judge.split('Judge:')[-1].strip() for judge in judges]

                    # Drop the whole dialog when too many sampled pairs are judged wrong.
                    if sum(judge == 'No' for judge in judges_only) > JUDGE_MAX_NO:
                        print(source_data.name, topic)
                        continue

                    for utt_en, utt_kr in zip(foreign_utts, kr_utts):
                        # Skip pairs where only one side contains an &...& marker.
                        if bool(regex_pattern.findall(utt_kr['text'])) != bool(regex_pattern.findall(utt_en['text'])):
                            continue
                        #### translation Korean -> foreign
                        data = {'input': utt_kr['text'], 'output': utt_en['text']}
                        task_files[task_ids[0]].write(json.dumps(data, ensure_ascii=False)+'\n')
                        #### translation Korean -> foreign end

                        #### translation foreign -> Korean
                        data['input'], data['output'] = data['output'], data['input']
                        task_files[task_ids[1]].write(json.dumps(data, ensure_ascii=False)+'\n')
                        #### translation foreign -> Korean end
    finally:
        # Close the task files even if processing fails part-way through.
        for task_file in task_files:
            task_file.close()

#### task preprocess end

train:   0%|          | 0/38 [00:00<?, ?it/s]

preprocessed/train/TL2.jsonl 방송_애니메이션_0420 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0421 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0091 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0110 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0051 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0054 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0367 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0262 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0297 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0181 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0270 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0271 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0277 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0279 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0280 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0018 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0015 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0060 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0195 ko-en
preprocessed/train/TL2.jsonl 방송_애니메이션_0411 ko-en
preprocessed/train/T

train:   3%|▎         | 1/38 [03:44<2:18:26, 224.50s/it]

preprocessed/train/TL3.jsonl 방송_영화드라마_1213 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0866 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1257 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0936 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0083 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0855 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0853 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1331 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1138 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1139 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1135 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0524 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1328 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0995 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0740 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1214 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_0926 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1271 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1162 ko-en
preprocessed/train/TL3.jsonl 방송_영화드라마_1206 ko-en
preprocessed/train/T

train:   3%|▎         | 1/38 [14:40<9:02:42, 880.06s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.70 GiB. GPU 0 has a total capacity of 79.15 GiB of which 2.01 GiB is free. Including non-PyTorch memory, this process has 77.11 GiB memory in use. Of the allocated memory 70.56 GiB is allocated by PyTorch, and 6.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Call exit with status 0 once the cells above have run
# (presumably to end the session and release the GPU — TODO confirm intent).
exit(0)