In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# NOTE: `device` is not passed to `from_pretrained` below -- weight placement is
# delegated to `device_map="auto"`. The name is kept in case another cell reads it.
device = "cuda"

# Single source of truth for the checkpoint, so the model and its tokenizer
# can never be loaded from two different names by accident.
MODEL_ID = "Qwen/Qwen2-72B-Instruct-GPTQ-Int4"  # the quantized model

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [14]:
model.eval()

def get_outputs(messages, rejected_text):
    """Generate a short replacement assistant reply for a dialogue history.

    The history is rendered as plain ``role: content`` lines under a fixed
    Korean instruction prompt, followed by an ``assistant: `` cue, and the
    model continues from there.

    Args:
        messages: Iterable of dicts with ``'role'`` and ``'content'`` keys;
            rendered in order as the dialogue history.
        rejected_text: The reply being replaced. Accepted for interface
            compatibility but currently NOT included in the prompt.

    Returns:
        The first line of the generated continuation, cut at the first
        ``"##"`` marker and stripped of surrounding whitespace.
    """
    PROMPT = '''## Instruction: \n당신은 친절한 전문가 AI 어시스턴트 입니다. 사용자와 어시스턴트의 대화 히스토리가 제공됩니다. 히스토리의 문맥을 파악하여 적절히 대답하세요. 답변은 두 문장 이내로 대답하세요.\n'''

    text = PROMPT+'## Dialogue history: \n'
    for message in messages:
        text += f"{message['role']}: {message['content']}\n"
    text += "## Answer : "
    text += "assistant: "

    # Keep the whole encoding (input_ids + attention_mask) so generate() gets an
    # explicit mask instead of having to infer one.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]
    terminators = [
        tokenizer.eos_token_id,
    ]

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        eos_token_id=terminators,
        num_beams=3,
        # Was 0.5: values below 1.0 *reward* tokens that already appeared
        # (prompt included), which encourages copying/looping. Values above
        # 1.0 apply the intended mild penalty.
        repetition_penalty=1.1,
        do_sample=True,
        temperature=0.6,
        top_p=0.9
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    outputs = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep a single-line answer: drop anything after a new "##" section or a newline.
    outputs = outputs.split("##")[0].split('\n')[0].strip()
    return outputs

In [None]:
from pathlib import Path
import json
import re
from tqdm import tqdm
import torch

# Resolve input/output locations from the shared settings file. The dataset and
# source names are taken from the current working directory and its parent.
# (These statements were previously fused onto a single line, which is a SyntaxError.)
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'
splits = ['train', 'valid']
tasks = ['dialog', 'dpo', '대화 요약']
# One output directory per task under <preprocessed_dir>/preprocessed_task.
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(exist_ok=True)
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(exist_ok=True)
    
#### prepare for task preprocess
# Source speaker types mapped onto the role names used in the output messages.
speaker_type_map = dict(human='user', bot='assistant')
def get_new_eval(eval_dicts):
    """Decide whether an utterance passes, given its evaluation records.

    Counts how many tracked criteria were answered ``'no'`` across all
    records in ``eval_dicts``; criteria outside the tracked set are ignored.

    Returns:
        ``True`` when there are no records or at most two ``'no'`` answers,
        ``False`` once there are three or more.
    """
    if not len(eval_dicts):
        return True

    tracked = (
        'linguistic_acceptability',
        'consistency',
        'unbias',
        'harmlessness',
        'no_hallucination',
    )
    negatives = sum(
        1
        for record in eval_dicts
        for criterion, verdict in record.items()
        if criterion in tracked and verdict == 'no'
    )
    return negatives <= 2

#### prepare for task preprocess end
#### task preprocess
def _build_messages(record):
    """Turn one source record into a list of messages tagged with an 'eval' flag.

    Each utterance's speaker id is resolved to a role through the record's
    speaker metadata and `speaker_type_map`; 'eval' is the result of
    `get_new_eval` on that utterance's evaluations.
    """
    speakers = {
        s['id']: speaker_type_map[s['speaker_type']]
        for s in record['metadata']['speakers']
    }
    return [
        {
            'role': speakers[utt['speaker_id']],
            'content': utt['utterance_text'],
            'eval': get_new_eval(utt['utterance_evaluation']),
        }
        for utt in record['utterances']
    ]


def _repair_dialogue(messages, dpo_file):
    """Rebuild a dialogue, replacing failed assistant turns with generated ones.

    User turns and assistant turns whose 'eval' flag is true are kept as-is.
    For every other turn a new reply is generated from the history so far, a
    (conversations, chosen, rejected) record is written to `dpo_file`, and the
    generated reply takes the rejected turn's place in the returned history.
    The 'eval' key is removed from every message.
    """
    new_messages = []
    for message in messages:
        message_eval = message.pop('eval')
        if message['role'] == 'user' or message_eval:
            new_messages.append(message)
            continue

        rejected_text = message['content']
        new_message = get_outputs(new_messages, rejected_text)
        chosen = {'role': speaker_type_map['bot'], 'content': new_message}
        rejected = {'role': speaker_type_map['bot'], 'content': rejected_text}

        # The history is empty when the very first turn is a rejected assistant
        # turn; indexing [-1] unconditionally would raise IndexError there.
        if new_messages:
            print(new_messages[-1]['content'])
        print('-', new_message)

        # Serialised before `chosen` is appended, so 'conversations' holds only
        # the context that preceded the rejected turn.
        data = {'conversations': new_messages, 'chosen': chosen, 'rejected': rejected}
        dpo_file.write(json.dumps(data, ensure_ascii=False)+'\n')
        new_messages.append(chosen)
    return new_messages


with torch.no_grad():
    for split in splits:
        source_data_dir_split = source_data_dir/split
        task_files = []
        try:
            # Opened one by one inside the try so that a failure part-way
            # through still closes the handles that were already opened.
            for task in tasks:
                task_files.append((task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8'))
            # Same order as `tasks`: dialog, dpo, summary.
            dialog_file, dpo_file, summary_file = task_files

            # Sorted so the output line order does not depend on directory listing order.
            for source_path in tqdm(sorted(source_data_dir_split.iterdir()), desc=split):
                # Read as UTF-8 to match the encoding used for the output files,
                # and close each source file as soon as it has been consumed.
                with source_path.open(encoding='utf-8') as source_file:
                    for raw_line in source_file:
                        if not raw_line.strip():
                            continue  # tolerate blank lines in the JSONL input
                        line = json.loads(raw_line)

                        #### data preprocess
                        messages = _build_messages(line)
                        summary = line['conversation_summary']
                        #### data preprocess end

                        #### dialog
                        new_messages = _repair_dialogue(messages, dpo_file)
                        data = {'conversations': new_messages}
                        dialog_file.write(json.dumps(data, ensure_ascii=False)+'\n')
                        #### dialog end

                        #### 대화 요약
                        data = {'input': new_messages, 'output': summary}
                        summary_file.write(json.dumps(data, ensure_ascii=False)+'\n')
                        #### 대화 요약 end
        finally:
            for task_file in task_files:
                task_file.close()

#### task preprocess end