In [None]:
from datasets import Dataset
import os
import json
from tqdm import tqdm

In [None]:
# Directory holding the raw JSON files (one file per question).
raw_data_dir = './data'

# Marker substrings for boilerplate lines in answers (greetings, self-promotion,
# credentials, contact requests, URLs, ...). `remove_trivials` drops every answer
# line containing one of these markers.
# NOTE: matching is done after removing spaces from the line, so every entry is
# written WITHOUT spaces (e.g. '유선으로상담', not '유선으로 상담').
trivial_words = {
    '작성됨',
    '법무실장',
    '전문변호사',
    '대한변호사협회',
    '대형로펌',
    '조력해드',
    '대표변호사',
    '파트너변호사',
    '검사출신',
    '가사전문',
    '형사전문',
    '이혼전문',
    '국방부',
    '군법무관',
    '지방검찰청',
    '의정부지검',
    '대전지검',
    '지검검사',
    '검찰부장',
    '부장검사',
    '바로상담',
    '전담TF팀',
    '전담팀',
    '성공사례',
    '해결사례',
    '진행한경험',
    '성공한경험',
    '해결한경험',
    '대표번호',
    '도와드리고',
    '변호사입니다',
    '찾아드리겠습니다',
    '도움드리겠습니다',
    '설명드리겠습니다',
    '돕겠습니다',
    '책임집니다',
    '풍부한경험',
    '대변합니다',
    '재산분할전문',
    '연락주세요',
    '안녕하세요',
    '유선상담',
    '전화상담',
    '비밀보장',
    '검토요청',
    '검토신청',
    '모든절차',
    '최고의서비스',
    '함께하겠습니다',
    '드리겠습니다',
    '사명감',
    '의뢰인의',
    '검증된',
    '보다구체적인',
    '연락부탁드립니다',
    '연락주시길',
    '연락주시기',
    '연락주십시오',
    '바있습니다',
    '법률사무소',
    '군인권자문변호사',
    '군인권변호사',
    '변호사약력',
    '방송출연',
    '방송다회출연',
    '처리합니다',
    '주시기바랍니다',
    '연락주시면',
    '년차변호사',
    '감사합니다',
    '신뢰할수있는',
    '법무팀',
    '출신변호사',
    'https',
    'www',
    '서울대학교',
    '고려대학교',
    '로스쿨',
    '법학과',
    '우수사례선정',
    '변호사드림',
    '배상전문',
    '사건다수',
    '다수의승소사례',
    '후기를확인',
    '수행경험',
    '법무법인',
    '채택부탁',
    '유선으로상담',
    '사법시험',
    '사법연수원',
    '강력한전문성',
    '전문성보유',
    '서울남부',
    '서울동부',
    '서울서부',
    '서울북부',
    '서울가정법원',
    '채널운영',
    '유튜브운영',
    '거품없는비용',
    '사무장',
    '겸임교수',
    '변호사가직접',
}

def remove_trivials(answer, words=None):
    """Drop boilerplate lines from an answer.

    The answer is split on newlines. Each line is stripped; blank lines are
    discarded, and so is every line that contains one of the marker substrings
    once its spaces are removed. The surviving (stripped) lines are re-joined
    with newlines.

    Args:
        answer: Answer text. ``None`` or an empty string yields ``''``.
        words: Iterable of marker substrings (written without spaces).
            Defaults to the module-level ``trivial_words`` set.

    Returns:
        The cleaned answer as a single newline-joined string.
    """
    if not answer:
        return ''
    if words is None:
        words = trivial_words

    kept_lines = []
    for line in answer.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Compute the space-free form once per line (not once per marker) so
        # that markers also match when the author inserted spaces inside them.
        compact = line.replace(' ', '')
        # any() stops at the first matching marker.
        if not any(word in compact for word in words):
            kept_lines.append(line)

    return '\n'.join(kept_lines)

In [5]:
def create_qna_dataset(data_dir=None, min_answer_len=100):
    """Build a flat question/answer dataset from the raw JSON files.

    Every non-hidden file in the data directory is parsed as JSON and must
    provide the keys ``question_title``, ``categories``, ``question_body`` and
    ``answers``. Each answer is cleaned with ``remove_trivials``; answers whose
    cleaned text is longer than ``min_answer_len`` characters produce one row.

    Args:
        data_dir: Directory to read from. Defaults to the module-level
            ``raw_data_dir``.
        min_answer_len: A cleaned answer is kept only if ``len(answer)`` is
            strictly greater than this value.

    Returns:
        A ``Dataset`` with columns ``category``, ``question_title``,
        ``question_body`` and ``answer``.
    """
    if data_dir is None:
        data_dir = raw_data_dir

    # os.listdir returns entries in arbitrary order; sort so that the row order
    # of the resulting dataset is the same on every run and every machine.
    fnames = sorted(f for f in os.listdir(data_dir) if not f.startswith('.'))

    categories = []
    question_titles = []
    question_bodies = []
    answers = []
    for fname in tqdm(fnames):
        # Explicit encoding: without it the platform locale decides how the
        # Korean text is decoded. Assumes the files are UTF-8 (the JSON
        # interchange default) -- TODO confirm against the crawler output.
        with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
            obj = json.load(f)

        question_title = obj['question_title']
        category = obj['categories']
        question_body = obj['question_body']

        for raw_answer in obj['answers']:
            answer = remove_trivials(raw_answer)
            if len(answer) > min_answer_len:
                categories.append(category)
                question_titles.append(question_title)
                question_bodies.append(question_body)
                answers.append(answer)

    return Dataset.from_dict({
        'category': categories,
        'question_title': question_titles,
        'question_body': question_bodies,
        'answer': answers,
    })

# Build the Q&A dataset and upload it to the Hub as a private dataset repo.
qna_dataset = create_qna_dataset()
# The variable was originally misspelled `qna_dataet`; keep the old name as an
# alias so any cell still referring to it continues to work.
qna_dataet = qna_dataset
qna_dataset.push_to_hub('lawtalk-qna', private=True)

100%|██████████| 35799/35799 [00:15<00:00, 2340.85it/s]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yainage90/lawtalk-qna/commit/2e3a2dd9eeec34413f29d4e5ed5bf7e94d076013', commit_message='Upload dataset', commit_description='', oid='2e3a2dd9eeec34413f29d4e5ed5bf7e94d076013', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
def create_instruction_dataset(data_dir=None, min_answer_len=100, test_size=0.05, seed=2024):
    """Build an instruction-formatted text dataset with a train/test split.

    Every non-hidden file in the data directory is parsed as JSON and must
    provide the keys ``question_body`` and ``answers``. Each answer is cleaned
    with ``remove_trivials``; answers whose cleaned text is longer than
    ``min_answer_len`` characters are rendered as
    ``<s>[INST] {question} [/INST] {answer} </s>``.

    Args:
        data_dir: Directory to read from. Defaults to the module-level
            ``raw_data_dir``.
        min_answer_len: A cleaned answer is kept only if ``len(answer)`` is
            strictly greater than this value.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``seed``.

    Returns:
        The result of ``Dataset.train_test_split`` on a single-column
        (``text``) dataset.
    """
    if data_dir is None:
        data_dir = raw_data_dir

    # os.listdir returns entries in arbitrary order. The seeded split below is
    # only reproducible if the rows arrive in a fixed order, so sort the names.
    fnames = sorted(f for f in os.listdir(data_dir) if not f.startswith('.'))

    sentences = []
    for fname in tqdm(fnames):
        # Explicit encoding: without it the platform locale decides how the
        # Korean text is decoded. Assumes the files are UTF-8 (the JSON
        # interchange default) -- TODO confirm against the crawler output.
        with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
            obj = json.load(f)

        question = obj['question_body']
        for raw_answer in obj['answers']:
            answer = remove_trivials(raw_answer)
            if len(answer) > min_answer_len:
                sentences.append(f'<s>[INST] {question} [/INST] {answer} </s>')

    return Dataset.from_dict({'text': sentences}).train_test_split(test_size=test_size, seed=seed)

# Build the train/test instruction splits, then upload them to the Hub as a
# private dataset repo.
instruction_dataset = create_instruction_dataset()
instruction_dataset.push_to_hub(repo_id='lawtalk-qna-instruction', private=True)