# 1. Load dataset

### Dataset: 민원(콜센터) 질의-응답 데이터
### Using: the Training and Validation splits of the labeled data listed below
    ├─라벨링데이터_220121_add
            │  │  ├─금융보험
            │  │  │  ├─민원(콜센터) 질의응답_금융보험_사고 및 보상 문의_Training.zip | 4 MB | 174452
            │  │  │  ├─민원(콜센터) 질의응답_금융보험_상품 가입 및 해지_Training.zip | 4 MB | 174453
            │  │  │  ├─민원(콜센터) 질의응답_금융보험_이체출금대출서비스_Training.zip | 4 MB | 174454
            │  │  │  └─민원(콜센터) 질의응답_금융보험_잔고 및 거래내역_Training.zip | 3 MB | 174455
            │  │  ├─다산콜센터
            │  │  │  ├─민원(콜센터) 질의응답_다산콜센터_대중교통 안내_Training.zip | 2 MB | 174456
            │  │  │  ├─민원(콜센터) 질의응답_다산콜센터_생활하수도 관련 문의_Training.zip | 2 MB | 174457
            │  │  │  ├─민원(콜센터) 질의응답_다산콜센터_일반행정 문의_Training.zip | 2 MB | 174458
            │  │  │  └─민원(콜센터) 질의응답_다산콜센터_코로나19 관련 상담_Training.zip | 2 MB | 174459
            │  │  └─질병관리본부
            │  │      ├─민원(콜센터) 질의응답_질병관리본부_기타문의_Training.zip | 1 MB | 174461
            │  │      ├─민원(콜센터) 질의응답_질병관리본부_약품식품_Training.zip | 497 KB | 174462
            │  │      ├─민원(콜센터) 질의응답_질병관리본부_요양기관 현황_Training.zip | 789 KB | 174463
            │  │      ├─민원(콜센터) 질의응답_질병관리본부_증상징후_Training.zip | 1 MB | 174464
            │  │      └─민원(콜센터) 질의응답_질병관리본부_진료비정보_Training.zip | 294 KB | 174465

In [1]:
from datasets import load_dataset

# Glob patterns for the zip archives of each split. Only the 금융보험
# directory is referenced here.
data_files = dict(
    train="/data/whisper_testdata/QnA_data/Data/1.Training/라벨링데이터_220121_add/금융보험/*.zip",
    test="/data/whisper_testdata/QnA_data/Data/2.Validation/라벨링데이터_220121_add/금융보험/*.zip",
)

# The JSON files are not laid out as "data": [{data1}, {data2}, ...]; they are
# bare {data1}, {data2}, ... records, so specifying a field such as
# field="data" raises an error.
qna_dataset = load_dataset("json", data_files=data_files)

In [2]:
# Shuffle the training split with a fixed seed, keep the first 1000 rows,
# and display the first five of them.
shuffled_train = qna_dataset["train"].shuffle(seed=10)
qna_dataset_sample = shuffled_train.select(range(1000))
qna_dataset_sample[:5]

{'고객질문(요청)': ['각 범위별로 설명 좀 해주시겠어요?',
  '카드를 해지했는데, 그 해지한 카드 결제계좌에서 카드값이 나갔어요. 뭔 일이죠?',
  '사지절단시는요?',
  '',
  '보험계약대출을 받으려고 하는데 어떻게 해야하나요? '],
 '고객답변': ['', '', '', '', ''],
 '화자': ['고객', '고객', '고객', '상담사', '고객'],
 '상담사의도': ['', '', '', '용종 제거', ''],
 '상담사답변': ['', '', '', '고객님, 본인확인부터 하겠습니다', ''],
 '지식베이스': ['설명,내용', '해지,결제금액', '절단,팔다리', '확인,검증', '보험계약대출,대부'],
 '도메인': ['금융/보험', '금융/보험', '금융/보험', '금융/보험', '금융/보험'],
 '고객의도': ['보험 가입 문의', '카드해지후승인내역', '교보생명', '', 'ATM기 대출'],
 '상담사질문(요청)': ['', '', '', '', ''],
 '카테고리': ['상품 가입 및 해지',
  '잔고 및 거래내역',
  '사고 및 보상 문의',
  '사고 및 보상 문의',
  '이체, 출금, 대출서비스'],
 '문장번호': ['11', '1', '17', '4', '1'],
 'QA': ['Q', 'Q', 'Q', 'A', 'Q'],
 '용어사전': ['범위/영역,설명/내용',
  '카드/전자화폐, 해지/취소, 카드값/결제금액',
  '사지/팔다리',
  '본인/당사자,확인/검증',
  '대출/대부'],
 '대화셋일련번호': ['A8343', 'A18100', 'A19078', 'A8451', 'A6650'],
 '개체명 ': ['범위,설명', '카드, 해지, 결제계좌, 카드값', '사지,절단', '본인,확인', '보험계약 대출']}

In [68]:
# Display the loaded DatasetDict (splits, column names, and row counts).
qna_dataset

DatasetDict({
    train: Dataset({
        features: ['고객질문(요청)', '고객답변', '화자', '상담사의도', '상담사답변', '지식베이스', '도메인', '고객의도', '상담사질문(요청)', '카테고리', '문장번호', 'QA', '용어사전', '대화셋일련번호', '개체명 '],
        num_rows: 323215
    })
    test: Dataset({
        features: ['고객질문(요청)', '고객답변', '화자', '상담사의도', '상담사답변', '지식베이스', '도메인', '고객의도', '상담사질문(요청)', '카테고리', '문장번호', 'QA', '용어사전', '대화셋일련번호', '개체명 '],
        num_rows: 40426
    })
})

In [64]:
# Locate unneeded rows: those whose four utterance fields are all empty
# strings. Only the "train" split is scanned here.
empty_indices = [
    row_idx
    for row_idx, row in enumerate(qna_dataset["train"])
    if row["고객질문(요청)"] + row["고객답변"] + row["상담사답변"] + row["상담사질문(요청)"] == ''
]

print(f"{empty_indices}is empty")

[31271, 31273, 31275, 31277, 31279, 31281, 36214, 38581, 53894, 75159, 75245, 78114, 85492, 85593, 85959, 96786, 122885, 124850, 134709, 134763, 140166, 141265, 149974, 150049, 150605, 158095, 159776, 161812, 162715, 172091, 172092, 173105, 173106, 176234, 176254, 197896, 198473, 199050, 199080, 199127, 207930, 207961, 208685, 208722, 214836, 215479, 215499, 215501, 226742, 226772, 226819, 234179, 234256, 237716, 248037, 251852, 282477, 301541, 317402]is empty


In [65]:
# Print the four utterance fields of every flagged training row so the
# emptiness can be checked by eye.
text_columns = ("고객질문(요청)", "고객답변", "상담사답변", "상담사질문(요청)")
for idx in empty_indices:
    row = qna_dataset["train"][idx]
    print(*(row[col] for col in text_columns))

   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   


In [70]:
# Drop rows that carry no utterance text, judging each row by its own content.
# Filtering by row positions collected from the "train" split would apply
# those same positions to every split of the DatasetDict, removing unrelated
# "test" rows while leaving any empty "test" rows in place.
def _has_utterance(ex):
    """Return True when the four utterance fields of *ex* are not all empty."""
    return ex["고객질문(요청)"] + ex["고객답변"] + ex["상담사답변"] + ex["상담사질문(요청)"] != ''


qna_dataset = qna_dataset.filter(_has_utterance)

Filter:   0%|          | 0/323215 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40426 [00:00<?, ? examples/s]

In [75]:
# Merge the question/answer fields into one text column
def sum_qna(ex):
    """Concatenate the four utterance fields of *ex* into a single "talk" value.

    The fields are joined in the order: customer question, customer answer,
    agent answer, agent question. Returns a one-key dict so the result can be
    added as a new column.
    """
    customer_text = ex["고객질문(요청)"] + ex["고객답변"]
    agent_text = ex["상담사답변"] + ex["상담사질문(요청)"]
    return {"talk": customer_text + agent_text}

# Add the merged "talk" column by applying sum_qna to each row.
qna_dataset = qna_dataset.map(sum_qna)

Map:   0%|          | 0/323156 [00:00<?, ? examples/s]

KeyboardInterrupt: 

# 2. Set up the tokenizer for data preprocessing

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer stored with the "jiwon65/whisper-small_korean-zeroth" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("jiwon65/whisper-small_korean-zeroth")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [79]:
def tokenize_function(ex):
    """Tokenize the "talk" text of *ex* with the module-level tokenizer.

    Truncation is enabled and padding is set to "max_length".
    """
    # NOTE(review): padding="max_length" pads every row up front, while this
    # file also builds a DataCollatorWithPadding for per-batch padding --
    # confirm which of the two is intended.
    talk_text = ex["talk"]
    return tokenizer(talk_text, padding="max_length", truncation=True)

In [80]:
# Tokenize the dataset in batches (tokenize_function receives a batch of rows per call).
tokenized_datasets = qna_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/323156 [00:00<?, ? examples/s]

Map:   0%|          | 0/40418 [00:00<?, ? examples/s]

In [None]:
# Wrap the dataset in a DataLoader
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import DataCollatorWithPadding

# Data collator used for per-batch padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# remove_columns accepts a single column name or ONE list of names, so all of
# the original annotation columns are passed together in one flat list.
# NOTE(review): the raw-text "talk" column is not in this list and therefore
# stays in the dataset -- confirm whether it should also be dropped before
# batching.
tokenized_datasets = tokenized_datasets.remove_columns([
    '고객질문(요청)',
    '고객답변',
    '화자',
    '상담사의도',
    '상담사답변',
    '지식베이스',
    '도메인',
    '고객의도',
    '상담사질문(요청)',
    '카테고리',
    '문장번호',
    'QA',
    '용어사전',
    '대화셋일련번호',
    '개체명 ',  # the trailing space is part of the actual column name
])

# 3. Load model & Training
### 3-1. Load model 

In [81]:
from transformers import AutoModel
# Load pretrained weights from the same checkpoint name the tokenizer uses.
# NOTE(review): AutoModel presumably loads the base model without a task
# head, while compute_metrics in this file expects logits and labels --
# confirm the intended model class.
checkpoint = "jiwon65/whisper-small_korean-zeroth"
model = AutoModel.from_pretrained(checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [91]:
# training hyperparameter
from transformers import TrainingArguments, Trainer
# Outputs go to "checkpoint_dir"; the evaluation strategy is set to "epoch".
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="checkpoint_dir", evaluation_strategy="epoch")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [92]:
# Set up the evaluation metric
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level accuracy metric.

    The prediction for each item is the index of the largest logit along the
    last axis; the result of ``metric.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Train with the Trainer
