### XTREME 데이터셋

In [1]:
from datasets import get_dataset_config_names

# List the configuration (subset) names available for the "xtreme" dataset.
subsets = get_dataset_config_names("xtreme")

def find_lan(lan, configs=None):
    """Return the first PAN-X configuration name for language code ``lan``.

    Args:
        lan: Language code that must equal the part of the configuration
            name after the last dot, e.g. ``"ko"`` for ``"PAN-X.ko"``.
        configs: Iterable of configuration names to search. When omitted,
            the module-level ``subsets`` list is used.

    Returns:
        The first matching configuration name, e.g. ``"PAN-X.ko"``.

    Raises:
        IndexError: If no PAN-X configuration matches ``lan``.
    """
    if configs is None:
        configs = subsets
    for config in configs:
        # Compare the whole suffix instead of doing a substring test, so a
        # code can never match a fragment of another configuration name.
        prefix, _, code = config.rpartition(".")
        if "PAN-X" in prefix and code == lan:
            return config
    raise IndexError(f"no PAN-X configuration found for language {lan!r}")

# Resolve the PAN-X configuration name for each language used in this notebook.
language_names = list(map(find_lan, ["ko", "en", "ja", "es"]))
language_names

['PAN-X.ko', 'PAN-X.en', 'PAN-X.ja', 'PAN-X.es']

### 데이터셋 로드 및 전처리

In [None]:
from collections import defaultdict
from datasets import DatasetDict, load_dataset

# Container keyed by language code; each value is a DatasetDict of splits.
panx_ch = defaultdict(DatasetDict)
# Fraction of each language to keep, chosen to create a data imbalance
# between languages similar to real-world data.
down_sample_ratios = [0.5, 0.3, 0.2, 0.1]

# Load each PAN-X configuration and keep a shuffled subset of every split.
for l, down_sample_ratio in zip(language_names, down_sample_ratios):
    # The language code is the part after the last dot ("PAN-X.ko" -> "ko").
    l_name = l.rpartition('.')[2]
    down_sample = load_dataset("xtreme", name=l)

    for split, split_rows in down_sample.items():
        # Number of examples to keep from this split.
        num_datas = int(split_rows.num_rows * down_sample_ratio)
        shuffled_rows = split_rows.shuffle(seed=42)
        panx_ch[l_name][split] = shuffled_rows.select(range(num_datas))

In [3]:
# Pair every language code with the row counts of its splits.
rows_per_language = [(lang, splits.num_rows) for lang, splits in panx_ch.items()]
print(f"""각 언어별 데이터 포인트 수 :
{rows_per_language}""")

각 언어별 데이터 포인트 수 :
[('ko', {'train': 10000, 'validation': 5000, 'test': 5000}), ('en', {'train': 6000, 'validation': 3000, 'test': 3000}), ('ja', {'train': 4000, 'validation': 2000, 'test': 2000}), ('es', {'train': 2000, 'validation': 1000, 'test': 1000})]


In [4]:
# Names of the NER tag classes, read from the feature metadata of the
# Korean training split.
lables = panx_ch["ko"]["train"].features["ner_tags"].feature.names

# Lookup tables between tag index and tag name, one for each direction.
i2l = dict(enumerate(lables))
l2i = {name: index for index, name in enumerate(lables)}

print(i2l, "\n", l2i)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} 
 {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


- 라벨 매핑하기

In [5]:
def ner_tags_str(bs):
    """Convert a batch of NER tag-id sequences into tag-name sequences.

    Args:
        bs: Mapping whose ``"ner_tags"`` entry is a sequence of tag-id
            sequences (one per example).

    Returns:
        A dict with the single key ``"ner_tags_str"`` holding, for each
        example, the tag names looked up in the module-level ``i2l`` table.
    """
    tag_names = []
    for tag_ids in bs["ner_tags"]:
        tag_names.append([i2l[tag_id] for tag_id in tag_ids])
    return {"ner_tags_str": tag_names}

# Add the string-label column to every split of each language, applying
# ner_tags_str in batched mode (batch_size=100).
for l in ["ko", "en", "ja", "es"]:
    language_splits = panx_ch[l]
    panx_ch[l] = language_splits.map(ner_tags_str, batched=True, batch_size=100)

In [None]:
import pandas as pd
# Display one English training example, transposed so that each field
# becomes a row and each token position a column.
pd.DataFrame(panx_ch["en"]["train"][122]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,Jake,McGee,",",current,MLB,player,(,Tampa,Bay,Rays,)
ner_tags,1,2,0,0,3,0,0,3,4,4,0
langs,en,en,en,en,en,en,en,en,en,en,en
ner_tags_str,B-PER,I-PER,O,O,B-ORG,O,O,B-ORG,I-ORG,I-ORG,O


- ko 데이터셋의 NER 개체 개수 확인

In [None]:
from collections import Counter

# Count the entities of each type in every split of the Korean data.
# An entity is counted once, at its "B-" (begin) tag; the type is the part
# of the tag after the dash.
entity_types = ("PER", "ORG", "LOC")
all_count = {}
for k in ["train", "validation", "test"]:
    # Tally once per split. Counting after the sentence loop avoids
    # rescanning the accumulated list for every sentence and guarantees
    # that an empty split still gets an entry (all zeros).
    entity_counter = Counter(
        tag.split("-")[-1]
        for sentence_tags in panx_ch['ko'][k]["ner_tags_str"]
        for tag in sentence_tags
        if tag.startswith("B-")
    )
    # Missing types read as 0 from a Counter, so the list always has
    # three entries in PER, ORG, LOC order.
    all_count[k] = [entity_counter[entity] for entity in entity_types]
all_count

{'train': [4097, 4495, 5845],
 'validation': [2039, 2297, 2976],
 'test': [2116, 2165, 2971]}

In [109]:
# Build the table with entity types as rows, then flip it so that each
# split is a row and each entity type a column.
entity_table = pd.DataFrame(all_count, index=['PER', 'ORG', 'LOC'])
entity_table.T

Unnamed: 0,PER,ORG,LOC
train,4097,4495,5845
validation,2039,2297,2976
test,2116,2165,2971


### XLM-R 토크나이징

In [112]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
xlmr_model_name = "xlm-roberta-base"
# Load the tokenizer that belongs to that checkpoint.
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [194]:
# Join the tokens of one Korean training example into a single string.
sample_tokens = panx_ch['ko']['train'][1266]["tokens"]
' '.join(sample_tokens)

'넘겨주기 무명 용사의 무덤'

In [193]:
# Tokenize the space-joined tokens of one Korean training example.
sample_text = ' '.join(panx_ch['ko']['train'][4]["tokens"])
xlmr_tokenizer(sample_text)


{'input_ids': [0, 11873, 2293, 6, 166637, 71393, 16907, 12286, 80169, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}