In [None]:
!pip install transformers



In [None]:
# @title Huggingface 라이브러리 활용

In [None]:
# @title Tokenizer 응용
from transformers import AutoModel, AutoTokenizer

model_name = 'bert-base-multilingual-cased'

# BertTokenizerFaset class

model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
print(tokenizer.vocab_size)

119547


In [None]:
for i, key in enumerate(tokenizer.get_vocab()):
      print(key)
      if i > 50:
            break

##ënë
縷
##lot
Senato
##lője
Habib
##挤
кĕçĕн
##ഭ
##voren
కచ్చా
瘟
##ыс
Barth
##чиками
Herman
toată
Rochefort
Carson
thắng
ثلاث
##因
##滔
粉
Ecology
Egyesült
magna
Martens
##۷
aici
ħ
##шње
comtat
偈
årsnederbörd
##קרקע
parti
Death
##γο
wa
zapadu
##rná
##씨
летних
븐
retreat
##gune
##eë
祀
##ाव
Adriano
##唄


In [None]:
text = "이순신은 조선 중기의 무신이다"

In [None]:
tokenized_input_text = tokenizer(text, return_tensors='pt')
for key, value in tokenized_input_text.items():
  print("{}:\n\t{}".format(key, value))

input_ids:
	tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
print(tokenized_input_text['input_ids'])
print(tokenized_input_text.input_ids)
print(tokenized_input_text['token_type_ids'])
print(tokenized_input_text.token_type_ids)
print(tokenized_input_text['attention_mask'])
print(tokenized_input_text.attention_mask)

tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    102]])
tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)
input_ids = tokenizer.encode(text)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다']
[101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 102]
[CLS] 이순신은 조선 중기의 무신이다 [SEP]


In [None]:
tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다']
[9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925]
이순신은 조선 중기의 무신이다


In [None]:
tokenized_text = tokenizer.tokenize(text,
                                    add_special_tokens=False,
                                    max_length=5,
                                    truncation=True)
print(tokenized_text)

input_ids = tokenizer.encode(text,
                                    add_special_tokens=False,
                                    max_length=5,
                                    truncation=True)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['이', '##순', '##신', '##은', '조선']
[9638, 119064, 25387, 10892, 59906]
이순신은 조선


In [None]:
print(tokenizer.pad_token)
print(tokenizer.pad_token_id)

tokenized_text = tokenizer.tokenize(text,
                                    add_special_tokens=False,
                                    max_length=20,
                                    padding='max_length')
print(tokenized_text)

input_ids = tokenizer.encode(text,
                                    add_special_tokens=False,
                                    max_length=20,
                              padding='max_length')
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

[PAD]
0
['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
이순신은 조선 중기의 무신이다 [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# @title 새로운 token 추가

text = '깟뻬뜨랑 리불이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여'

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['[UNK]', '리', '##불', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '[UNK]', '냐', '##왜', '##쇼', '[UNK]', '[UNK]']
[100, 9238, 118992, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 100, 9002, 119164, 119060, 100, 100]
[UNK] 리불이 뜨럽거 므리커럭이 [UNK] 냐왜쇼 [UNK] [UNK]


In [None]:
added_token_num= tokenizer.add_tokens(["깟뻬뜨랑", "케쇽", "우뤼갸", "쳥쇼", "섀료"])
print(added_token_num)

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

5
['깟뻬뜨랑', '리', '##불', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '케쇽', '냐', '##왜', '##쇼', '우뤼갸', '쳥쇼', '섀료', '다', '##혀', '##뚜', '##여']
[119547, 9238, 118992, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 119548, 9002, 119164, 119060, 119549, 119550, 119551, 9056, 80579, 118841, 29935]
깟뻬뜨랑 리불이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼 섀료 다혀뚜여


In [None]:
print(tokenizer.vocab_size)

119547


In [None]:
# @title 특정 역할을 위한 special token 추가

text = "[SHKIM]이순신은 조선 중기의 무신이다./[SHKIM]"

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['[', 'SH', '##KI', '##M', ']', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '/', '[', 'SH', '##KI', '##M', ']']
[164, 38702, 59879, 11517, 166, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 120, 164, 38702, 59879, 11517, 166]
[ SHKIM ] 이순신은 조선 중기의 무신이다. / [ SHKIM ]


In [None]:
# @title 특정 역할을 위한 special token 추가

text = "[SHKIM]이순신은 조선 중기의 무신이다./[SHKIM]"

added_token_num += tokenizer.add_special_tokens({"additional_special_tokens":["[SHKIM]","[/SHKIM]"]})

tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
print(tokenized_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['[SHKIM]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '/', '[SHKIM]']
[119552, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 120, 119552]
[SHKIM] 이순신은 조선 중기의 무신이다. / [SHKIM]


In [None]:
print(added_token_num)

7


In [None]:
# @title 자연어처리 task

# Single segment input
single_seg_input = tokenizer("이순신은 조선 중기의 무신이다")

# Multiple segment input
multi_seg_input = tokenizer("이순신은 조선 중기의 무신이다.","그는 임진왜란을 승리로 이끌었다")

print("single segment token (str): {}".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))
print("single segment token (int): {}".format(single_seg_input['input_ids']))
print("single segment token : {}".format(single_seg_input['token_type_ids']))

print()
print("multi segment token (str): {}".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))
print("multi segment token (int): {}".format(multi_seg_input['input_ids']))
print("multi segment token : {}".format(multi_seg_input['token_type_ids']))


single segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '[SEP]']
single segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 102]
single segment token : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

multi segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '[SEP]']
multi segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 102]
multi segment token : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# @title 배열 형태로 입력

tokens = tokenizer(["이순신은 조선 중기의 무신이다.","그는 임진왜란을 승리로 이끌었다"],
                   padding=True)

for i in range(2):
  print("tokens: (int): {}",format(tokens['input_ids'][i]))
  print("tokens: (int): {}",format([tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]]))
  print("tokens: (att_mask): {}",format(tokens['attention_mask'][i]))
  print()

tokens: (int): {} [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]
tokens: (int): {} ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[SEP]']
tokens: (att_mask): {} [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

tokens: (int): {} [101, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 102]
tokens: (int): {} ['[CLS]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '[SEP]']
tokens: (att_mask): {} [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]



In [None]:
# @title BERT 모델 테스트

text = "이순신은 [MASK] 중기의 무신이다"
tokenized_text = tokenizer.tokenize(text)

print(tokenized_text)

['이', '##순', '##신', '##은', '[MASK]', '중', '##기의', '무', '##신', '##이다']


In [None]:
from transformers import pipeline

nlp_fill = pipeline('fill-mask', model=model_name)
nlp_fill("이순신은 [MASK] 중기의 무신이다.")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.8747125267982483,
  'token': 59906,
  'token_str': '조선',
  'sequence': '이순신은 조선 중기의 무신이다.'},
 {'score': 0.06436437368392944,
  'token': 9751,
  'token_str': '청',
  'sequence': '이순신은 청 중기의 무신이다.'},
 {'score': 0.010954885743558407,
  'token': 9665,
  'token_str': '전',
  'sequence': '이순신은 전 중기의 무신이다.'},
 {'score': 0.004647171590477228,
  'token': 22200,
  'token_str': '##종',
  'sequence': '이순신은종 중기의 무신이다.'},
 {'score': 0.003610667772591114,
  'token': 12310,
  'token_str': '##기',
  'sequence': '이순신은기 중기의 무신이다.'}]

In [None]:
tokens_pt = tokenizer('이순신은 조선 중기의 무신이다.', return_tensors='pt')
for key, value in tokens_pt.items():
  print("{}:\n\t{}".format(key, value))

outputs = model(**tokens_pt)
last_hidden_state = outputs.last_hidden_state
pooler_output = outputs.pooler_output

print("\n token wise output: {}, pooled output: {}".format(last_hidden_state.shape, pooler_output.shape))

input_ids:
	tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

 token wise output: torch.Size([1, 13, 768]), pooled output: torch.Size([1, 768])


In [None]:
print(pooler_output)

tensor([[ 2.3151e-01, -8.0099e-02, -1.4249e-01, -1.6500e-01,  4.0654e-02,
         -1.0855e-01, -8.1334e-02,  1.8238e-01, -2.5709e-01,  1.7708e-01,
          5.5793e-02,  2.1646e-01, -2.6028e-01, -1.1805e-01,  1.6879e-01,
          2.2665e-01,  1.6693e-01, -1.4605e-01, -3.3806e-01, -2.5379e-04,
         -9.9437e-01, -3.0201e-01,  7.8168e-02, -2.0071e-01, -2.2053e-02,
          1.8374e-01,  7.8908e-02,  2.7492e-01,  1.5742e-01, -1.8706e-03,
         -1.9677e-01, -9.9644e-01,  5.8323e-01,  2.5800e-01,  1.8723e-01,
         -2.3858e-01,  5.7414e-02,  2.9294e-01,  2.0999e-01, -3.3407e-01,
          9.6668e-02, -1.3142e-01,  1.4946e-01, -2.3270e-01, -1.2149e-01,
         -2.6757e-01, -5.3484e-03,  3.1485e-01, -3.2587e-01, -8.8365e-02,
          2.4172e-01,  1.7982e-01,  3.3085e-01, -2.0571e-01, -2.7982e-02,
          2.8920e-01, -1.2191e-01, -3.7320e-01,  4.6051e-02, -2.3296e-01,
         -2.3272e-01, -4.9194e-02,  9.7081e-02, -8.8512e-02, -5.6786e-02,
         -2.2798e-01,  2.4580e-02, -1.

In [None]:
# @title vocab을 추가했다면 모델의 임베딩 사이즈를 올려줘야됨

print(model.get_input_embeddings())
model.resize_token_embeddings(tokenizer.vocab_size + added_token_num)
print(model.get_input_embeddings())

Embedding(119547, 768, padding_idx=0)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(119554, 768, padding_idx=0)


In [None]:
# @title cls 토큰을 활용한 문장의 유사도 측정

sent1 = tokenizer("오늘 하루 어떻게 보냈나요?", return_tensors="pt")
sent2 = tokenizer("오늘은 어떤 하루를 보내셨나요?", return_tensors="pt")
sent3 = tokenizer("이순신은 조선 중기의 무신이다.", return_tensors="pt")
sent4 = tokenizer("깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여", return_tensors="pt")

outputs = model(**sent1)
sent_1_pooler_output = outputs.pooler_output

outputs = model(**sent2)
sent_2_pooler_output = outputs.pooler_output

outputs = model(**sent3)
sent_3_pooler_output = outputs.pooler_output

outputs = model(**sent4)
sent_4_pooler_output = outputs.pooler_output

In [None]:
from torch import nn

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
print(cos(sent_1_pooler_output, sent_2_pooler_output))
print(cos(sent_2_pooler_output, sent_3_pooler_output))
print(cos(sent_3_pooler_output, sent_4_pooler_output))
print(cos(sent_1_pooler_output, sent_4_pooler_output))

tensor([0.9757], grad_fn=<SumBackward1>)
tensor([0.6075], grad_fn=<SumBackward1>)
tensor([0.5854], grad_fn=<SumBackward1>)
tensor([0.8905], grad_fn=<SumBackward1>)
