# CPU
: [Central Processing Unit]

-> 컴퓨터의 중앙처리 장치

-> 모든 컴퓨터 시스템의 핵심(대부분 소프트웨어 처리)

# GPU
: [Graphics Processing Unit]

-> 그래픽 처리 전용 프로세서

-> 그래픽 및 렌더링용으로 개발(딥러닝 및 계산에도 활용)


# TPU
: [Tensor Processing Unit]

-> 구글에서 머신러닝 작업을 위해 설계한 하드웨어 가속기

-> 머신러닝, 딥러닝에 특화된 형태로 설계

---------------------------------
** 차이점 **
- CPU : 순차적 처리
- GPU : 병렬적 처리
- TPU : 행렬 연산

In [None]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import urllib.request
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel

In [None]:
# Download the train and test rating files from the e9t/nsmc GitHub repository
# into the current working directory (each call overwrites an existing file of the same name).
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7c64abaa3f40>)

In [None]:
# Both downloaded files are tab-separated; parse each into its own DataFrame.
train_data = pd.read_csv('ratings_train.txt', sep='\t')
test_data = pd.read_csv('ratings_test.txt', sep='\t')

In [None]:
# Preview the first five rows of the training set.
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
# Row counts of the train / test sets before any cleaning.
print(len(train_data), len(test_data))

150000 50000


In [None]:
# True if any cell of the training DataFrame is missing.
train_data.isnull().values.any()

True

In [None]:
# True if any cell of the test DataFrame is missing.
test_data.isnull().values.any()

True

In [None]:
# Discard every row with a missing value and renumber the index from 0,
# then re-check that no missing values are left.
train_data = train_data.dropna(how='any').reset_index(drop=True)
train_data.isnull().values.any()

False

In [None]:
# Discard every row with a missing value and renumber the index from 0,
# then re-check that no missing values are left.
test_data = test_data.dropna(how='any').reset_index(drop=True)
test_data.isnull().values.any()

False

In [None]:
# Row counts after dropping rows with missing values.
print(len(train_data), len(test_data))

149995 49997


In [None]:
# Load the tokenizer that matches the multilingual cased BERT checkpoint used below;
# the vocabulary files are fetched from the Hugging Face Hub on first use.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# encode() returns the integer ids of the sentence, including the special tokens it adds.
print(tokenizer.encode('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역'))

[101, 9356, 11018, 8996, 31605, 110589, 71568, 118913, 11018, 9576, 119281, 9786, 79940, 23811, 40364, 9520, 23160, 102]


In [None]:
# tokenize() returns the sub-word pieces as strings (no special tokens, no ids).
print(tokenizer.tokenize('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역'))

['보', '##는', '내', '##내', '그대로', '들어', '##맞', '##는', '예', '##측', '카', '##리스', '##마', '없는', '악', '##역']


In [None]:
# Round trip: decoding the encoded ids gives the sentence back, wrapped in the special tokens.
tokenizer.decode(tokenizer.encode('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역'))

'[CLS] 보는 내내 그대로 들어맞는 예측 카리스마 없는 악역 [SEP]'

In [None]:
# Decode the ids one at a time to see which piece each id stands for.
# Each id is wrapped in a list: passing a bare int made decode() treat the token
# string as a sequence of characters and join them with spaces ("h a p p y").
for elem in tokenizer.encode('happy birthday~!'):
  print(tokenizer.decode([elem]))

[ C L S ]
h a p p y
b i r t h d a y
~
!
[ S E P ]


In [None]:
# Decode the ids one at a time to see which sub-word piece each id stands for.
# Each id is wrapped in a list: passing a bare int made decode() treat the token
# string as a sequence of characters and join them with spaces ("# # 는").
for elem in tokenizer.encode('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역'):
  print(tokenizer.decode([elem]))

[ C L S ]
보
# # 는
내
# # 내
그 대 로
들 어
# # 맞
# # 는
예
# # 측
카
# # 리 스
# # 마
없 는
악
# # 역
[ S E P ]


In [None]:
# Id 101 is the first id produced by encode(); decode it as a one-element list
# so the token is printed intact instead of character by character.
print(tokenizer.decode([101]))

[ C L S ]


In [None]:
# Id 102 is the last id produced by encode(); decode it as a one-element list
# so the token is printed intact instead of character by character.
print(tokenizer.decode([102]))

[ S E P ]


In [None]:
# Special tokens as strings, then as vocabulary ids.
# (The id lookup previously failed with a NameError because of the typo
# `tokenizer_sep_token_id`; the attribute is `tokenizer.sep_token_id`.)
print(tokenizer.cls_token, tokenizer.sep_token)
print(tokenizer.cls_token_id, tokenizer.sep_token_id)

[CLS] [SEP]


NameError: name 'tokenizer_sep_token_id' is not defined

In [None]:
# Padding token and its vocabulary id (used below to build attention masks).
print(tokenizer.pad_token, tokenizer.pad_token_id)

[PAD] 0


In [None]:
# Fixed length every encoded sequence is padded or truncated to.
max_seq_len = 128

# padding='max_length' replaces the deprecated pad_to_max_length=True;
# truncation=True cuts inputs longer than max_seq_len.
encoded_result = tokenizer.encode('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역',
                                 max_length=max_seq_len, padding='max_length', truncation=True)
print(encoded_result)

[101, 9356, 11018, 8996, 31605, 110589, 71568, 118913, 11018, 9576, 119281, 9786, 79940, 23811, 40364, 9520, 23160, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# The padded encoding should be exactly max_seq_len ids long.
print(len(encoded_result))

128


In [None]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
  """Encode texts into fixed-length BERT input arrays.

  Args:
    examples: Sized iterable of texts to encode (its `len()` drives the progress bar).
    labels: Iterable of labels, paired with `examples` by position.
    max_seq_len: Length every encoded sequence is padded or truncated to.
    tokenizer: Tokenizer providing `encode(...)` and `pad_token_id`.

  Returns:
    A pair `((input_ids, attention_masks, token_type_ids), data_labels)`.
    The three input arrays are integer NumPy arrays with one row of
    `max_seq_len` entries per example; `data_labels` is an int32 NumPy array.
  """
  input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

  for example, label in tqdm(zip(examples, labels), total=len(examples)):
    # Tokenize and map to ids. padding='max_length' replaces the deprecated
    # pad_to_max_length=True, and truncation is requested explicitly so long
    # texts are cut to max_seq_len without a warning.
    input_id = tokenizer.encode(example, max_length=max_seq_len,
                                padding='max_length', truncation=True)

    # 1 for real tokens, 0 for padding. This counts pad ids, so it relies on
    # the pad id only appearing as trailing padding.
    padding_count = input_id.count(tokenizer.pad_token_id)
    attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
    # Segment ids: every input is a single sentence, so all zeros.
    token_type_id = [0] * max_seq_len

    # Sanity checks; the messages use {} placeholders so the lengths are actually shown.
    assert len(input_id) == max_seq_len, 'Error with Length {} vs {}'.format(len(input_id), max_seq_len)
    assert len(attention_mask) == max_seq_len, 'Error with attention mask length {} vs {}'.format(len(attention_mask), max_seq_len)
    assert len(token_type_id) == max_seq_len, 'Error with token type length {} vs {}'.format(len(token_type_id), max_seq_len)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    data_labels.append(label)

  # Convert the accumulated lists to NumPy arrays.
  input_ids = np.array(input_ids, dtype='int')
  attention_masks = np.array(attention_masks, dtype='int')
  token_type_ids = np.array(token_type_ids, dtype='int')
  # np.int32, not the bare name int32 (which is undefined and raised NameError).
  data_labels = np.asarray(data_labels, dtype=np.int32)

  return (input_ids, attention_masks, token_type_ids), data_labels

In [None]:
# Encode the training reviews; train_X is the (input_ids, attention_masks, token_type_ids) tuple.
train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'],
                                                max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|          | 0/149995 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 149995/149995 [00:36<00:00, 4149.53it/s]


In [None]:
# Encode the test reviews the same way as the training reviews.
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'],
                                                max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 49997/49997 [00:11<00:00, 4427.98it/s]


In [None]:
# Take row 0 of each of the three feature arrays, plus its label, for a visual check.
input_id, attention_mask, token_type_id = (feature[0] for feature in train_X)
label = train_y[0]

print('단어에 대한 정수 인코딩 : ', input_id)
print('어텐션 마스크 : ', attention_mask)
print('세그먼트 인코딩 : ', token_type_id)
print('각 인코딩의 길이 : ', len(input_id))
# Decoding the ids should reproduce the review text plus special/padding tokens.
print('정수 인코딩 복원 : ', tokenizer.decode(input_id))
print('레이블 : ', label)

단어에 대한 정수 인코딩 :  [   101   9519   9074 119005    119    119   9708 119235   9715 119230
  16439  77884  48549   9284  22333  12692    102      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0]
어텐션 마스크 :  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# 리뷰에 대한 분류 처리

In [None]:
# Load the bare BERT encoder; from_pt=True converts the PyTorch checkpoint to TensorFlow weights.
model = TFBertModel.from_pretrained('bert-base-multilingual-cased', from_pt=True)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:
# Three symbolic int32 inputs of length max_seq_len (ids, attention mask, segment ids),
# created in that order and fed to the encoder to inspect its symbolic outputs.
input_ids_layer, attention_masks_layer, token_type_ids_layer = (
    tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32) for _ in range(3)
)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

In [None]:
# Show every field the encoder returns (last_hidden_state, pooler_output, ...).
print(outputs)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [None]:
# Index 0 is last_hidden_state: one hidden vector per token position.
print(outputs[0])

KerasTensor(type_spec=TensorSpec(shape=(None, 128, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model'")


In [None]:
class TFBertForSequenceClassification(tf.keras.Model):
  """Binary classifier: a pretrained BERT encoder followed by one sigmoid unit."""

  def __init__(self, model_name):
    """Load the encoder and create the classification head.

    Args:
      model_name: Name or path of the pretrained BERT checkpoint to load.
    """
    super().__init__()
    # from_pt=True converts the PyTorch checkpoint to TensorFlow weights.
    self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)

    # Single-unit sigmoid head. The first positional argument of TruncatedNormal
    # is `mean`, so the standard deviation of 0.02 must be passed by keyword.
    self.classifier = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        activation='sigmoid',
        name='classifier')

  def call(self, inputs):
    """Return one sigmoid score per example.

    Args:
      inputs: Sequence of three tensors: input ids, attention masks, token type ids.

    Returns:
      The classifier output computed from the encoder's pooled output.
    """
    input_ids, attention_masks, token_type_ids = inputs
    outputs = self.bert([input_ids, attention_masks, token_type_ids])
    # Index 1 is `pooler_output`: the encoder's sentence-level summary vector.
    pooled_output = outputs[1]
    return self.classifier(pooled_output)

In [None]:
# Build the classifier (this rebinds `model`, replacing the bare encoder loaded earlier).
model = TFBertForSequenceClassification('bert-base-multilingual-cased')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Binary cross-entropy pairs with the single sigmoid output of the classifier head.
loss = tf.keras.losses.BinaryCrossentropy()

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:
# Hold out 20% of the training arrays for validation. `validation_split` takes
# a fraction; `validation_data` expects actual validation data, not a float.
model.fit(train_X, train_y, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2

==> 수업이 끝난 후 11시쯤까지 기다려 보았지만... 학습이 도저히 끝날 기미가 보이지 않았습니다....😭

In [None]:
# 학습한 모델로 실제 리뷰 예측해보기(숙제)(긍정/부정)

In [None]:
def predict(review):
  """Classify one review text as positive or negative.

  Uses the module-level `tokenizer`, `max_seq_len` and `model`.

  Args:
    review: Review text to classify.

  Returns:
    '긍정' (positive) if the model's score is at least 0.5, otherwise '부정' (negative).
  """
  # Same encoding as used for training: pad/truncate to max_seq_len.
  # padding='max_length' replaces the deprecated pad_to_max_length=True, and
  # truncation is requested explicitly so long reviews are cut without a warning.
  encoded_review = tokenizer.encode(review, max_length=max_seq_len,
                                    padding='max_length', truncation=True)
  padding_count = encoded_review.count(tokenizer.pad_token_id)
  attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
  token_type_id = [0] * max_seq_len

  # Wrap each sequence in a list to form a batch of one.
  input_ids = np.array([encoded_review], dtype='int32')
  attention_masks = np.array([attention_mask], dtype='int32')
  token_type_ids = np.array([token_type_id], dtype='int32')

  prediction = model.predict([input_ids, attention_masks, token_type_ids])

  # prediction[0][0] is the single score of the single example in the batch.
  if prediction[0][0] >= 0.5:
    return '긍정'
  else:
    return '부정'

In [None]:
# The prediction function is named `predict`; `predict_review` is not defined anywhere.
print(predict("기대하고 갔는데 별로였어용 ㅠㅠ"))
print(predict("이야 이만한 영화 또 없습니다^_^"))