In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cpu")

In [3]:
df = pd.read_csv("./data/train.csv")
df.head()

Unnamed: 0,text,label
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",코드2
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,웹
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,코드2
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",코드2
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,코드2


In [4]:
label_dict = {
    '코드1': 0,
    '코드2': 1,
    '웹': 2,
    '이론': 3,
    '시스템 운영': 4,
    '원격': 5
}

df['label'] = df['label'].map(label_dict)

df.head()

Unnamed: 0,text,label
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",1
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,2
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,1
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",1
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,1


In [5]:
import re
import emoji
from soynlp.normalizer import repeat_normalize

pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    x = pattern.sub(' ', x)
    x = emoji.replace_emoji(x, replace='') #emoji 삭제
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [6]:
x_train = [clean(x) for x in df['text'].values]

df['remove_special_sent'] = x_train

df.head()

Unnamed: 0,text,label,remove_special_sent
0,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ...",1,"self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, ..."
1,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...,2,현재 이미지를 여러개 업로드 하기 위해 자바스크립트로 동적으로 폼 여러개 생성하는데...
2,glob.glob(PATH) 를 사용할 때 질문입니다.\n\nPATH에 [ ] 가 ...,1,glob.glob(PATH) 를 사용할 때 질문입니다. PATH에 [ ] 가 포함되...
3,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals...",1,"tmpp = tmp.groupby(by = 'Addr1', as_index=Fals..."
4,filename = TEST_IMAGE + str(round(frame_sec)) ...,1,filename = TEST_IMAGE + str(round(frame_sec)) ...


In [7]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split


data = df[['remove_special_sent', 'label']]
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, random_state=0)

# 내용 확인
print(dataset_train.iloc[0])
print(dataset_valid.iloc[0])

remove_special_sent    new로 새로운 객체를 만들고 그 새로 만들어진 객체에 값을 넣기위해 Person함...
label                                                                  2
Name: 2287, dtype: object
remove_special_sent    실습 피드백 요청합니다. 손해 부분의 경우, 자사 서비스를 사용했을 때 발생하는 손...
label                                                                  3
Name: 1032, dtype: object


In [8]:
print(len(dataset_train))
print(len(dataset_valid))

2964
742


In [9]:
# Setting parameters
max_len = 512
batch_size = 32
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-6

In [10]:
# train_data tokenize

encoded_train = tokenizer(
    dataset_train['remove_special_sent'].tolist(),
    return_tensors='pt',
    max_length=max_len,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# 확인
print( encoded_train[0].tokens )
print( encoded_train[0].ids )
print( encoded_train[0].attention_mask )
print()
print('디코딩 :',tokenizer.decode(encoded_train[0].ids))

['[CLS]', 'new', '##로', '새로운', 'ᄀ', '##ᅢᆨ', '##체를', 'ᄆ', '##ᅡᆫ', '##들', '##고', '그', '새', '##로', 'ᄆ', '##ᅡᆫ', '##들어', '##진', 'ᄀ', '##ᅢᆨ', '##체', '##에', '가', '##ᆹ', '##을', 'ᄂ', '##ᅥ', '##ᇂ', '##기', '##위', '##해', 'person', '##함', '##수', '##에서', 'this', '##를', '이', '##용한', '##게', 'ᄆ', '##ᅡ', '##ᆽ', '##나', '##요', '?', '그', '##러', '##ᆷ', 'person', '##이', '##라는', 'ᄒ', '##ᅡᆷ', '##수는', '가', '##ᆹ', '##을', 'ᄂ', '##ᅥ', '##ᇂ', '##기', '##위', '##한', 'ᄇ', '##ᅩ', '##조', '##역', '##할', '##이', '##라고', '새', '##ᆼ', '##각', '##해', '##도', '될', '##ᄁ', '##ᅡ', '##요', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [11]:
# valid data tokenized

encoded_valid = tokenizer(
    dataset_valid['remove_special_sent'].tolist(),
    return_tensors='pt',
    max_length=max_len,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

print( encoded_valid[0].tokens )
print( encoded_valid[0].ids )
print( encoded_valid[0].attention_mask )
print()
print('디코딩 :',tokenizer.decode(encoded_valid[-1].ids))

['[CLS]', '시', '##ᆯ', '##스', '##ᆸ', 'ᄑ', '##ᅵ', '##드', '##백', 'ᄋ', '##ᅭ', '##청', '##합', '##니다', '.', 'ᄉ', '##ᅩᆫ', '##해', 'ᄇ', '##ᅮ', '##분', '##의', '경우', ',', 'ᄌ', '##ᅡ', '##사', 'ᄉ', '##ᅥ', '##비', '##스를', 'ᄉ', '##ᅡ', '##용', '##해', '##ᆻ을', '때', 'ᄇ', '##ᅡᆯ', '##생', '##하는', 'ᄉ', '##ᅩᆫ', '##해', '##에', '대해', 'ᄌ', '##ᅡᆨ', '##성', '##하는', '것이', '##ᆫ', '##가', '##요', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [12]:
from torch.utils.data import Dataset, DataLoader

# 데이터셋 클래스
class ReviewDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)


# 데이터셋 생성
train_dataset = ReviewDataset(encoded_train, dataset_train['label'].values)
valid_dataset = ReviewDataset(encoded_valid, dataset_valid['label'].values)

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

model = AutoModelForSequenceClassification.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco",
                                                           num_labels=6,
                                                           ignore_mismatched_sizes=True)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at amberoad/bert-multilingual-passage-reranking-msmarco and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
training_args = TrainingArguments(
    output_dir='./temp/electra',
    overwrite_output_dir='True',

    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    logging_dir='./temp/logs',
    logging_steps=log_interval,
    evaluation_strategy="steps",
    eval_steps=log_interval,

    # https://discuss.huggingface.co/t/save-only-best-model-in-trainer/8442/8
    save_total_limit=2,
    save_strategy='no',
    load_best_model_at_end=False,
)

#정확도 측정을 위한 함수 정의
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # average: 'micro', 'macro', 'weighted' or 'samples'
    # 참고 https://aimb.tistory.com/152
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'acc': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

  item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
  3%|▎         | 26/930 [19:33<11:37:47, 46.31s/it]