In [1]:
import torch
import pandas as pd

from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

In [2]:
#모델 및 토크나이저 로드
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 데이터셋 로드
train_data = pd.read_csv("datas/ratings_train.txt", names=["id", "document", "label"], skiprows=1, sep="\t", index_col=None)
before_test_data = pd.read_csv("datas/ratings_test.txt", names=["id", "document", "label"], skiprows=1, sep="\t", index_col=None)

train_data = train_data.dropna(axis=0)
before_test_data = before_test_data.dropna(axis=0)

validation_data, test_data  = train_test_split(before_test_data, test_size=0.7, random_state=42)

#data_files = {"train": "datas/ratings_train.txt", "test": "datas/ratings_test.txt"}
datasets = DatasetDict({"train": Dataset.from_pandas(train_data),"validation": Dataset.from_pandas(validation_data),"test": Dataset.from_pandas(test_data)})
#train_datasets = Dataset.from_pandas(train_data)
#test_datasets = Dataset.from_pandas(test_data)
#raw_datasets = Dataset(train_datasets+test_datasets)

In [4]:
datasets['train'][10]

{'id': 9008700,
 'document': '걍인피니트가짱이다.진짜짱이다♥',
 'label': 1,
 '__index_level_0__': 10}

In [5]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [6]:
datasets['train'].features

{'id': Value(dtype='int64', id=None),
 'document': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [7]:
tokenizer_sentences1 = tokenizer(datasets['train']['document'])

In [8]:
inputs = tokenizer("와 진짜 꿀잼 영화")
inputs

{'input_ids': [101, 1463, 30012, 100, 100, 1463, 30010, 30025, 30005, 30012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'ᄋ',
 '##ᅪ',
 '[UNK]',
 '[UNK]',
 'ᄋ',
 '##ᅧ',
 '##ᆼ',
 '##ᄒ',
 '##ᅪ',
 '[SEP]']

In [10]:
def tokenize_function(example):
    return tokenizer(example["document"], truncation=True)

In [11]:
tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/149995 [00:00<?, ? examples/s]

Map:   0%|          | 0/14999 [00:00<?, ? examples/s]

Map:   0%|          | 0/34998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 149995
    })
    validation: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14999
    })
    test: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 34998
    })
})

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["id", 'document', "__index_level_0__"]}
[len(x) for x in samples["input_ids"]]

[20, 56, 3, 40, 82, 73, 30, 101]

In [14]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 101]),
 'token_type_ids': torch.Size([8, 101]),
 'attention_mask': torch.Size([8, 101]),
 'labels': torch.Size([8])}

In [15]:
training_args = TrainingArguments("test-trainer")

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [18]:
trainer.train()

Step,Training Loss
500,0.7063
1000,0.6989
1500,0.7008
2000,0.6969
2500,0.6971
3000,0.6969
3500,0.6957
4000,0.6964
4500,0.6984
5000,0.6971


TrainOutput(global_step=56250, training_loss=0.6945938240559896, metrics={'train_runtime': 5922.8096, 'train_samples_per_second': 75.975, 'train_steps_per_second': 9.497, 'total_flos': 2.669769019019694e+16, 'train_loss': 0.6945938240559896, 'epoch': 3.0})

In [19]:
trainer.save_model('./models0505_1/')

In [20]:
predictions = trainer.predict(tokenized_datasets['validation'])
print(predictions)

PredictionOutput(predictions=array([[-0.09416476, -0.08538062],
       [-0.0941634 , -0.08538251],
       [-0.0941646 , -0.08538089],
       ...,
       [-0.09416297, -0.08538339],
       [-0.09416229, -0.08538441],
       [-0.09416123, -0.08538598]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.693030059337616, 'test_runtime': 50.6085, 'test_samples_per_second': 296.373, 'test_steps_per_second': 37.049})


In [21]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [22]:
from datasets import load_metric

metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.514434295619708, 'f1': 0.6793748624257099}