## 데이터셋 다운로드

`sarcasm.json` 데이터셋을 다운로드 받습니다.

In [1]:
import urllib
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

SEED = 123

# 데이터셋 다운로드
url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(url, 'sarcasm.json')

# JSON 파일을 데이터프레임으로 로드
df = pd.read_json('sarcasm.json')
df = df.rename(columns={
    'headline': 'sentence', 
    'is_sarcastic': 'label'
})
df

Unnamed: 0,article_link,sentence,label
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


## 데이터셋 분할

In [2]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=SEED)

In [3]:
# train 데이터셋 출력
train.head()

Unnamed: 0,article_link,sentence,label
7917,https://www.theonion.com/disturbance-of-arafat...,disturbance of arafat's grave casts horrible c...,1
23206,https://www.huffingtonpost.com/entry/15-photos...,15 photos of hot dudes supporting bernie sande...,0
4611,https://www.huffingtonpost.com/entry/illinois-...,6 things you need to know about the nation's s...,0
11937,https://local.theonion.com/really-ugly-shark-t...,really ugly shark tired of being mistaken for ...,1
9334,https://local.theonion.com/friends-wife-encoun...,friend's wife encountered twice a year,1


In [4]:
# test 데이터셋 출력
test.head()

Unnamed: 0,article_link,sentence,label
22288,https://www.huffingtonpost.com/entry/steve-wil...,steve wilson on 'the making of gone with the w...,0
16228,https://local.theonion.com/standards-lowered-f...,standards lowered for second search through fr...,1
4905,https://www.huffingtonpost.comhttp://www.thede...,surgical tech in needle-swap scandal at swedis...,0
8947,https://www.huffingtonpost.com/entry/donald-tr...,ferguson is not among the most dangerous place...,0
3706,https://politics.theonion.com/bill-clinton-res...,bill clinton resting up to sit upright at next...,1


## 토큰화가 적용된 데이터셋

In [6]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset


class TokenDataset(Dataset):
  
    def __init__(self, dataframe, tokenizer_pretrained):
        # sentence, label 컬럼으로 구성된 데이터프레임 전달
        self.data = dataframe        
        # Huggingface 토크나이저 생성
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_pretrained)
  
    def __len__(self):
        return len(self.data)
  
    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['sentence']
        label = self.data.iloc[idx]['label']

        # 토큰화 처리
        tokens = self.tokenizer(
            sentence,                # 1개 문장 
            return_tensors='pt',     # 텐서로 반환
            truncation=True,         # 잘라내기 적용
            padding='max_length',    # 패딩 적용
            add_special_tokens=True  # 스페셜 토큰 적용
        )

        input_ids = tokens['input_ids'].squeeze(0)           # 2D -> 1D
        attention_mask = tokens['attention_mask'].squeeze(0) # 2D -> 1D

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask, 
            'label': torch.tensor(label)
        }

데이터셋 인스턴스 생성

In [7]:
# distilbert-base-uncased 토크나이저 지정
tokenizer_pretrained = 'distilbert-base-uncased'

# train, test 데이터셋 생성
train_data = TokenDataset(train, tokenizer_pretrained)
test_data = TokenDataset(test, tokenizer_pretrained)

## Model

In [8]:
import torch

# device 지정
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:1


In [9]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


# Fine-Tuning을 위한 옵션 지정
training_args = TrainingArguments(
    output_dir='./results',          # 결과 값이 저장될 디렉토리 지정
    num_train_epochs=3,              # 학습 epoch
    per_device_train_batch_size=16,  # training 배치사이즈
    per_device_eval_batch_size=64,   # evaluation 배치사이즈
    warmup_steps=500,                # leaning rate 스케줄러의 웜업 step
    weight_decay=0.01,               # weight decay 강도
    logging_dir='./logs',            # 로그를 저장할 디렉토리
    logging_steps=200,               # 로그 출력 step
)

In [10]:
# pretrained 모델 지정
model_pretrained = 'distilbert-base-uncased'

# 모델 다운로드, num_labels 지정, device 지정
model = AutoModelForSequenceClassification.from_pretrained(model_pretrained, num_labels=2).to(device)

# Trainer 생성 후, model, train, test 데이터셋 지정
trainer = Trainer(
    model=model,                     # 이전에 불러온 허깅페이스 pretrained 모델
    args=training_args,              # 이전에 정의한 training arguments 지정
    train_dataset=train_data,        # training 데이터
    eval_dataset=test_data           # test 데이터
)

# trainer 를 활용한 학습 시작
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

Step,Training Loss
200,0.5127
400,0.2871
600,0.2678
800,0.1614
1000,0.147
1200,0.1482
1400,0.0679
1600,0.0472
1800,0.0449


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1878, training_loss=0.1813284194253631, metrics={'train_runtime': 445.8676, 'train_samples_per_second': 134.778, 'train_steps_per_second': 4.212, 'total_flos': 7960363387435008.0, 'train_loss': 0.1813284194253631, 'epoch': 3.0})

In [11]:
# 학습된 trainer로 예측
predictions = trainer.predict(test_data)
predictions

***** Running Prediction *****
  Num examples = 6678
  Batch size = 128


PredictionOutput(predictions=array([[ 2.9732733, -2.9471958],
       [-4.0222363,  3.6413522],
       [ 3.8347576, -3.318453 ],
       ...,
       [ 2.824299 , -2.4794154],
       [ 3.5981152, -3.2576218],
       [ 4.025952 , -3.6779523]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.3030776381492615, 'test_runtime': 13.6168, 'test_samples_per_second': 490.424, 'test_steps_per_second': 3.892})

In [13]:
# 예측 결과는 label_ids 에 담겨 있음
predictions.label_ids

array([0, 1, 0, ..., 0, 0, 0])

In [12]:
# 평가
accuracy = (test['label'] == predictions.label_ids).mean()
accuracy

1.0