In [1]:
!pip install fastapi kaleido python-multipart uvicorn
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib jsonlines
!pip install wandb --upgrade

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd '/content/drive/MyDrive/DATA304'

/content/drive/MyDrive/DATA304


In [16]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

kobart_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v2")
kobart_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v2")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [17]:
import pandas as pd

texts = []
labels = []

df = pd.read_csv('used_mobile_phone.csv')

train_df = df.iloc[:199]
test_df = df.iloc[199:]

texts = train_df['text'].tolist()
labels = train_df['label'].tolist()
labels = [str(int(label)) for label in labels]

In [18]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")

        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(self.labels[idx], max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt").input_ids

        labels = [[(-100 if token == self.tokenizer.pad_token_id else token) for token in label] for label in labels]

        return {"input_ids": inputs.input_ids.squeeze(), "attention_mask": inputs.attention_mask.squeeze(), "labels": torch.tensor(labels[0])}

In [22]:
dataset = CustomDataset(texts, labels, kobart_tokenizer, max_length=128)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    )

trainer = Trainer(
    model=kobart_model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model('./results')

Step,Training Loss
10,8.8068
20,7.3736
30,5.0285
40,3.5599
50,1.8862
60,1.357
70,1.1401
80,1.1165
90,1.2195
100,1.2846


In [27]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, GPT2LMHeadModel
import torch
import jsonlines
from tqdm import tqdm

kobart_checkpoint_path = "./results"

kobart_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v2")
kobart_model = BartForConditionalGeneration.from_pretrained(kobart_checkpoint_path)

for i, eval_input in enumerate(tqdm(test_df['text'].tolist(), total=len(test_df['text']))):
    eval_prompt = f"""휴대폰의 상태를 0, 1, 2점 중 하나로 평가해줘.
0점은 휴대폰이 정상일때, 1점은 휴대폰이 조금 고장났을 때, 2점은 휴대폰이 사용 불가능할 정도로 고장났을 때 붙이면 돼.

{eval_input}
"""

    kobart_inputs = kobart_tokenizer(eval_prompt, return_tensors="pt")
    if "token_type_ids" in kobart_inputs:
        del kobart_inputs["token_type_ids"]
    kobart_outputs = kobart_model.generate(**kobart_inputs, max_length=40, num_return_sequences=1, repetition_penalty=1.5)
    kobart_generated_text = kobart_tokenizer.decode(kobart_outputs[0], skip_special_tokens=True)

    print(kobart_generated_text)

    df.loc[i+199, 'label'] = kobart_generated_text
    print()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
  0%|          | 1/4752 [00:01<2:29:55,  1.89s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 2/4752 [00:03<2:35:33,  1.97s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 3/4752 [00:05<2:34:55,  1.96s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 4/4752 [00:07<2:29:26,  1.89s/it]

1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  0%|          | 5/4752 [00:09<2:23:54,  1.82s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 6/4752 [00:11<2:23:25,  1.81s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 7/4752 [00:12<2:17:38,  1.74s/it]

1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  0%|          | 8/4752 [00:14<2:19:43,  1.77s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 9/4752 [00:16<2:12:14,  1.67s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 10/4752 [00:17<2:06:18,  1.60s/it]

1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  0%|          | 11/4752 [00:19<2:06:44,  1.60s/it]

1 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 12/4752 [00:20<2:03:43,  1.57s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 13/4752 [00:22<2:02:29,  1.55s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 14/4752 [00:23<1:59:38,  1.52s/it]

2 1 0 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 15/4752 [00:25<2:01:22,  1.54s/it]

1 2 0 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 16/4752 [00:26<2:09:25,  1.64s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 17/4752 [00:28<2:10:48,  1.66s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 18/4752 [00:30<2:13:37,  1.69s/it]

2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2



  0%|          | 19/4752 [00:32<2:11:35,  1.67s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 20/4752 [00:33<2:07:33,  1.62s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  0%|          | 21/4752 [00:34<2:02:30,  1.55s/it]

1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  0%|          | 22/4752 [00:36<2:03:42,  1.57s/it]

2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  0%|          | 23/4752 [00:38<2:05:04,  1.59s/it]

2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  1%|          | 24/4752 [00:39<2:06:28,  1.60s/it]

2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  1%|          | 25/4752 [00:41<2:02:47,  1.56s/it]

2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0



  1%|          | 26/4752 [00:42<2:06:29,  1.61s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  1%|          | 27/4752 [00:44<2:08:36,  1.63s/it]

2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2



  1%|          | 27/4752 [00:45<2:11:37,  1.67s/it]


KeyboardInterrupt: ignored