In [19]:
import torch
import pandas as pd

from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

In [20]:
#모델 및 토크나이저 로드
checkpoint = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# 데이터셋 로드
train_data = pd.read_csv("datas/ratings_train.txt", names=["id", "document", "label"], skiprows=1, sep="\t", index_col=None)
before_test_data = pd.read_csv("datas/ratings_test.txt", names=["id", "document", "label"], skiprows=1, sep="\t", index_col=None)

train_data = train_data.dropna(axis=0)
before_test_data = before_test_data.dropna(axis=0)

validation_data, test_data  = train_test_split(before_test_data, test_size=0.7, random_state=42)

#data_files = {"train": "datas/ratings_train.txt", "test": "datas/ratings_test.txt"}
datasets = DatasetDict({"train": Dataset.from_pandas(train_data),"validation": Dataset.from_pandas(validation_data),"test": Dataset.from_pandas(test_data)})
#train_datasets = Dataset.from_pandas(train_data)
#test_datasets = Dataset.from_pandas(test_data)
#raw_datasets = Dataset(train_datasets+test_datasets)

In [22]:
datasets['train'][10]

{'id': 9008700,
 'document': '걍인피니트가짱이다.진짜짱이다♥',
 'label': 1,
 '__index_level_0__': 10}

In [23]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [24]:
datasets['train'].features

{'id': Value(dtype='int64', id=None),
 'document': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [25]:
tokenizer_sentences1 = tokenizer(datasets['train']['document'])

In [26]:
inputs = tokenizer("와 진짜 꿀잼 영화")
inputs

{'input_ids': [10278, 23971, 21502, 8174, 10584], 'attention_mask': [1, 1, 1, 1, 1]}

In [27]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['▁와', '▁진짜', '▁꿀', '잼', '▁영화']

In [28]:
def tokenize_function(example):
    return tokenizer(example["document"], truncation=True)

In [29]:
tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/149995 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/14999 [00:00<?, ? examples/s]

Map:   0%|          | 0/34998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 149995
    })
    validation: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 14999
    })
    test: Dataset({
        features: ['id', 'document', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 34998
    })
})

In [30]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["id", 'document', "__index_level_0__"]}
[len(x) for x in samples["input_ids"]]

[11, 18, 12, 17, 34, 28, 9, 55]

In [32]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 55]),
 'attention_mask': torch.Size([8, 55]),
 'labels': torch.Size([8])}

In [33]:
training_args = TrainingArguments("test-trainer")

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.train()

/opt/conda/conda-bld/pytorch_1704987280714/work/aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [148,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1704987280714/work/aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [148,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1704987280714/work/aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [148,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1704987280714/work/aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [148,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1704987280714/work/aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [148,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/cond

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
