In [None]:
!pip install transformers
!pip install konlpy

In [2]:
import pandas as pd
import numpy as np
import pickle

from transformers import BertTokenizerFast, BertModel
from torchtext import transforms as T
from torch.utils.data import Dataset, DataLoader
from konlpy.tag import Okt

# Selects which set of data paths the next cell uses: 'docker' or 'colab'.
computer = 'colab'

In [3]:
# Resolve the train/test CSV locations for the environment named by `computer`.
if computer == 'docker':
    train_data_path = '/root/share/data/train.csv'
    test_data_path = '/root/share/data/test.csv'
elif computer == 'colab':
    # NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
    train_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/train.csv'
    test_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/test.csv'
else:
    # Fail here with a clear message instead of a NameError on the path
    # variables in a later cell.
    raise ValueError(f"unknown computer {computer!r}; expected 'docker' or 'colab'")

In [4]:
# Load the tokenizer and the BertModel weights from the same
# 'kykim/bert-kor-base' checkpoint so vocabulary and embeddings match.
# The load log reports the `cls.*` weights of the checkpoint as unused; per
# the log message itself this is expected when a BertModel is initialized
# from a checkpoint that was saved with a different architecture.
tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
model = BertModel.from_pretrained('kykim/bert-kor-base')

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Load the raw train/test tables from the paths chosen above.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Keep only training rows that have text. The explicit copy makes the result
# an independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
# NOTE(review): test_data is not filtered the same way — confirm it has no
# missing 'data' values.
train_data = train_data[train_data['data'].notnull()].copy()

In [7]:
# Preview the first rows of the filtered training frame (the output below
# shows its index, category and data columns).
train_data.head()

Unnamed: 0,index,category,data
0,0,2,신혼부부위한 주택정책 보다 보육시설 늘려주세요.. 국민세금으로 일부를 위한 정책펴지...
1,1,0,학교이름에 '남자'도 붙여주세요. 울산여자중학교에 재학중인 학생입니다 최근 양성평등...
2,2,1,"빙상연맹, 대한축구협회등 각종 체육협회의 비리를 철저하게 밝혀주세요.. 최근 동계올..."
3,3,1,"티비 12세,15세 관람가도 연령확인 의무화 하자.. 제기 에전에 티비를 보다가 잠..."
4,4,1,무더운 여름철엔 남성들도 시원한 자율복장을 해야. 무더운 여름철에는 남성들도 노넥타...


In [14]:
class CustomTrainDataset(Dataset):
  """Labelled text dataset whose sentences are encoded once, up front.

  Each value of ``df['data']`` is split with ``tag.morphs``, the pieces are
  re-joined with single spaces, and the resulting string is handed to
  ``tokenizer`` with fixed-length padding and truncation.

  Args:
    df: DataFrame with a ``category`` column (labels) and a ``data`` column
      (raw text). It is only read; the caller's frame is left unchanged, so
      building the dataset again from the same frame gives the same result.
    tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
      max_length=max_length, truncation=True, return_tensors='pt')``.
    tag: Object exposing ``morphs(text)`` returning an iterable of strings
      (presumably a konlpy ``Okt`` instance — confirm at the call site).
    max_length: Padding/truncation length forwarded to ``tokenizer``.
  """

  def __init__(self, df, tokenizer, tag, max_length=300):
    self.labels = list(df['category'])

    # Build the morpheme-joined text in a local list instead of writing it
    # back into df['data']: overwriting the caller's column would make a
    # second construction re-split already processed text.
    # Note: `self.tokenizer` is the morpheme splitter defined below, while the
    # bare name `tokenizer` is the encoder passed in by the caller.
    joined_texts = [' '.join(self.tokenizer(text, tag)) for text in df['data']]

    self.sentence = [
        tokenizer(text, padding='max_length', max_length=max_length,
                  truncation=True, return_tensors='pt')
        for text in joined_texts
    ]

  def tokenizer(self, sentence, tag):
    """Split ``sentence`` into morphemes using ``tag.morphs``."""
    return tag.morphs(sentence)

  def __len__(self):
    """Number of labelled samples."""
    return len(self.labels)

  def get_batch_labels(self, idx):
    """Return the label at ``idx`` wrapped in a NumPy array."""
    return np.array(self.labels[idx])

  def get_batch_sentences(self, idx):
    """Return the pre-computed encoding at ``idx``."""
    return self.sentence[idx]

  def __getitem__(self, idx):
    """Return ``(encoding, label)`` for the sample at ``idx``."""
    batch_sentence = self.get_batch_sentences(idx)
    batch_y = self.get_batch_labels(idx)

    return batch_sentence, batch_y

class CustomTestDataset(Dataset):
  """Unlabelled text dataset whose sentences are encoded once, up front.

  Each value of ``df['data']`` is split with ``tag.morphs``, the pieces are
  re-joined with single spaces, and the resulting string is handed to
  ``tokenizer`` with fixed-length padding and truncation.

  Args:
    df: DataFrame with a ``data`` column (raw text). It is only read; the
      caller's frame is left unchanged, so building the dataset again from
      the same frame gives the same result.
    tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
      max_length=max_length, truncation=True, return_tensors='pt')``.
    tag: Object exposing ``morphs(text)`` returning an iterable of strings
      (presumably a konlpy ``Okt`` instance — confirm at the call site).
    max_length: Padding/truncation length forwarded to ``tokenizer``.
  """

  def __init__(self, df, tokenizer, tag, max_length=300):
    # Build the morpheme-joined text in a local list instead of writing it
    # back into df['data']: overwriting the caller's column would make a
    # second construction re-split already processed text.
    # Note: `self.tokenizer` is the morpheme splitter defined below, while the
    # bare name `tokenizer` is the encoder passed in by the caller.
    joined_texts = [' '.join(self.tokenizer(text, tag)) for text in df['data']]

    self.sentence = [
        tokenizer(text, padding='max_length', max_length=max_length,
                  truncation=True, return_tensors='pt')
        for text in joined_texts
    ]

  def tokenizer(self, sentence, tag):
    """Split ``sentence`` into morphemes using ``tag.morphs``."""
    return tag.morphs(sentence)

  def __len__(self):
    """Number of encoded sentences."""
    return len(self.sentence)

  def get_batch_sentences(self, idx):
    """Return the pre-computed encoding at ``idx``."""
    return self.sentence[idx]

  def __getitem__(self, idx):
    """Return the encoding for the sample at ``idx``."""
    batch_sentence = self.get_batch_sentences(idx)

    return batch_sentence