In [16]:
import pandas as pd
import numpy as np
import pickle
from konlpy.tag import Mecab

from transformers import BertTokenizerFast
from torchtext import transforms as T
from torch.utils.data import Dataset, random_split

computer = 'docker'

In [3]:
if computer=='docker':
    train_data_path = '/root/share/data/train.csv'
    test_data_path = '/root/share/data/test.csv'
elif computer=='colab':
    train_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/train.csv'
    test_data_path = '/content/drive/MyDrive/commit_folder/chungwadae/torch_nlp/data/test.csv'

In [32]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

train_data = train_data[train_data['data'].notnull()]
train, valid = random_split(train_data, [len(train_data)-2000,2000])

train = train_data.iloc[train.indices]
valid = train_data.iloc[valid.indices]

In [33]:
tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
mecab = Mecab()

In [35]:
class CustomTrainDataset(Dataset):
  def __init__(self, df, tokenizer, tag):
    self.labels = [label for label in df['category']]
    df['data'] = df['data'].apply(self.tokenizer, tag=tag)
    df['data'] = df['data'].apply(lambda x: ' '.join(x))
    self.sentence = [
        tokenizer(sentence, padding='max_length', max_length=300, truncation=True, return_tensors='pt')
        for sentence in df['data']            
    ]

  def tokenizer(self, sentence, tag):
    return tag.morphs(sentence)

  def __len__(self):
    return len(self.labels)

  def get_batch_labels(self, idx):
    return np.array(self.labels[idx])

  def get_batch_sentences(self, idx):
    return self.sentence[idx]

  def __getitem__(self, idx):

    batch_sentence = self.get_batch_sentences(idx)
    batch_y = self.get_batch_labels(idx)

    return batch_sentence, batch_y

class CustomTestDataset(Dataset):
  def __init__(self, df, tokenizer, tag):
    df['data'] = df['data'].apply(self.tokenizer, tag=tag)
    df['data'] = df['data'].apply(lambda x: ' '.join(x))
    self.sentence = [
        tokenizer(sentence, padding='max_length', max_length=300, truncation=True, return_tensors='pt')
        for sentence in df['data']            
    ]

  def tokenizer(self, sentence, tag):
    return tag.morphs(sentence)

  def __len__(self):
    return len(self.sentence)

  def get_batch_sentences(self, idx):
    return self.sentence[idx]

  def __getitem__(self, idx):

    batch_sentence = self.get_batch_sentences(idx)

    return batch_sentence

In [36]:
train_dataset = CustomTrainDataset(train, tokenizer, mecab)
valid_dataset = CustomTrainDataset(valid, tokenizer, mecab)
test_dataset = CustomTestDataset(test_data, tokenizer, mecab)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [37]:
with open('/root/share/data/train_dataset.pkl','wb') as f:
    pickle.dump(train_dataset, f)

with open('/root/share/data/valid_dataset.pkl','wb') as f:
    pickle.dump(valid_dataset, f)
    
with open('/root/share/data/test_dataset.pkl','wb') as f:
    pickle.dump(test_dataset, f)    