In [None]:
!pip install transformers

In [33]:
import pandas as pd
import numpy as np
import cv2
# transformers
from transformers import BertTokenizerFast
# torch
import torch
from torch.utils.data import Dataset, DataLoader
# albumentations
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
# sklearn
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import warnings
# Silence all library warnings so the notebook output stays readable.
# NOTE: this also hides deprecation warnings; narrow the filter when debugging.
warnings.filterwarnings(action='ignore')

# Seed the NumPy and PyTorch global RNGs from one constant so runs are repeatable.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# Execution environment; the path-configuration cell branches on this value
# ('docker' or 'colab').
server = 'colab'

In [91]:
# Paths to the raw training CSV, the mecab-tokenized CSV, the data directory
# (used as the image path prefix) and the test CSV for the selected environment.
if server == 'docker':
  data_path = '/root/share/tour_ai/data/train.csv'
  # TODO: the remaining docker paths have not been filled in yet.
  mecab_data_path = ''
  base_dir = ''
  test_data_path = ''
elif server == 'colab':
  base_dir = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data'
  data_path = f'{base_dir}/train.csv'
  mecab_data_path = f'{base_dir}/mecab_data.csv'
  test_data_path = f'{base_dir}/test.csv'
else:
  # Fail here with a clear message instead of a NameError in a later cell.
  raise ValueError(f"Unknown server {server!r}; expected 'docker' or 'colab'")

In [6]:
# Load the mecab-tokenized CSV, keep only the columns that are needed, and
# rename them to the names used by the rest of the notebook.
column_renames = {'img_path': 'img_path', 'cat3': 'label', 'mecab_data': 'sentence'}
mecab_data = (
    pd.read_csv(mecab_data_path)[list(column_renames)]
    .rename(columns=column_renames)
)

In [11]:
# Replace each class label with its integer id. The fitted encoder remains
# available as `label_encoding` for later use.
label_encoding = preprocessing.LabelEncoder()
mecab_data['label'] = label_encoding.fit_transform(mecab_data['label'].values)

In [14]:
# Tokenize every sentence with the pretrained 'kykim/bert-kor-base' tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')


def _encode_sentence(text):
  """Tokenize one sentence, padded/truncated to 300 tokens, as PyTorch tensors."""
  return tokenizer(
      text,
      padding='max_length',
      max_length=300,
      truncation=True,
      return_tensors='pt',
  )


mecab_data['sentence'] = mecab_data['sentence'].apply(_encode_sentence)

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

In [26]:
# Hold out 5% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train, valid = train_test_split(
    mecab_data,
    test_size=0.05,
    random_state=42,
)

In [57]:
class CustomDataset(Dataset):
  """Dataset yielding an image, its tokenized sentence and (optionally) a label.

  Args:
    data: DataFrame with `img_path` and `sentence` columns, plus a `label`
      column. The `label` column may be absent when `infer_yn` is True.
    transforms: Callable invoked as `transforms(image=...)['image']`
      (albumentations style), or None to return the decoded image unchanged.
    base_dir: Directory prefix prepended to every image path.
    infer_yn: When True, items are `(image, sentence)` without a label.

  Raises:
    FileNotFoundError: From `__getitem__` when an image cannot be read.
  """

  def __init__(self, data, transforms, base_dir, infer_yn=False):
    # Drop the first character of each stored path so it can be appended to
    # base_dir (presumably the leading '.' of a './...' path — TODO confirm).
    self.img_paths = data['img_path'].apply(lambda x: x[1:]).values
    self.sentences = data['sentence'].values
    # Inference data may come without labels; only then is the column optional.
    if infer_yn and 'label' not in data:
      self.labels = None
    else:
      self.labels = data['label'].values
    self.transforms = transforms
    self.base_dir = base_dir
    self.infer_yn = infer_yn

  def __getitem__(self, index):
    # image data
    img_path = self.base_dir + self.img_paths[index]
    image = cv2.imread(img_path)
    if image is None:
      # cv2.imread signals an unreadable/missing file by returning None.
      raise FileNotFoundError(f'Could not read image: {img_path}')
    # OpenCV decodes to BGR; convert to RGB so channel-wise normalization
    # statistics line up with the right channels.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if self.transforms is not None:
      image = self.transforms(image=image)['image']

    # text data
    sentence = self.sentences[index]

    # label
    if self.infer_yn:
      return image, sentence
    return image, sentence, self.labels[index]

  def __len__(self):
    # Use the image paths, which exist in both training and inference mode.
    return len(self.img_paths)

In [78]:
# Image preprocessing pipeline: resize to 128x128, normalize with the
# library's default statistics, then hand the result to ToTensorV2.
transform_steps = [
    A.Resize(height=128, width=128),
    A.Normalize(),
    ToTensorV2(),
]
image_transforms = A.Compose(transform_steps)

In [79]:
# Training batches are reshuffled on every pass over the data.
train_dataset = CustomDataset(train, image_transforms, base_dir)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Validation order does not influence the metrics, so keep it fixed to make
# evaluation runs comparable.
valid_dataset = CustomDataset(valid, image_transforms, base_dir)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

# TODO: build the test dataset once the test data has been prepared, e.g.
# test_dataset = CustomDataset(test, image_transforms, base_dir, infer_yn=True)