In [None]:
!pip install transformers

In [None]:
######## temporarily added
import pandas as pd
import numpy as np
import cv2
# transformers
from transformers import BertTokenizerFast
# torch
import torch
from torch.utils.data import Dataset, DataLoader
# albumentations
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
# sklearn
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
#####
import random
import pandas as pd
import numpy as np
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from transformers import BertTokenizerFast, BertModel

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

from tqdm import tqdm
import warnings
# NOTE(review): this silences every warning, including deprecation notices.
warnings.filterwarnings(action='ignore')

# Seed every RNG this notebook imports (not only torch) from one constant.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Execution environment; selects the set of data paths ('docker' or 'colab').
server = 'colab'

In [None]:
# Resolve the data locations for the selected environment.
# Both branches define the same core names (data_path, mecab_data_path,
# base_dir, mecab_test_data_path) so later cells do not depend on which
# environment was chosen.
if server == 'docker':
  data_path = '/root/share/tour_ai/data/train.csv'
  # TODO: the docker locations below are still empty placeholders.
  mecab_data_path = ''
  base_dir = ''
  test_data_path = ''  # kept for backward compatibility with the old name
  mecab_test_data_path = ''
elif server == 'colab':
  data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/train.csv'
  mecab_data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/mecab_data.csv'
  base_dir = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data'
  mecab_test_data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/mecab_test_data.csv'
else:
  # Fail here instead of with a NameError in a later cell.
  raise ValueError(f"Unknown server {server!r}; expected 'docker' or 'colab'")

In [None]:
# Load the CSV and keep only the image path, class and text columns,
# renamed to the generic names used by the rest of the notebook.
mecab_data = pd.read_csv(mecab_data_path)[['img_path', 'cat3', 'mecab_data']].rename(
    columns={'cat3': 'label', 'mecab_data': 'sentence'}
)

# Label encoding: map each class value to an integer id (fit + transform in one call).
label_encoding = preprocessing.LabelEncoder()
mecab_data['label'] = label_encoding.fit_transform(mecab_data['label'].values)

# Sentence tokenizing: every sentence is encoded up front, padded/truncated to
# 300 tokens and returned as PyTorch tensors.
tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')


def _encode_sentence(text):
  """Tokenize one sentence with the fixed padding/truncation settings."""
  return tokenizer(text, padding='max_length', max_length=300, truncation=True, return_tensors='pt')


mecab_data['sentence'] = mecab_data['sentence'].apply(_encode_sentence)

# Hold out 5% of the rows for validation (fixed random_state for a stable split).
# NOTE(review): the name `train` is rebound to the training function defined in a
# later cell; re-running this cell afterwards rebinds it to the DataFrame again.
train, valid = train_test_split(mecab_data, random_state=42, test_size=0.05)

class CustomDataset(Dataset):
  """Dataset yielding an image, its tokenized sentence and (optionally) a label.

  Args:
    data: DataFrame with 'img_path' and 'sentence' columns, plus a 'label'
      column unless ``infer_yn`` is True.
    transforms: callable invoked as ``transforms(image=img)['image']``
      (albumentations style), or None to return the raw image array.
    base_dir: string prepended to every trimmed image path.
    infer_yn: when True, items are ``(image, sentence)`` and the 'label'
      column is optional.

  Raises:
    KeyError: if a required column is missing from ``data``.
    FileNotFoundError: from ``__getitem__`` when an image cannot be read.
  """

  def __init__(self, data, transforms, base_dir, infer_yn=False):
    # Drop the first character of every path before joining with base_dir
    # (presumably the leading '.' of a relative path -- TODO confirm).
    self.img_paths = data['img_path'].apply(lambda x: x[1:]).values
    self.sentences = data['sentence'].values
    # Inference data has no ground truth, so the label column is optional there.
    if infer_yn and 'label' not in data:
      self.labels = None
    else:
      self.labels = data['label'].values
    self.transforms = transforms
    self.base_dir = base_dir
    self.infer_yn = infer_yn

  def __getitem__(self, index):
    # image data
    img_path = self.base_dir + self.img_paths[index]
    image = cv2.imread(img_path)
    if image is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise FileNotFoundError(f'Could not read image: {img_path}')
    # OpenCV decodes to BGR; the normalization statistics and the pretrained
    # backbone expect RGB channel order.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if self.transforms is not None:
      image = self.transforms(image=image)['image']

    # text data
    sentence = self.sentences[index]

    if self.infer_yn:
      return image, sentence
    return image, sentence, self.labels[index]

  def __len__(self):
    # Image paths exist in both training and inference mode; labels may not.
    return len(self.img_paths)

# Deterministic preprocessing shared by both splits: resize to 224x224,
# normalize with A.Normalize()'s default statistics, convert to a torch tensor.
image_transforms = A.Compose([
    A.Resize(224, 224),
    A.Normalize(),
    ToTensorV2()
])

# Single source of truth for the batch size of both loaders.
BATCH_SIZE = 8

train_dataset = CustomDataset(train, image_transforms, base_dir)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = CustomDataset(valid, image_transforms, base_dir)
# Validation metrics do not depend on sample order, so do not shuffle.
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

In [None]:
class CustomModel(nn.Module):
  """Image + text classifier: EfficientNet-B0 and BERT features, one linear head.

  Args:
    num_classes: number of output classes.

  ``forward`` returns raw logits of shape (batch, num_classes). They must be
  passed to ``nn.CrossEntropyLoss`` as-is, because that loss applies
  log-softmax itself; applying softmax first flattens the gradients.
  Use ``self.softmax(logits)`` when probabilities are needed.
  """

  def __init__(self, num_classes):
    super().__init__()

    # image: pretrained backbone, all weights trainable, head replaced by a
    # 256-dimensional projection.
    self.image_model = models.efficientnet_b0(pretrained=True)
    for params in self.image_model.parameters():
      params.requires_grad = True
    self.image_model.classifier[1] = nn.Linear(in_features=1280, out_features=256)

    # text: pretrained BERT; its pooled output is regularized with dropout.
    self.text_model = BertModel.from_pretrained('kykim/bert-kor-base')
    self.dropout = nn.Dropout(0.3)

    # linear: classification head over concatenated image (256) + text (768) features.
    self.linear = nn.Linear(256+768, num_classes)
    # Kept for callers that want probabilities; not applied inside forward().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, image, text, text_mask):
    """Return class logits for a batch.

    Args:
      image: image batch fed to the image backbone.
      text: token id batch fed to BERT.
      text_mask: attention mask matching ``text``.
    """
    # image result
    image_output = self.image_model(image)

    # text result: second element of the tuple output (pooled representation).
    _, text_output = self.text_model(text, attention_mask=text_mask, return_dict=False)
    text_output = self.dropout(text_output)

    # concat along the feature dimension and classify
    output = torch.cat([image_output, text_output], dim=1)
    return self.linear(output)

In [None]:
def _run_epoch(model, dataloader, loss_func, device, optimizer=None):
  """Run one pass over ``dataloader``; trains when ``optimizer`` is given, else evaluates.

  Returns:
    (summed per-sample loss, number of correct predictions, number of samples).
  """
  is_training = optimizer is not None
  # Switch dropout/batch-norm to the matching mode for this pass.
  model.train(is_training)

  loss_sum, correct, seen = 0.0, 0, 0
  with torch.set_grad_enabled(is_training):
    for image, text, label in tqdm(dataloader):
      image = image.to(device)
      # The batch of encodings carries an extra dimension at index 1
      # (presumably from per-sentence return_tensors='pt' tokenization).
      input_ids = text['input_ids'].squeeze(1).to(device)
      mask = text['attention_mask'].squeeze(1).to(device)
      label = label.to(device)

      output = model(image, input_ids, mask)
      batch_loss = loss_func(output, label)

      if is_training:
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

      # The loss is a batch mean; weight it by the batch size so the epoch
      # average is a true per-sample mean even when the last batch is smaller.
      batch_size = label.size(0)
      loss_sum += batch_loss.item() * batch_size
      correct += (output.argmax(dim=1) == label).sum().item()
      seen += batch_size
  return loss_sum, correct, seen


def train(model, train_dataloader, valid_dataloader, lr, epochs):
  """Train ``model`` with Adam and cross-entropy, printing metrics after every epoch.

  Args:
    model: module called as ``model(image, input_ids, attention_mask)``.
    train_dataloader: yields ``(image, text, label)`` batches, where ``text``
      is a mapping with 'input_ids' and 'attention_mask'.
    valid_dataloader: same batch format, used for evaluation only.
    lr: Adam learning rate.
    epochs: number of passes over ``train_dataloader``.

  Returns:
    None. The model is updated in place and moved to the GPU when available.
  """
  use_cuda = torch.cuda.is_available()
  device = torch.device('cuda' if use_cuda else 'cpu')

  model = model.to(device)
  loss_func = nn.CrossEntropyLoss().to(device)
  optimizer = optim.Adam(model.parameters(), lr = lr)

  for epoch in range(epochs):
    train_loss_sum, train_correct, train_seen = _run_epoch(
        model, train_dataloader, loss_func, device, optimizer)
    val_loss_sum, val_correct, val_seen = _run_epoch(
        model, valid_dataloader, loss_func, device)

    # Normalize by the samples actually seen (not by global dataset objects);
    # max(..., 1) keeps an empty loader from dividing by zero.
    train_loss = train_loss_sum / max(train_seen, 1)
    train_accuracy = train_correct / max(train_seen, 1)
    val_loss = val_loss_sum / max(val_seen, 1)
    val_accuracy = val_correct / max(val_seen, 1)
    print(
      f'Epochs: {epoch + 1} | Train Loss: {train_loss: .3f} \
      | Train Accuracy: {train_accuracy: .3f} \
      | Val Loss: {val_loss: .3f} \
      | Val Accuracy: {val_accuracy: .3f}')

In [None]:
# Hyperparameters for this run.
EPOCHS = 5
lr = 1e-6

# One output per class known to the fitted label encoder.
model = CustomModel(num_classes=len(label_encoding.classes_))

train(
    model=model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    lr=lr,
    epochs=EPOCHS,
)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2017/2017 [1:45:00<00:00,  3.12s/it]
100%|██████████| 107/107 [05:05<00:00,  2.85s/it]


Epochs: 1 | Train Loss:  0.583       | Train Accuracy:  0.227       | Val Loss:  0.574       | Val Accuracy:  0.316


  1%|          | 23/2017 [00:14<20:44,  1.60it/s]


KeyboardInterrupt: ignored

In [None]:
# Destination file for the baseline checkpoint.
path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/baseline_weight.pt'


def save_model(model, path):
  """Serialize ``model``'s state dict to ``path`` and return ``torch.save``'s result."""
  weights = model.state_dict()
  return torch.save(weights, path)

In [None]:
class MultiModalModel(nn.Module):
  """Classifier over image features, text features, or both.

  Args:
    num_classes: number of output classes.
    kind: 'image', 'text' or 'multi_modal'; selects which branches are built
      and used.

  Raises:
    KeyError: if ``kind`` is not one of the supported values.

  ``forward`` returns raw logits of shape (batch, num_classes), the input
  ``nn.CrossEntropyLoss`` expects. Use ``self.softmax(logits)`` when
  probabilities are needed.
  """

  def __init__(self, num_classes, kind):
    super().__init__()
    self.kind = kind
    # Width of the feature vector fed to the head for each kind. Looked up
    # first so an unsupported kind fails before any pretrained weights load.
    kind_parameter_dict = {'image' : 256, 'text' : 768 , 'multi_modal' : 256+768}
    feature_dim = kind_parameter_dict[kind]
    # image
    if kind == 'image' or kind == 'multi_modal':
      self.image_model = models.efficientnet_b0(pretrained=True)
      for params in self.image_model.parameters():
        params.requires_grad = True
      self.image_model.classifier[1] = nn.Linear(in_features=1280, out_features=256)
    # text
    if kind == 'text' or kind == 'multi_modal':
      self.text_model = BertModel.from_pretrained('kykim/bert-kor-base')
      self.dropout = nn.Dropout(0.3)
    # linear
    self.linear = nn.Linear(feature_dim, num_classes)
    # Kept for callers that want probabilities; not applied inside forward().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, image=None, text=None, text_mask=None):
    """Return class logits; inputs of an unused modality may be omitted.

    Args:
      image: image batch (used for 'image' and 'multi_modal').
      text: token id batch (used for 'text' and 'multi_modal').
      text_mask: attention mask matching ``text``.
    """
    features = []

    # image result
    if self.kind in ('image', 'multi_modal'):
      features.append(self.image_model(image))

    # text result: second element of the tuple output (pooled representation).
    if self.kind in ('text', 'multi_modal'):
      _, text_output = self.text_model(text, attention_mask=text_mask, return_dict=False)
      features.append(self.dropout(text_output))

    # concat only when both branches are active
    output = features[0] if len(features) == 1 else torch.cat(features, dim=1)
    return self.linear(output)