In [34]:
# !pip install transformers

In [23]:
######## 임시 추가 
import pandas as pd
import numpy as np
import cv2
# transformers
from transformers import BertTokenizerFast
# torch
import torch
from torch.utils.data import Dataset, DataLoader
# albumentaion
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
# sklearn
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
#####
import random
import pandas as pd
import numpy as np
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from transformers import AutoModel, AutoTokenizer

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

from tqdm import tqdm
import warnings
warnings.filterwarnings(action='ignore')

torch.manual_seed(1)

server = 'docker'

In [24]:
if server == 'docker':
  data_path = '/root/share/tour_ai/data/train.csv'
  mecab_data_path = '/root/share/tour_ai/data/mecab_data.csv'
  base_dir = '/root/share/tour_ai/data/'
  test_data_path = '/root/share/tour_ai/data/mecab_test_data.csv'
elif server == 'colab':
  data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/train.csv'
  mecab_data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/mecab_data.csv'
  base_dir = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data'
  mecab_test_data_path = '/content/drive/MyDrive/commit_folder/competition/tour_ai/data/mecab_test_data.csv'

In [25]:
# 필요 데이터만 가져오기
mecab_data = pd.read_csv(mecab_data_path)
mecab_data = mecab_data[['img_path','cat3','mecab_data']]
mecab_data.columns = ['img_path','label','sentence']
# label encoding
label_encoding = preprocessing.LabelEncoder()
label_encoding.fit(mecab_data['label'].values)

mecab_data['label'] = label_encoding.transform(mecab_data['label'].values)
# sentence tokenizing
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-small')

mecab_data['sentence'] = mecab_data['sentence'].apply(lambda x: tokenizer(x, padding='max_length', max_length=300, truncation=True, return_tensors='pt'))
# train_test_split
train, valid = train_test_split(mecab_data, test_size=0.05, random_state=42)

In [26]:
class CustomDataset(Dataset):

  def __init__(self, data, infer_yn=False):
    self.sentences = data['sentence'].values
    self.labels = data['label'].values
    self.infer_yn = infer_yn

  def __getitem__(self, index):
    # text data
    sentence = self.sentences[index]
    
    # label
    if self.infer_yn:
      return sentence
    else:
      label = self.labels[index]
      return sentence, label

  def __len__(self):
    return len(self.labels)

In [27]:
train_dataset = CustomDataset(train)
train_dataloader = DataLoader(train_dataset, batch_size = 8, shuffle=True)

valid_dataset = CustomDataset(valid)
valid_dataloader = DataLoader(valid_dataset, batch_size = 8, shuffle=True)

In [28]:
class MultiModalModel(nn.Module):
  def __init__(self, num_classes, kind):
    super(MultiModalModel, self).__init__()
    self.kind = kind
    # image
    if kind == 'image' or kind == 'multi_modal':
      self.image_model = models.efficientnet_b0(pretrained=True)
      for params in self.image_model.parameters():
        params.requires_grad = True
      self.image_model.classifier[1] = nn.Linear(in_features=1280, out_features=256)
    # text
    if kind == 'text' or kind == 'multi_modal':
      self.text_model = AutoModel.from_pretrained("klue/roberta-small")
      self.dropout = nn.Dropout(0.3)
    kind_parameter_dict = {'image' : 256, 'text' : 768 , 'multi_modal' : 256+768}
    # linear
    self.linear = nn.Linear(kind_parameter_dict[kind], num_classes)
    # softmax function
    self.softmax = nn.Softmax()

  def forward(self, text, text_mask):
    # image result


    # text result
    if self.kind == 'text' or self.kind == 'multi_modal':
      _, text_output = self.text_model(text, attention_mask=text_mask, return_dict=False)
      text_output = self.dropout(text_output)

    # concat

    output = self.linear(text_output)
    output = self.softmax(output)
    return output

In [29]:
model = MultiModalModel(len(label_encoding.classes_), kind='text')

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [30]:
def train(model, train_dataloader, valid_dataloader, lr, epochs):
  use_cuda = torch.cuda.is_available()
  device = torch.device('cuda' if use_cuda else 'cpu')

  loss_func = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr = lr)

  if use_cuda:
    model = model.cuda()
    loss_func = loss_func.cuda()
  
  for epoch in range(epochs):
    total_train_accuracy = 0.0
    total_train_loss = 0.0
    for text, label in tqdm(train_dataloader):
      model.zero_grad()
      input_ids = text['input_ids'].squeeze(1).to(device)
      mask = text['attention_mask'].squeeze(1).to(device)
      label = label.to(device)

      output = model(input_ids, mask)

      batch_loss = loss_func(output, label)
      total_train_loss += batch_loss.item()

      accuracy = (output.argmax(dim=1) == label).sum().item()
      total_train_accuracy += accuracy

      batch_loss.backward()
      optimizer.step()

    total_val_accuracy = 0.0
    total_val_loss = 0.0

    model_preds = []
    true_labels = []
    with torch.no_grad():
      for text, label in tqdm(valid_dataloader):
         input_ids = text['input_ids'].squeeze(1).to(device)
         mask = text['attention_mask'].squeeze(1).to(device)
         label = label.to(device)

         output = model(input_ids, mask)

         batch_loss = loss_func(output, label)
         total_val_loss += batch_loss.item()

         accuracy = (output.argmax(dim=1) == label).sum().item()
         total_val_accuracy += accuracy

         model_preds += output.argmax(1).detach().cpu().numpy().tolist()
         true_labels += label.detach().cpu().numpy().tolist()
    test_weighted_f1 = score_function(true_labels, model_preds)
    print(
      f'Epochs: {epoch + 1} | Train Loss: {total_train_loss / len(train_dataset): .3f} \
      | Train Accuracy: {total_train_accuracy / len(train_dataset): .3f} \
      | Val Loss: {total_val_loss / len(valid_dataset): .3f} \
      | Val Accuracy: {total_val_accuracy / len(valid_dataset): .3f} \
      | Val weighted f1 : {test_weighted_f1}')

def score_function(real, pred):
    return f1_score(real, pred, average="weighted")

In [32]:
lr = 1e-4
lr

0.0001

In [36]:
EPOCHS = 1
lr = 0.00001

train(model, train_dataloader, valid_dataloader, lr, EPOCHS)

100%|██████████| 2017/2017 [04:21<00:00,  7.70it/s]
100%|██████████| 107/107 [00:04<00:00, 24.30it/s]

Epochs: 1 | Train Loss:  0.535       | Train Accuracy:  0.597       | Val Loss:  0.534       | Val Accuracy:  0.639       | Val weighted f1 : 0.5285141338152397





In [35]:
torch.save(model.state_dict(), '/root/share/tour_ai/data/model/bert_weight.pt')