In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [9]:
!unzip open.zip

Archive:  open.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [1]:
import random
import pandas as pd
import numpy as np
import os
import re

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
CFG = {
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':256,
    'SEED':41
}

In [11]:
df=pd.read_csv('train.csv').drop(['ID'], axis=1)
df_copy=df.copy()
df_test=pd.read_csv('test.csv')
df_test_copy=df_test.copy()

In [None]:
df

Unnamed: 0,문장,유형,극성,시제,확실성,label
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
...,...,...,...,...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형,긍정,과거,불확실,사실형-긍정-과거-불확실


In [None]:
for idx, sent in enumerate(df['문장']):
  if re.search('토트넘', sent):
    print(sent, df['label'][idx], '\n')

# 텍스트 전처리

In [None]:
df.describe(exclude='number') #문장 중복 확인

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
count,16541,16541,16541,16541,16541,16541,16541
unique,16541,16506,4,3,3,2,64
top,TRAIN_00000,이들 게임은 국내 구글 플레이 매출 톱10 진입이 예상되는 기대작이다.,사실형,긍정,과거,확실,사실형-긍정-과거-확실
freq,1,2,13558,15793,8032,15192,7113


In [None]:
df.columns

Index(['ID', '문장', '유형', '극성', '시제', '확실성', 'label'], dtype='object')

In [None]:
for col in df.columns[2:-1]:
    print(col, df[col].unique())


유형 ['사실형' '추론형' '예측형' '대화형']
극성 ['긍정' '부정' '미정']
시제 ['현재' '과거' '미래']
확실성 ['확실' '불확실']


In [None]:
len(df[df.duplicated(subset=['문장', 'label'])]) #문장과 레이블이 동시에 중복 확인

31

In [None]:
len(df[df.duplicated(subset=['문장'])]) #문장은 중복됐지만 레이블은 다른 경우 확인(4개)

35

## 중복값 제거

In [12]:
# Remove rows whose sentence AND label are both duplicated.
df = df.drop_duplicates(subset=['문장', 'label']).reset_index(drop=True)

# Manually correct the label of row 14963.
# NOTE: the row number is only valid right after the de-duplication above, so this
# cell must be run exactly once on a freshly loaded `df`.
# `.loc` is used because chained indexing (df[col][row] = value) may assign into a
# temporary copy and leave `df` unchanged.
df.loc[14963, '확실성'] = '확실'
df.loc[14963, 'label'] = '사실형-긍정-현재-확실'

# Remove the remaining duplicated sentences (same text, conflicting label),
# keeping the last occurrence.
df = df.drop_duplicates(subset='문장', keep='last').reset_index(drop=True)

In [None]:
df.describe(exclude='number') #중복여부 확인

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
count,16506,16506,16506,16506,16506,16506,16506
unique,16506,16506,4,3,3,2,64
top,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,과거,확실,사실형-긍정-과거-확실
freq,1,1,13530,15758,8014,15163,7099


In [None]:
for col in df.columns[1:-1]: #데이터 불균형 확인
    print(df[col].value_counts(),'\n')

사실형    13530
추론형     2146
대화형      573
예측형      257
Name: 유형, dtype: int64 

긍정    15758
부정      565
미정      183
Name: 극성, dtype: int64 

과거    8014
현재    6852
미래    1640
Name: 시제, dtype: int64 

확실     15163
불확실     1343
Name: 확실성, dtype: int64 



In [None]:
for col in df.columns[1:-1]: #데이터 불균형 확인
    print(df[col].value_counts(normalize=True),'\n')

## 텍스트 증강

```
- 목표: 수가 적은 카테고리의 텍스트에 역번역(ko -> en -> ko)방식과 RS(Random Swap), RD(Random Deletion)방식을 적용해서 데이터 증강.
```

```
- 보완
  - 번역투(-ㅂ니다.)를 특정해서 분류하므로 번역문을 따로 전처리해야 함.
  - 위 과정이 너무 오래 걸리므로 카테고리 내 문장 수가 극히 적은 [유형: 예측], [극성: 미정] 만 역번역 하기로 함.
  -  RD는 제거해도 의미가 변하지 않는 불용어와 수식어를 임의로 제거하거나 조사를 임의로 제거
  - RS는 처음 또는 끝 단어를 교체하지 않도록 조정. 긴 문장(복문장)의 경우 임의의 단어를 교체할 경우 의미가 크게 달라질 것 같아 긴 문장은 RS에서 제외.
```

### 역번역

In [None]:
import random

In [13]:
def trans_augmentation_preprocess(df, label, label_cat):
  """Pick the rows of one category and clean their sentences before back-translation.

  Args:
    df: frame with a '문장' (sentence) column and the column named by `label`.
    label: name of the column to filter on.
    label_cat: value of `label` whose rows are kept.

  Returns:
    A new frame (the input is not modified) with a fresh 0..n-1 index in which
    parenthesised text has been stripped from '문장' and every sentence containing
    '《' or '『' has been removed.
  """
  # Boolean indexing followed by reset_index yields an independent frame.
  subset = df[df[label] == label_cat].reset_index(drop=True)

  # Strip "(...)" groups (innermost parentheses only).
  subset['문장'] = subset['문장'].map(lambda text: re.sub(r'\([^()]*\)', '', text))

  # Classical book titles such as 《직지심체요절》 are unsuitable for translation,
  # so sentences carrying those brackets are discarded.
  title_marker = re.compile('[《『]')
  is_translatable = subset['문장'].map(lambda text: title_marker.search(text) is None).astype(bool)

  return subset[is_translatable].reset_index(drop=True)

In [14]:
df_train_pred_aug = trans_augmentation_preprocess(df, "유형", "예측형")
df_train_non_aug = trans_augmentation_preprocess(df, "극성", "미정")

https://huggingface.co/facebook/m2m100_1.2B

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_1.2B")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_1.2B").to(device)

In [None]:
from tqdm import tqdm

def trans_augmentation(df, label):
  """Back-translate every sentence of `df` (ko -> en -> ko).

  Relies on the notebook-level `tokenizer`, `model` and `device` objects (the
  M2M100 checkpoint loaded in the previous cell) and on `tqdm` for the progress bar.

  Args:
    df: frame with a '문장' column plus the label columns listed below.
    label: unused; kept so existing calls keep working.

  Returns:
    A copy of `df` reindexed to the columns
    ['번역문', '유형', '극성', '시제', '확실성', 'label'], where '번역문' holds the
    back-translated sentence of the row at the same position.
  """
  new_df = df.copy()
  new_df = new_df.reindex(columns=['번역문', '유형', '극성', '시제', '확실성', 'label'])

  # Language ids do not change between sentences, so look them up once.
  en_token_id = tokenizer.get_lang_id("en")
  ko_token_id = tokenizer.get_lang_id("ko")

  translated = []
  for sent in tqdm(df['문장'], total=len(df)):
    tokenizer.src_lang = "ko"
    encoded_ko = tokenizer(sent, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded_ko, forced_bos_token_id=en_token_id)
    english = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]  # ko -> en

    tokenizer.src_lang = "en"
    encoded_en = tokenizer(english, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=ko_token_id)
    korean = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]  # en -> ko

    translated.append(korean)

  # Assign the whole column at once. The previous per-row chained assignment
  # (new_df['번역문'][ind] = ...) could write to a temporary copy, and it treated the
  # positional counter as an index label.
  new_df['번역문'] = translated

  return new_df


#### [유형: 예측] 역번역

In [None]:
df_train_pred_aug

In [None]:
df_train_pred_aug_ = trans_augmentation(df_train_pred_aug, '유형')

100%|██████████| 257/257 [13:22<00:00,  3.12s/it]


In [None]:
#예측형으로 번역이 안 된 문장 제거
pred_aug=df_train_pred_aug_.drop([4, 5, 6, 10, 17, 20, 24, 35, 46, 56, 80, 81, 101, 104, 105, 117, 121, 140, 143, 153, 160, 166, 170, 173, 190, 200, 203, 209, 214, 239, 242, 243, 247, 251])

In [None]:
pred_aug.reset_index(drop=True, inplace=True)

In [None]:
#시제가 제대로 번역 되었는지 확인
pred_aug[pred_aug['시제']=='과거']#.index

Unnamed: 0,번역문,유형,극성,시제,확실성,label
1,"""당신이 처음부터 OTT 사업을 성장시키고 싶었다면, Cupplay의 'Time t...",예측형,부정,과거,확실,예측형-부정-과거-확실
16,"시스템 반도체에 대한 수요는 2030 년에 3,400에 도달 할 것으로 예상되며, ...",예측형,긍정,과거,확실,예측형-긍정-과거-확실
17,"그러나 행동 심리학자 B. F. Skinner와 상담했다면 성장 과정, 사건의 실제...",예측형,긍정,과거,불확실,예측형-긍정-과거-불확실
31,"뉴욕 타임즈는 ""이번 주 회의는 1994년 이후 가장 큰 금리 인프라를 논의하는 것...",예측형,긍정,과거,확실,예측형-긍정-과거-확실
32,성장률은 ‘정상적인’ 수준으로 예상된다.,예측형,긍정,과거,불확실,예측형-긍정-과거-불확실
44,"몽골 파고 (Mongolia Fago)는 서쪽 0.5 ~ 2.0 미터, 남쪽 0.5...",예측형,긍정,과거,확실,예측형-긍정-과거-확실
58,또한 새로운 코로나 상황이 조직 될 것이라는 신호에 따르면 Bitcoin이 주요 수...,예측형,긍정,과거,불확실,예측형-긍정-과거-불확실
65,"미시시피는 강을 제외한 모든 지역에서 ""행복""의 수준에있을 것으로 예상됩니다.",예측형,긍정,과거,불확실,예측형-긍정-과거-불확실
97,기상 기관은 오늘 아침 서부 중앙 바다에 위치한 고온 압력의 경계에서 국가가 대부분...,예측형,긍정,과거,확실,예측형-긍정-과거-확실
129,2020년에는 한국 경제연구소가 한국의 제4차 산업혁명 분야에서 노동 부족률이 20...,예측형,긍정,과거,불확실,예측형-긍정-과거-불확실


In [None]:
# Rows labelled past tense: rewrite present-tense endings produced by the translator
# into their past-tense forms. The masked `.loc` write replaces the former chained
# assignment (pred_aug['번역문'][i] = ...), which may modify a temporary copy.
past_mask = pred_aug['시제'] == '과거'
past_sentences = pred_aug.loc[past_mask, '번역문']
for present_form, past_form in [
    ('예상된다.', '예상 되었다.'),
    ('것이다.', '것이었다.'),
    ('전망이다.', '전망이었다.'),
    ('예정이다.', '예정이었다.'),
]:
  # regex=False: these are literal substrings, '.' must not act as a wildcard.
  past_sentences = past_sentences.str.replace(present_form, past_form, regex=False)
pred_aug.loc[past_mask, '번역문'] = past_sentences

In [None]:
import random

# Turn the polite "-ㅂ니다." endings produced by the translator into the plain "-다."
# style of the original corpus. The rules are applied in order; later rules repair
# artefacts of earlier ones (e.g. '습니다.' -> '겠다.' followed by '했겠다.' -> '했다.').
for pattern, replacement in [
    (r'입니다[.]$', '이다.'),
    (r'보이다[.]$', '보인다.'),
    (r'됩니다[.]$', '된다.'),
    (r'합니다[.]$', '한다.'),
    (r'습니다[.]$', '겠다.'),
    (r'했겠다[.]$', '했다.'),
]:
  pred_aug['번역문'] = pred_aug['번역문'].map(lambda x, p=pattern, r=replacement: re.sub(p, r, x))

# Randomly (about half of the rows) vary the wording so the augmented sentences are
# not uniform. One random draw per row, in row order, as before. The column is
# rebuilt in one assignment instead of the former chained per-row assignment
# (pred_aug['번역문'][i] = ...), which may write to a temporary copy.
# NOTE(review): no seed is set here, so the result differs between runs.
pred_aug['번역문'] = [
    re.sub(r'예상', '전망', sent) if random.uniform(0, 1) > 0.5 else sent
    for sent in pred_aug['번역문']
]

pred_aug['번역문'] = [
    re.sub(r'것으로 전망된다[.]', '전망이다.', sent) if random.uniform(0, 1) > 0.5 else sent
    for sent in pred_aug['번역문']
]

# Repair awkward endings created by the substitutions above.
for pattern, replacement in [
    (r'전망이[ ]있겠다[.]$', '전망이 있다.'),
    (r'예측하고[ ]있겠다[.]$', '예측하고 있다.'),
    (r'서 전망이다[.]$', '설 전망이다.'),
    (r'전망되지[ ]않겠다[.]$', '전망되지는 않는다.'),
]:
  pred_aug['번역문'] = pred_aug['번역문'].map(lambda x, p=pattern, r=replacement: re.sub(p, r, x))

In [None]:
pred_aug.to_csv('pred_aug.csv', index=False, encoding='utf-8-sig')

#### [극성: 미정] 역번역

In [None]:
df_train_non_aug_ = trans_augmentation(df_train_non_aug, '극성')

100%|██████████| 183/183 [07:53<00:00,  2.59s/it]


In [None]:
#번역 이상한 것 제거
non_aug=df_train_non_aug_.drop([4, 7, 8, 13, 15, 21, 25, 27, 29, 34, 36, 37, 38, 39, 41, 42, 44, 48, 50, 51, 53, 59, 60, 61, 63, 64, 66, 67, 68, 75, 76, 81, 86, 89, 91, 99, 100, 103, 104, 105, 108, 109, 110, 111, 113, 114, 115, 131, 133, 135, 138, 137, 140, 141, 142, 144, 145, 146, 148, 149, 166, 168, 171, 174, 175, 176, 178, 179, 180, 181]) #인간지능

In [None]:
df_train_non_aug[df_train_non_aug['시제']=='과거']

In [None]:
non_aug.reset_index(drop=True, inplace=True)

In [None]:
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'입니다[.]$', '이다.', x))
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'보이다[.]$', '보인다.', x)) #보입니다. -> 보이다. -> 보인다.
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'됩니다[.]$', '된다.', x))
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'합니다[.]$', '한다.', x))
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'습니다[.]$', '다.', x))
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'습니까[?]$', '나요?', x))
non_aug['번역문']=non_aug['번역문'].map(lambda x: re.sub(r'미칩니[ ]까[?]["]$', '미치나요?', x))


In [None]:
non_aug.to_csv('non_aug.csv', index=False, encoding='utf-8-sig')

#### 역번역 증강문 합치기

In [None]:
pred_aug=pd.read_csv('pred_aug.csv')
non_aug=pd.read_csv('non_aug.csv')

In [None]:
pred_aug=pred_aug.rename(columns = {'번역문': '문장'})
non_aug=non_aug.rename(columns = {'번역문': '문장'})

In [None]:
trans_aug=pd.concat([pred_aug, non_aug], axis=0).reset_index(drop=True)

In [None]:
trans_aug=trans_aug.drop_duplicates()
trans_aug.to_csv('trans_aug.csv', index=False, encoding='utf-8-sig')

## 증강 데이터 불러오기, Under sampling 및 레이블 인코딩

In [2]:
df_train_aug = pd.read_csv('df_train_aug2.csv')
df_val_aug = pd.read_csv('df_val_aug2.csv')

In [None]:
df_train = pd.read_csv('df_train.csv')
df_val = pd.read_csv('df_val.csv')

In [None]:
df_train.유형.value_counts(), df_train_aug.유형.value_counts()

(사실형    10851
 추론형     1722
 대화형      480
 예측형      372
 Name: 유형, dtype: int64,
 사실형    26627
 추론형     5912
 대화형     2118
 예측형     1843
 Name: 유형, dtype: int64)

In [None]:
df_train.극성.value_counts(), df_train_aug.극성.value_counts()

In [None]:
df_train.시제.value_counts(), df_train_aug.시제.value_counts()

In [None]:
df_train.확실성.value_counts(), df_train_aug.확실성.value_counts()

In [None]:
df_val_aug.duplicated().sum()

In [None]:
df_train_aug = df_train_aug.sample(frac=1)
df_val_aug = df_val_aug.sample(frac=1)

In [None]:
len(df_val_aug)

7946

In [None]:
df_train_aug.index

Int64Index([19864,   799, 29485, 16796, 20458, 14073, 19907, 17694,  1456,
            19153,
            ...
            12458, 24897, 12647, 16765,  9606, 29310,    72, 30544, 15299,
            19933],
           dtype='int64', length=30708)

In [3]:
import random
def under_sampling(df, drop_threshold=0.4, seed=42):
  """Randomly thin out the dominant '긍정' + '확실' rows.

  One uniform draw in [0, 1) is made for every row, in row order, after seeding
  Python's global `random` module with `seed`. A row is dropped when its draw is
  greater than `drop_threshold` AND its '극성' is '긍정' AND its '확실성' is '확실'.
  All other rows are kept.

  Args:
    df: frame with '극성' and '확실성' columns. It is not modified.
    drop_threshold: draws above this value mark a candidate row for removal.
    seed: value passed to `random.seed` (this reseeds the global generator).

  Returns:
    A new frame containing the surviving rows with a fresh 0..n-1 index.
  """
  random.seed(seed)
  # Draw for every row, including rows that can never be dropped, so the random
  # sequence matches a row-by-row loop.
  draws = [random.uniform(0, 1) for _ in range(len(df))]

  is_majority = ((df['극성'] == '긍정') & (df['확실성'] == '확실')).tolist()

  # Select by position: this works with duplicate index labels and avoids the
  # quadratic cost of dropping rows one at a time.
  kept_positions = [
      pos for pos, (draw, majority) in enumerate(zip(draws, is_majority))
      if not (draw > drop_threshold and majority)
  ]

  return df.iloc[kept_positions].reset_index(drop=True)

df_train_aug_under = under_sampling(df_train_aug)
df_val_aug_under = under_sampling(df_val_aug)

In [None]:
df_val_aug_under

In [None]:
df_train_aug.유형.value_counts(normalize=True), df_train_aug_under.유형.value_counts(normalize=True)

In [None]:
df_train_aug.극성.value_counts(normalize=True), df_train_aug_under.극성.value_counts(normalize=True)

In [None]:
df_train_aug.시제.value_counts(normalize=True), df_train_aug_under.시제.value_counts(normalize=True)

In [None]:
df_train_aug.확실성.value_counts(normalize=True), df_train_aug_under.확실성.value_counts(normalize=True)

In [None]:
def label_enc(df):
  """Integer-encode the four label columns of `df` in place.

  A separate LabelEncoder is fitted on each of '유형', '극성', '시제' and '확실성',
  and the column is overwritten with the encoded values.

  Returns:
    The fitted encoders as a tuple (type, polarity, tense, certainty), so that
    predictions can later be mapped back with `inverse_transform`.
  """
  fitted = []
  for column in ("유형", "극성", "시제", "확실성"):
    encoder = preprocessing.LabelEncoder()
    df[column] = encoder.fit_transform(df[column].values)
    fitted.append(encoder)

  type_le, polarity_le, tense_le, certainty_le = fitted
  return type_le, polarity_le, tense_le, certainty_le


In [None]:
len(df_train_aug)

30708

In [None]:
# Remember where the train rows end so the split below stays correct even when the
# train frame changes size (e.g. after under-sampling); previously this was the
# hard-coded literal 30708.
n_train_rows = len(df_train_aug)

# Encode train and validation together so both share one label -> integer mapping.
df_aug = pd.concat([df_train_aug.reset_index(drop=True), df_val_aug.reset_index(drop=True)])

type_le, polarity_le, tense_le, certainty_le = label_enc(df_aug)

# Positional split back into the two parts.
df_train_aug = df_aug.iloc[:n_train_rows]
df_val_aug = df_aug.iloc[n_train_rows:]

In [None]:
type_le, polarity_le, tense_le, certainty_le = label_enc(df)

# 토큰화 및 모델 불러오기

## 데이터셋 분리

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

In [None]:
def make_dataset(df):
  """Convert a DataFrame into a `Dataset`.

  The frame is first turned into a {column name: list of values} dict, which is
  then passed to `Dataset.from_dict`.
  """
  return Dataset.from_dict(df.to_dict("list"))

#ds = make_dataset(df)

In [None]:
#ds_train = make_dataset(df_train_aug_under)
#ds_val = make_dataset(df_val_aug_under)
ds_train = make_dataset(df_train_aug)
ds_val = make_dataset(df_val_aug)

In [None]:
ds_test = make_dataset(df_test)

In [None]:
ds_val

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성'],
    num_rows: 7946
})

## 토큰화 CLS 토큰 hidden state 만들기

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoModel, AutoTokenizer
model_ckpt = "snunlp/KR-BERT-char16424"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to(device)


In [5]:
encoded_text = tokenizer('치료제가 없다고 치료법이 없는 것은 아니다')
tokenizer.convert_ids_to_tokens(encoded_text.input_ids)

['[CLS]',
 '치료',
 '##제',
 '##가',
 '없다고',
 '치료',
 '##법',
 '##이',
 '없는',
 '것은',
 '아니다',
 '[SEP]']

In [None]:
from transformers import AutoConfig

In [None]:
config=AutoConfig.from_pretrained(model_ckpt)

In [None]:
config

BertConfig {
  "_name_or_path": "snunlp/KR-BERT-char16424",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 16424
}

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
model

In [None]:
tokenizer.vocab

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.model_input_names

In [None]:
def tokenize(batch):
  """Tokenize the '문장' entries of `batch` with the notebook-level `tokenizer`.

  Padding and truncation are enabled; token type ids are not returned.
  """
  sentences = batch['문장']
  return tokenizer(
      sentences,
      padding=True,
      truncation=True,
      return_token_type_ids=False,
  )

def encode(dataset):
  """Apply `tokenize` to `dataset` via `map` and return the mapped dataset.

  `batch_size=None` is passed together with `batched=True`.
  """
  return dataset.map(tokenize, batched=True, batch_size=None)


In [None]:
#ds_encoded = encode(ds)

In [None]:
ds_train_encoded = encode(ds_train)
ds_val_encoded = encode(ds_val)


In [None]:
ds_test_encoded = encode(ds_test)

In [None]:
ds_test_encoded

Dataset({
    features: ['ID', '문장', 'input_ids', 'attention_mask'],
    num_rows: 7090
})

In [None]:
ds_val_encoded

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'input_ids', 'attention_mask'],
    num_rows: 7946
})

In [None]:
def extract_hidden_states(batch):
  """Run the notebook-level `model` on a batch and return the position-0 hidden state.

  Only the entries of `batch` whose key is listed in `tokenizer.model_input_names`
  are moved to `device` and fed to the model. The forward pass runs under
  `torch.no_grad()`.

  Returns:
    {"hidden_state": numpy array} holding the last-layer hidden state at sequence
    position 0 (the [CLS] position) for every item in the batch.
  """
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)

  with torch.no_grad():
    outputs = model(**model_inputs)
    last_hidden_state = outputs.last_hidden_state

  first_token_state = last_hidden_state[:, 0]
  return {"hidden_state": first_token_state.cpu().numpy()}


In [None]:
#hidden = hidden.train_test_split(test_size=0.2, seed=1)
#train_hidden = hidden['train']
#val_hidden = hidden['test']

In [None]:
ds_train_encoded.set_format("torch", columns=['input_ids', 'attention_mask', '유형', '극성', '시제', '확실성'])
ds_val_encoded.set_format("torch", columns=['input_ids', 'attention_mask', '유형', '극성', '시제', '확실성'])


In [None]:
ds_train_encoded

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'input_ids', 'attention_mask'],
    num_rows: 30708
})

In [None]:
train_hidden = ds_train_encoded.map(extract_hidden_states, batched=True)
val_hidden = ds_val_encoded.map(extract_hidden_states, batched=True)

In [None]:
ds_test_encoded.set_format("torch", columns=['input_ids', 'attention_mask'])
test_hidden = ds_test_encoded.map(extract_hidden_states, batched = True)

In [None]:
train_hidden

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 30708
})

In [None]:
val_hidden

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 7946
})

In [None]:
train_hidden.save_to_disk("train_hidden_aug")
val_hidden.save_to_disk("val_hidden_aug")
#test_hidden.save_to_disk("test_hidden")

In [None]:
!zip -r /content/train_hdn.zip /content/train_hidden/
!zip -r /content/val_hdn.zip /content/val_hidden/
!zip -r /content/test_hdn.zip /content/test_hidden/

  adding: content/train_hidden_aug/ (stored 0%)
  adding: content/train_hidden_aug/data-00000-of-00001.arrow (deflated 31%)
  adding: content/train_hidden_aug/state.json (deflated 42%)
  adding: content/train_hidden_aug/dataset_info.json (deflated 75%)


In [None]:
!zip -r /content/train_hdn_aug.zip /content/train_hidden_aug/
!zip -r /content/val_hdn_aug.zip /content/val_hidden_aug/

In [None]:
!zip -r /content/train_hdn_aug_3000.zip /content/train_hidden_aug_30000/
!zip -r /content/val_hdn_aug_8000.zip /content/val_hidden_aug_8000/

  adding: content/train_hidden_aug_30000/ (stored 0%)
  adding: content/train_hidden_aug_30000/data-00000-of-00001.arrow (deflated 37%)
  adding: content/train_hidden_aug_30000/state.json (deflated 42%)
  adding: content/train_hidden_aug_30000/dataset_info.json (deflated 74%)
  adding: content/val_hidden_aug_8000/ (stored 0%)
  adding: content/val_hidden_aug_8000/data-00000-of-00001.arrow (deflated 26%)
  adding: content/val_hidden_aug_8000/state.json (deflated 43%)
  adding: content/val_hidden_aug_8000/dataset_info.json (deflated 74%)


#### logistic regression(base line)

In [None]:
def preprocess(df):
  """Normalise the '문장' column of `df` in place (returns None).

  Every number becomes a single '1', every run of Latin letters becomes a single
  'a', and every character other than spaces, Hangul syllables, digits and Latin
  letters is removed.

  Punctuation is stripped BEFORE the runs are collapsed, so "0.75" becomes "1" and
  "B.F." becomes "a". Collapsing first left "11" / "aa" behind, which only
  disappeared when the cell was run a second time. The function is now idempotent.
  """
  def _normalise(text):
    # 1) keep only spaces, digits, Latin letters and Hangul syllables
    text = re.sub(r'[^ \da-zA-Z가-힣]', '', text)
    # 2) collapse each digit run to '1'
    text = re.sub(r'[\d]+', '1', text)
    # 3) collapse each Latin-letter run to 'a'
    return re.sub(r'[a-zA-Z]+', 'a', text)

  df['문장'] = df['문장'].map(_normalise)

In [None]:
preprocess(df)
preprocess(df_test)

In [None]:
df

Unnamed: 0,문장,유형,극성,시제,확실성,label
0,1포인트 금리 인상은 1년 이후 1년 만에 처음이다,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,이어 앞으로 전문가들과 함께 1주 단위로 상황을 재평가할 예정이라며 그 이전이라도 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,정부가 고유가 대응을 위해 1월부터 연말까지 유류세 인하 폭을 1에서 1까지 확대한다,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,서울시는 올해 1월 즉시 견인 유예시간 1분을 제공하겠다고 밝혔지만 하루 만에 차도...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다,사실형,긍정,현재,확실,사실형-긍정-현재-확실
...,...,...,...,...,...,...
16501,이에 따라 대형 콘서트부터 야외 페스티벌 실내 공연 등 연이어 오픈 소식이 들려오고 있다,사실형,긍정,현재,확실,사실형-긍정-현재-확실
16502,신동덤은 신비한 동물사전과 해리 포터 시리즈를 잇는 마법 어드벤처물로 전편에 이어 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16503,수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목 어깨 팔꿈치 등 허...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16504,김금희 소설가는 계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 고...,사실형,긍정,과거,확실,사실형-긍정-과거-확실


In [None]:
def tokenize(batch):
  """Tokenize batch['문장'] with the notebook-level `tokenizer`.

  Sequences are padded and truncated; token type ids are omitted from the result.
  """
  tokenizer_options = dict(padding=True, truncation=True, return_token_type_ids=False)
  return tokenizer(batch['문장'], **tokenizer_options)

def encode(dataset):
  """Map `tokenize` over `dataset` (batched=True, batch_size=None) and return the result."""
  return dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
ds_encoded = encode(ds)

In [None]:
ds_encoded

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'label', 'input_ids', 'attention_mask'],
    num_rows: 16506
})

In [None]:
def extract_hidden_states(batch):
    """Return the position-0 last-layer hidden state for every item in `batch`.

    Entries of `batch` named in `tokenizer.model_input_names` are moved to
    `device` and passed to the notebook-level `model` without gradient tracking.
    The result is a dict {"hidden_state": numpy array}.
    """
    accepted_names = tokenizer.model_input_names
    inputs = {}
    for key, value in batch.items():
        if key in accepted_names:
            inputs[key] = value.to(device)

    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    # Position 0 of the sequence axis is the [CLS] token.
    cls_state = last_hidden_state[:, 0]
    return {"hidden_state": cls_state.cpu().numpy()}

In [None]:
ds_encoded.set_format("torch", columns=["input_ids", "attention_mask"])

In [None]:
ds_hidden = ds_encoded.map(extract_hidden_states, batched=True)

In [None]:
ds_hidden

In [None]:
ds_hidden['문장'][0]

In [None]:
ds_hidden.save_to_disk("ds_hidden")

In [None]:
!zip -r /content/ds_hidden.zip /content/ds_hidden/

In [None]:
!unzip ds_hidden.zip

In [None]:
from datasets import load_from_disk
ds_hidden = load_from_disk("/content/content/ds_hidden")

In [None]:
ds_hidden.set_format(type='pandas')

In [None]:
df_hidden = ds_hidden[:]

In [None]:
df_hidden

In [None]:
type_le, polarity_le, tense_le, certainty_le = label_enc(df_hidden)

In [None]:
df_type_hidden = df_hidden[['문장', '유형', 'hidden_state']]
df_polar_hidden = df_hidden[['문장', '극성', 'hidden_state']]
df_tense_hidden = df_hidden[['문장', '시제', 'hidden_state']]
df_cert_hidden = df_hidden[['문장', '확실성', 'hidden_state']]

In [None]:
def to_ds(df):
  """Turn a DataFrame into a `Dataset` and split it 80/20.

  Returns the object produced by `train_test_split(test_size=0.2, seed=1)`.
  """
  column_lists = df.to_dict("list")  # frame -> {column: list of values}
  full_dataset = Dataset.from_dict(column_lists)
  return full_dataset.train_test_split(test_size=0.2, seed=1)

In [None]:
ds_type_hidden = to_ds(df_type_hidden)
ds_polar_hidden = to_ds(df_polar_hidden)
ds_tense_hidden = to_ds(df_tense_hidden)
ds_cert_hidden = to_ds(df_cert_hidden)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

def ds_split(ds, label):
  """Pull features and targets out of a train/test split as numpy arrays.

  Args:
    ds: mapping with "train" and "test" entries, each providing a "hidden_state"
      column and the column named by `label`.
    label: name of the target column.

  Returns:
    (X_train, X_valid, y_train, y_valid), where the "test" part supplies the
    validation arrays.
  """
  train_part = ds["train"]
  valid_part = ds["test"]

  features = [np.array(part["hidden_state"]) for part in (train_part, valid_part)]
  targets = [np.array(part[label]) for part in (train_part, valid_part)]

  return features[0], features[1], targets[0], targets[1]

def lr(X_train, X_valid, y_train, y_valid):
  """Fit a logistic-regression baseline and print its validation score.

  The classifier is created with max_iter=3000, fitted on the training arrays, and
  the value of `score(X_valid, y_valid)` is printed.

  Returns:
    The fitted LogisticRegression instance.
  """
  classifier = LogisticRegression(max_iter=3000)
  classifier.fit(X_train, y_train)

  validation_score = classifier.score(X_valid, y_valid)
  print(validation_score)

  return classifier

In [None]:
X_type_train, X_type_val, y_type_train, y_type_val = ds_split(ds_type_hidden, '유형')
X_polar_train, X_polar_val, y_polar_train, y_polar_val = ds_split(ds_polar_hidden, '극성')
X_tense_train, X_tense_val, y_tense_train, y_tense_val = ds_split(ds_tense_hidden, '시제')
X_cert_train, X_cert_val, y_cert_train, y_cert_val = ds_split(ds_cert_hidden, '확실성')

In [None]:
lr_type = lr(X_type_train, X_type_val, y_type_train, y_type_val)
lr_polar = lr(X_polar_train, X_polar_val, y_polar_train, y_polar_val)
lr_tense = lr(X_tense_train, X_tense_val, y_tense_train, y_tense_val)
lr_cert = lr(X_cert_train, X_cert_val, y_cert_train, y_cert_val)

In [None]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_type_train, y_type_train)
dummy_clf.score(X_type_val, y_type_val)

In [None]:
!unzip test_hdn.zip

In [None]:
from datasets import load_from_disk
test_hidden = load_from_disk("/content/content/test_hidden")

In [None]:
test_hidden['hidden_state']

Dataset({
    features: ['ID', '문장', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 7090
})

In [None]:
type_preds = lr_type.predict(test_hidden['hidden_state'])
polarity_preds = lr_polar.predict(test_hidden['hidden_state'])
tense_preds = lr_tense.predict(test_hidden['hidden_state'])
certainty_preds = lr_cert.predict(test_hidden['hidden_state'])

In [None]:
type_preds = list(type_preds)
polarity_preds = list(polarity_preds)
tense_preds = list(tense_preds)
certainty_preds = list(certainty_preds)

In [None]:
type_preds = type_le.inverse_transform(type_preds)
polarity_preds = polarity_le.inverse_transform(polarity_preds)
tense_preds = tense_le.inverse_transform(tense_preds)
certainty_preds = certainty_le.inverse_transform(certainty_preds)

In [None]:
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append(type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred)

In [None]:
submit = pd.read_csv('sample_submission.csv')
submit['label'] = predictions

In [None]:
submit

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실
...,...,...
7085,TEST_7085,사실형-긍정-현재-확실
7086,TEST_7086,추론형-긍정-현재-확실
7087,TEST_7087,사실형-긍정-미래-확실
7088,TEST_7088,사실형-긍정-미래-확실


In [None]:
submit.to_csv('submit_lr_bs.csv', index=False, encoding='utf-8-sig')

## CustomModel

In [None]:
!unzip train_hdn.zip
!unzip val_hdn.zip


In [None]:
!unzip test_hdn.zip

In [None]:
from datasets import load_from_disk

In [None]:

train_hidden = load_from_disk("/content/content/train_hidden")
val_hidden = load_from_disk("/content/content/val_hidden")

In [None]:
test_hidden = load_from_disk("/content/content/test_hidden")

In [None]:
train_hidden['문장'][0]

In [None]:
train_hidden['input_ids'][0]

In [None]:
train_hidden['hidden_state'][0]

In [None]:
val_hidden['유형'][0]

In [None]:
test_hidden

In [None]:
from datasets import Dataset, DatasetDict, load_dataset


train_hidden.set_format(type='pandas')
val_hidden.set_format(type='pandas')
train_df=train_hidden[:]
val_df=val_hidden[:]

df = pd.concat([train_df, val_df], axis=0).sample(frac=1).reset_index(drop=True)

In [None]:
def to_ds(df):
  """Convert a DataFrame into a `Dataset` (no train/test split is performed here)."""
  column_lists = df.to_dict("list")  # frame -> {column: list of values}
  return Dataset.from_dict(column_lists)

#ds=to_ds(df)

In [None]:
from typing import Optional, Sequence
from torch import Tensor

https://github.com/HaloKim/Competitions/blob/main/%5BDacon%5D%EB%AC%B8%EC%9E%A5%EC%9C%A0%ED%98%95%EB%B6%84%EB%A5%98AI%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/740/fold.ipynb

In [None]:
class FocalLoss(nn.Module):
    """ Focal Loss, as described in https://arxiv.org/abs/1708.02002.
    It is essentially an enhancement to cross entropy loss and is
    useful for classification tasks when there is a large class imbalance.
    x is expected to contain raw, unnormalized scores for each class.
    y is expected to contain class labels.
    Shape:
        - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0.
        - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0.
    """

    def __init__(self,
                 alpha: Optional[Tensor] = None,
                 gamma: float = 0.,
                 reduction: str = 'mean',
                 ignore_index: int = -100):
        """Constructor.
        Args:
            alpha (Tensor, optional): Weights for each class. Defaults to None.
            gamma (float, optional): A constant, as described in the paper.
                Defaults to 0.
            reduction (str, optional): 'mean', 'sum' or 'none'.
                Defaults to 'mean'.
            ignore_index (int, optional): class label to ignore.
                Defaults to -100.
        Raises:
            ValueError: if `reduction` is not 'mean', 'sum' or 'none'.
        """
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                'Reduction must be one of: "mean", "sum", "none".')

        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.reduction = reduction

        # alpha is applied through NLLLoss's per-class weight; reduction is done
        # manually in forward() after the focal term has been applied.
        self.nll_loss = nn.NLLLoss(
            weight=alpha, reduction='none', ignore_index=ignore_index)

    def __repr__(self):
        arg_keys = ['alpha', 'gamma', 'ignore_index', 'reduction']
        arg_vals = [self.__dict__[k] for k in arg_keys]
        arg_strs = [f'{k}={v!r}' for k, v in zip(arg_keys, arg_vals)]
        arg_str = ', '.join(arg_strs)
        return f'{type(self).__name__}({arg_str})'

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        """Compute the focal loss of logits `x` against class labels `y`.

        Targets equal to `ignore_index` are excluded. When every target is
        ignored, a zero scalar is returned.
        """
        if x.ndim > 2:
            # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C)
            c = x.shape[1]
            x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c)
            # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,)
            y = y.view(-1)

        unignored_mask = y != self.ignore_index
        y = y[unignored_mask]
        x = x[unignored_mask]
        if len(y) == 0:
            # Nothing left to score. Summing the (empty) masked logits gives a zero
            # that has x's device and dtype and is still attached to the autograd
            # graph, so a following backward() works. A fresh torch.tensor(0.)
            # would be a CPU float32 constant with no graph.
            return x.sum()

        # compute weighted cross entropy term: -alpha * log(pt)
        # (alpha is already part of self.nll_loss)
        log_p = F.log_softmax(x, dim=-1)
        ce = self.nll_loss(log_p, y)

        # get true class column from each row
        # (row indices are created on x's device to avoid a host/device mix)
        all_rows = torch.arange(len(x), device=x.device)
        log_pt = log_p[all_rows, y]

        # compute focal term: (1 - pt)^gamma
        pt = log_pt.exp()
        focal_term = (1 - pt)**self.gamma

        # the full loss: -alpha * ((1 - pt)^gamma) * log(pt)
        loss = focal_term * ce

        if self.reduction == 'mean':
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()

        return loss


def focal_loss(alpha: Optional[Sequence] = None,
               gamma: float = 0.,
               reduction: str = 'mean',
               ignore_index: int = -100,
               device='cpu',
               dtype=torch.float32) -> FocalLoss:
    """Build a FocalLoss, converting the class weights to a tensor first.

    Args:
        alpha (Sequence, optional): Per-class weights. A non-Tensor value is
            wrapped with torch.tensor; the result is moved to `device` and cast
            to `dtype`. Left as None when not given.
        gamma (float, optional): Focusing constant from the paper. Defaults to 0.
        reduction (str, optional): 'mean', 'sum' or 'none'. Defaults to 'mean'.
        ignore_index (int, optional): Class label to ignore. Defaults to -100.
        device (str, optional): Device for `alpha`. Defaults to 'cpu'.
        dtype (torch.dtype, optional): dtype for `alpha`.
            Defaults to torch.float32.
    Returns:
        A FocalLoss object
    """
    class_weights = alpha
    if class_weights is not None:
        as_tensor = class_weights if isinstance(class_weights, Tensor) else torch.tensor(class_weights)
        class_weights = as_tensor.to(device=device, dtype=dtype)

    return FocalLoss(alpha=class_weights,
                     gamma=gamma,
                     reduction=reduction,
                     ignore_index=ignore_index)
        
def compute_metrics(pred):
    """Average the weighted F1 score over the four label heads.

    Args:
        pred: object exposing `label_ids` and `predictions`. Head `i` is read as
            `label_ids[::, i]` (true classes) and `predictions[i]` (predicted
            classes).
            # NOTE(review): predictions[i] is passed straight to f1_score, so it
            # presumably already holds class ids rather than logits; confirm.

    Returns:
        dict with the single key 'f1-sum'. Despite the key name, the value is the
        MEAN of the four weighted F1 scores.
    """
    labels = pred.label_ids
    preds = pred.predictions

    # The unused FocalLoss instance and `focal` list were removed: they were never
    # read, and the instance's local name shadowed the focal_loss() factory.
    f1 = [
        f1_score(y_true=labels[::, i], y_pred=preds[i], average='weighted')
        for i in range(4)
    ]
    return {
        'f1-sum': sum(f1)/4
    }

In [None]:
class CustomModel_a(nn.Module):
    """Multi-head classifier on top of a pre-computed sentence embedding.

    One shared block (Linear -> BatchNorm1d -> LeakyReLU) maps the input to 512
    features. Four independent Dropout + Linear heads then produce logits for type
    (4 classes), polarity (3), tense (3) and certainty (2).
    """

    def __init__(self, input_dim=768):
        """
        Args:
            input_dim: size of the input feature vector. Defaults to 768.
        """
        super(CustomModel_a, self).__init__()
        self.feature_extract= nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU()
        )

        self.type_classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(in_features=512, out_features=4),
        )
        self.polarity_classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(in_features=512, out_features=3),
        )
        self.tense_classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(in_features=512, out_features=2 + 1),
        )
        self.certainty_classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(in_features=512, out_features=2),
        )

    def forward(self, x):
        """Return (type, polarity, tense, certainty) logits for a batch `x`.

        The previous forward() called self.feature_extract_first/_second/_third/
        _forth, none of which are created in __init__, so every call raised
        AttributeError. All heads now read the single shared `feature_extract`
        output, which matches the layers this class actually defines.
        """
        features = self.feature_extract(x)

        type_output = self.type_classifier(features)
        polarity_output = self.polarity_classifier(features)
        tense_output = self.tense_classifier(features)
        certainty_output = self.certainty_classifier(features)

        return type_output, polarity_output, tense_output, certainty_output

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train a four-head classifier and restore its best-validation weights.

    Each batch loss is the equally weighted (0.25 each) sum of four focal
    losses. After every epoch the model is evaluated with ``validation``;
    whenever the validation loss improves, a copy of the weights is kept.

    Args:
        model: Module returning ``(type, polarity, tense, certainty)`` logits.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(type_label, polarity_label,
            tense_label, certainty_label, hidden_state)`` batches.
        val_loader: Iterable with the same batch layout, used for validation.
        scheduler: Optional scheduler stepped with the validation loss
            (e.g. ``ReduceLROnPlateau``); may be ``None``.
        device: Device on which the model and the batches are placed.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch with
        the lowest validation loss, or ``None`` if no epoch ever produced a
        validation loss below the initial sentinel (e.g. the loss was NaN).
    """
    model.to(device)

    criterion = {
        'type' : FocalLoss().to(device),
        'polarity' : FocalLoss().to(device),
        'tense' : FocalLoss().to(device),
        'certainty' : FocalLoss().to(device)
    }

    best_loss = 999999
    # Snapshot of the best weights. Keeping `best_model = model` only stored a
    # reference, so the "best" model was always the last-epoch model.
    best_state = None

    # range(1, EPOCHS + 40) runs EPOCHS + 39 epochs; the offset is kept as-is.
    for epoch in range(1, CFG['EPOCHS']+40):
        model.train()
        train_loss = []
        for type_label, polarity_label, tense_label, certainty_label, hidden_state in tqdm(train_loader):
            type_label = type_label.to(device)
            polarity_label = polarity_label.to(device)
            tense_label = tense_label.to(device)
            certainty_label = certainty_label.to(device)
            hidden_state = hidden_state.to(device)

            optimizer.zero_grad()

            type_logit, polarity_logit, tense_logit, certainty_logit = model(hidden_state)

            loss = 0.25 * criterion['type'](type_logit, type_label) + \
                    0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
                    0.25 * criterion['tense'](tense_logit, tense_label) + \
                    0.25 * criterion['certainty'](certainty_logit, certainty_label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_type_f1, val_polarity_f1, val_tense_f1, val_certainty_f1 = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{val_type_f1:.5f}] 극성 F1 : [{val_polarity_f1:.5f}] 시제 F1 : [{val_tense_f1:.5f}] 확실성 F1 : [{val_certainty_f1:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)

        if best_loss > val_loss:
            best_loss = val_loss
            # Clone every tensor so later optimizer steps cannot overwrite
            # the snapshot (state_dict() returns references to live tensors).
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_state is None:
        return None

    # Restore in place so both the returned object and the caller's own
    # reference to `model` carry the best-validation weights.
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Evaluate the model on a validation loader.

    Args:
        model: Module returning ``(type, polarity, tense, certainty)`` logits.
        val_loader: Iterable yielding ``(type_label, polarity_label,
            tense_label, certainty_label, hidden_state)`` batches.
        criterion: Mapping with the keys ``'type'``, ``'polarity'``,
            ``'tense'`` and ``'certainty'``, each a callable loss.
        device: Device on which tensors are placed before the forward pass.

    Returns:
        tuple: ``(mean_batch_loss, type_f1, polarity_f1, tense_f1,
        certainty_f1)`` where the loss is the mean of the per-batch losses and
        every F1 is the weighted F1 over all validation samples.
    """
    model.eval()

    tasks = ('type', 'polarity', 'tense', 'certainty')
    batch_losses = []
    predicted = {task: [] for task in tasks}
    expected = {task: [] for task in tasks}

    with torch.no_grad():
        for type_label, polarity_label, tense_label, certainty_label, hidden_state in tqdm(iter(val_loader)):
            targets = {
                'type': type_label.to(device),
                'polarity': polarity_label.to(device),
                'tense': tense_label.to(device),
                'certainty': certainty_label.to(device),
            }
            hidden_state = hidden_state.to(device)

            type_logit, polarity_logit, tense_logit, certainty_logit = model(hidden_state)
            logits = {
                'type': type_logit,
                'polarity': polarity_logit,
                'tense': tense_logit,
                'certainty': certainty_logit,
            }

            # Equal 0.25 weighting per task, accumulated in task order.
            loss = sum(0.25 * criterion[task](logits[task], targets[task]) for task in tasks)
            batch_losses.append(loss.item())

            for task in tasks:
                predicted[task].extend(logits[task].argmax(1).detach().cpu().numpy().tolist())
                expected[task].extend(targets[task].detach().cpu().numpy().tolist())

    scores = [f1_score(expected[task], predicted[task], average='weighted') for task in tasks]
    type_f1, polarity_f1, tense_f1, certainty_f1 = scores

    return np.mean(batch_losses), type_f1, polarity_f1, tense_f1, certainty_f1

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
  """Map-style dataset backed by a pandas frame taken from ``dataset``.

  The wrapped object is switched to pandas output format and fully
  materialised via ``dataset[:]``. Items are either the four labels plus the
  hidden-state tensor (when a '유형' column is present) or the hidden-state
  tensor alone.
  """

  def __init__(self, dataset):
    # set_format is called on the object passed in, then the whole table is read.
    dataset.set_format(type = 'pandas')
    self.df = dataset[:]

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    frame = self.df

    # Unlabelled data (no '유형' column): return only the feature tensor.
    if '유형' not in frame.columns:
      return torch.tensor(frame['hidden_state'][index])

    labels = tuple(frame[column][index] for column in ('유형', '극성', '시제', '확실성'))
    hidden_state = torch.tensor(frame['hidden_state'][index])
    return (*labels, hidden_state)


### k-fold

In [None]:
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Number of cross-validation folds.
k = 5
# Only n_splits is passed, so every other KFold option keeps its default.
kf = KFold(n_splits=k)

In [None]:
# Iterator of (train_indices, val_indices) pairs over `ds`; it is consumed as
# folds are drawn from it, so re-run this cell to start again at the first fold.
kf_generator = kf.split(ds)

In [None]:
# Draw the next fold from the generator. Each re-run of this cell advances to
# the following fold and raises StopIteration once all folds are exhausted.
train_index, val_index = next(kf_generator)

In [None]:
# Build the train / validation datasets for the fold selected above.
dataset_t = MyDataset(ds.select_columns(['유형', '극성', '시제', '확실성', 'hidden_state']).select(train_index))
dataset_v = MyDataset(ds.select_columns(['유형', '극성', '시제', '확실성', 'hidden_state']).select(val_index))

# Training batches are shuffled (as in the train-val split section) so the
# batch composition changes between epochs; validation keeps a fixed order.
# The batch size comes from the shared CFG instead of a repeated literal.
train_loader = DataLoader(dataset_t, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(dataset_v, batch_size=CFG['BATCH_SIZE'], shuffle=False)

# No model.eval() here: train() switches the model to training mode itself at
# the start of every epoch, so a prior eval() call had no effect.
model = CustomModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate after `patience` epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)


### train-val split

In [None]:
# Show the dataset summary (column names and row count) for the training split.
train_hidden

Dataset({
    features: ['문장', '유형', '극성', '시제', '확실성', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 13204
})

In [None]:
# Materialise both splits as pandas DataFrames.
train_hidden.set_format(type='pandas')
val_hidden.set_format(type='pandas')

train_hidden_df = train_hidden[:]
val_hidden_df = val_hidden[:]

# Remember where the training rows end so the combined frame can be split back
# exactly. A hard-coded row count silently puts rows in the wrong split (or
# leaves the validation frame empty) whenever the dataset size changes.
n_train = len(train_hidden_df)

hidden_df=pd.concat([train_hidden_df, val_hidden_df])

# NOTE(review): label_enc is defined elsewhere; it presumably encodes the four
# label columns of hidden_df in place and returns the fitted encoders — confirm.
type_le, polarity_le, tense_le, certainty_le = label_enc(hidden_df)

# Positional slices: the first n_train rows are the original training split.
train_hidden_df = hidden_df[:n_train]
val_hidden_df = hidden_df[n_train:]

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

# Convert both DataFrames with to_ds (defined elsewhere in the notebook) and
# rebind train_hidden / val_hidden to the results.
train_hidden = to_ds(train_hidden_df)
val_hidden = to_ds(val_hidden_df)

In [None]:
# Expose the token inputs and the four label columns as torch tensors on both splits.
torch_columns = ['input_ids', 'attention_mask', '유형', '극성', '시제', '확실성']
for split in (train_hidden, val_hidden):
    split.set_format("torch", columns=torch_columns)

In [None]:
# Wrap the label columns and the cached hidden states for the classifier heads.
dataset_t = MyDataset(train_hidden.select_columns(['유형', '극성', '시제', '확실성', 'hidden_state']))
dataset_v = MyDataset(val_hidden.select_columns(['유형', '극성', '시제', '확실성', 'hidden_state']))

# Shuffle only the training data. validation() averages per-batch losses, so a
# shuffled validation loader changes which samples fall into the smaller final
# batch and makes the reported validation loss vary between passes.
# The batch size comes from the shared CFG instead of a repeated literal.
train_loader = DataLoader(dataset_t, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(dataset_v, batch_size=CFG['BATCH_SIZE'], shuffle=False)


In [None]:
# No model.eval() here: train() switches the model to training mode itself at
# the start of every epoch, so a prior eval() call had no effect.
model = CustomModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate after `patience` epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

In [None]:
# No model_a.eval() here: train() switches the model to training mode itself
# at the start of every epoch, so a prior eval() call had no effect.
model_a = CustomModel_a()
optimizer = torch.optim.Adam(params = model_a.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate after `patience` epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

# NOTE: infer_model is rebound by every training cell; the last one run wins.
infer_model = train(model_a, optimizer, train_loader, val_loader, scheduler, device)

In [None]:
# No model_b.eval() here: train() switches the model to training mode itself
# at the start of every epoch, so a prior eval() call had no effect.
model_b = CustomModel_b()
optimizer = torch.optim.Adam(params = model_b.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate after `patience` epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

# NOTE: infer_model is rebound by every training cell; the last one run wins.
infer_model = train(model_b, optimizer, train_loader, val_loader, scheduler, device)

In [None]:
# No model_c.eval() here: train() switches the model to training mode itself
# at the start of every epoch, so a prior eval() call had no effect.
model_c = CustomModel_c()
optimizer = torch.optim.Adam(params = model_c.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate after `patience` epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

# NOTE: infer_model is rebound by every training cell; the last one run wins.
infer_model = train(model_c, optimizer, train_loader, val_loader, scheduler, device)

### inference

In [None]:
# Batch size used for test-time inference.
batch_size = 256

# Only the 'hidden_state' column is selected, so MyDataset yields bare feature
# tensors (no labels) for the test set.
dataset_test = MyDataset(test_hidden.select_columns(['hidden_state']))

# Shuffling stays off so predictions come back in the dataset's row order.
test_loader = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)


In [None]:
def inference(model, test_loader, device):
    """Predict class indices for every batch of an unlabelled loader.

    Args:
        model: Module returning ``(type, polarity, tense, certainty)`` logits.
        test_loader: Iterable yielding hidden-state batches (no labels).
        device: Device on which the model and the batches are placed.

    Returns:
        tuple: Four lists of predicted class indices, in the order
        ``(type_preds, polarity_preds, tense_preds, certainty_preds)``.
    """
    model.to(device)
    model.eval()

    # One accumulator per task, kept in the same order as the model outputs.
    collected = ([], [], [], [])

    with torch.no_grad():
        for hidden_state in tqdm(test_loader):
            type_logit, polarity_logit, tense_logit, certainty_logit = model(hidden_state.to(device))
            batch_logits = (type_logit, polarity_logit, tense_logit, certainty_logit)

            for bucket, logit in zip(collected, batch_logits):
                # argmax over dim 1 picks the highest-scoring class per sample.
                bucket.extend(logit.argmax(1).detach().cpu().numpy().tolist())

    type_preds, polarity_preds, tense_preds, certainty_preds = collected
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [None]:
# Predict encoded class indices for the test set with `model`.
# NOTE(review): `model` is whichever object the most recently run training
# cell bound to that name, and the `infer_model` returned by train() is not
# used here — confirm this is the intended model.
type_preds, polarity_preds, tense_preds, certainty_preds = inference(model, test_loader, device)

In [None]:
# Predict with the three model variants as well.
# NOTE(review): the *_a / *_b / *_c results are not referenced by the later
# cells visible in this section (the submission is built from the un-suffixed
# predictions) — confirm whether an ensembling step is missing.
type_preds_a, polarity_preds_a, tense_preds_a, certainty_preds_a = inference(model_a, test_loader, device)
type_preds_b, polarity_preds_b, tense_preds_b, certainty_preds_b = inference(model_b, test_loader, device)
type_preds_c, polarity_preds_c, tense_preds_c, certainty_preds_c = inference(model_c, test_loader, device)

In [None]:
# Map the predicted class indices back to the original label values using the
# encoders returned by label_enc.
# NOTE: each *_preds name is overwritten with its decoded version, so this
# cell is not safe to run twice without re-running inference first.
type_preds = type_le.inverse_transform(type_preds)
polarity_preds = polarity_le.inverse_transform(polarity_preds)
tense_preds = tense_le.inverse_transform(tense_preds)
certainty_preds = certainty_le.inverse_transform(certainty_preds)

In [None]:
# Join the four decoded labels of each sample into one
# "type-polarity-tense-certainty" string.
predictions = [
    type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred
    for type_pred, polarity_pred, tense_pred, certainty_pred
    in zip(type_preds, polarity_preds, tense_preds, certainty_preds)
]

In [None]:
# Load the submission template and fill its 'label' column with the combined
# prediction strings (assigned positionally, so the list length must match the
# number of rows in the template).
submit = pd.read_csv('sample_submission.csv')
submit['label'] = predictions

In [None]:
# Display the test frame to compare sentences against the predicted labels.
df_test

Unnamed: 0,ID,문장
0,TEST_0000,"장욱진의 ＇가족＇은 허물 없는 가족애를, 처음 공개되는 정약용의 ＇정효자전＇과 ＇정..."
1,TEST_0001,"조지 W 부시, 버락 오바마 전 대통령도 전쟁 위험 때문에 버린 카드다."
2,TEST_0002,지난해 1분기 128억원이었던 영업이익이 올해 1분기 505억원으로 급증했다.
3,TEST_0003,수상 작가와 맺으려던 계약서 내용 가운데 일부가 ＇독소 조항＇으로 해석돼 수정을 요...
4,TEST_0004,결국 최근 KDB산업은행은 대규모 손실 위기에 닥친 에어부산에 140억원 금융지원을...
...,...,...
7085,TEST_7085,"2020 세계국가편람 모바일 앱은 세계 216개국의 국가개황과 주요 경제지표, 사회..."
7086,TEST_7086,탈세계화 징후들이 반갑지 않은 이유다.
7087,TEST_7087,"틱톡은 6월 ＇인터넷 안전의 달＇을 맞아 올바른 개인정보 보호 관리 방법, 앱 내 ..."
7088,TEST_7088,만약 3개월 간 채굴자들의 투표를 거쳐 2/3 이상의 해시파워가 ＇채굴세＇ 도입에 ...


In [None]:
# Display the filled-in submission frame as a sanity check before saving.
submit

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실
...,...,...
7085,TEST_7085,사실형-긍정-현재-확실
7086,TEST_7086,추론형-긍정-현재-확실
7087,TEST_7087,사실형-긍정-현재-확실
7088,TEST_7088,추론형-긍정-미래-확실


In [None]:
# Write the submission without the index column; 'utf-8-sig' prepends a UTF-8
# byte-order mark to the file.
submit.to_csv('submit_bert_focal.csv', index=False, encoding='utf-8-sig')