In [None]:
# https://keep-steady.tistory.com/37


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import random
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, set_seed

warnings.filterwarnings('ignore')

In [None]:
# Seed every random number generator used in this notebook with one value.
SEED = 42

for seed_fn in (set_seed, np.random.seed, random.seed, torch.manual_seed):
    seed_fn(SEED)

# Turn off cuDNN benchmark mode and request deterministic cuDNN behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [None]:
# Load the train and test tables, using the 'index' column as the row index.
tr, te = (
    pd.read_csv(csv_path, index_col='index')
    for csv_path in ('./data/train_data.csv', './data/test_data.csv')
)

In [None]:
# Keep only letters: every run of Hangul syllables or ASCII letters is kept,
# anything else is dropped and the surviving runs are joined with one space.
import re

reg = re.compile('[가-힣a-zA-Z]+')


def _letters_only(text):
    """Return the letter runs found in ``text`` joined by single spaces."""
    return ' '.join(reg.findall(text))


for frame in (tr, te):
    frame['title'] = frame['title'].map(_letters_only)


In [None]:
# Dump the cleaned training titles to a text file, one title per line.
with open('./data/sent.txt', 'w', encoding='utf-8') as f:
    f.writelines(title + '\n' for title in tr['title'].values)

In [None]:
import konlpy

# Raw string: '\m' is not a valid escape sequence, so the un-prefixed literal
# only works by accident and triggers a warning on recent Python versions.
# The resulting path value is unchanged.
mecab = konlpy.tag.Mecab(r'C:\mecab')

# Callable that splits a sentence into a list of morphemes.
mecab_tokenizer = mecab.morphs

In [None]:
# Read the title corpus back in, split on newlines.
with open('data/sent.txt', 'r', encoding='utf-8') as f:
    data = f.read().split('\n')

# Apply the mecab morpheme tokenizer sentence by sentence and keep one
# morpheme list per sentence. (A stopword filter could be applied to each
# list here; it is currently disabled.)
total_morph = [mecab_tokenizer(sentence) for sentence in data]

In [None]:
# Sanity check: show the morpheme list produced for the first sentence.
total_morph[0]

In [None]:
# Write each sentence's morphemes as one space-separated line.
with open('data/after_mecab.txt', 'w', encoding='utf-8') as f:
    f.writelines(' '.join(morphs) + '\n' for morphs in total_morph)

In [None]:
from tokenizers import BertWordPieceTokenizer

# Alias for the tokenizer class that is instantiated below.
how_to_tokenize = BertWordPieceTokenizer

# Build an untrained WordPiece tokenizer that neither strips accents nor
# lower-cases its input.
tokenizer_options = {'strip_accents': False, 'lowercase': False}
tokenizer = how_to_tokenize(**tokenizer_options)

In [None]:
# Settings for training the WordPiece vocabulary.
corpus_file = ['data/after_mecab.txt']  # list of corpus file paths
vocab_size, limit_alphabet, min_frequency = 32000, 6000, 1

In [None]:
# Train the tokenizer vocabulary on the corpus file(s).
train_options = dict(
    files=corpus_file,
    vocab_size=vocab_size,
    min_frequency=min_frequency,    # minimum occurrence count for a token
    limit_alphabet=limit_alphabet,  # must be removed when training a ByteLevelBPETokenizer
    show_progress=True,
)
tokenizer.train(**train_options)

In [None]:
import os

# The output directory is not created by save_model, so make sure it exists;
# otherwise a fresh checkout fails here with "No such file or directory".
os.makedirs('vocab', exist_ok=True)

# Persist the trained vocabulary into the 'vocab' directory.
tokenizer.save_model('vocab')

In [None]:
from transformers import DistilBertTokenizerFast

# Load the vocabulary saved in 'vocab' as a transformers tokenizer.
# The transformers keyword is `do_lower_case` (default True); `lowercase`
# is the `tokenizers`-library spelling and is not an option here, which left
# lower-casing switched on even though the vocabulary was trained cased.
tokenizer = DistilBertTokenizerFast.from_pretrained('vocab',
                                                    strip_accents=False,  # must be False for a cased model
                                                    do_lower_case=False)

In [None]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification

# Number of topic classes predicted by the classification head.
NUM_LABELS = 7

# Randomly initialised DistilBERT sized to the custom vocabulary. Passing
# num_labels lets the model build its own 7-way head, so the config and the
# weights agree (swapping `model.classifier` by hand left config.num_labels
# at its default of 2).
config = DistilBertConfig(vocab_size=tokenizer.vocab_size, num_labels=NUM_LABELS)
print(config)
model = DistilBertForSequenceClassification(config)

# Move the model to the GPU (the rest of the notebook assumes CUDA).
model.cuda()

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset yielding one tokenized title plus its topic label.

    Args:
        data: Table indexed with ``.iloc`` whose rows expose the ``'title'``
            and ``'topic_idx'`` fields.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``,
            each holding a batch of one sequence.
        max_seq_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its ids, mask and label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (first row of
            the tokenizer output) and ``'labels'`` (0-d integer numpy array).
        """
        record = self.data.iloc[index]
        document, label = str(record['title']), int(record['topic_idx'])
        inputs = self.tokenizer(
            document,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_seq_len,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': np.array(label, dtype=np.int_)}
    
class TestDataset(Dataset):
    """Map-style dataset yielding one tokenized title (no label).

    Args:
        data: Table indexed with ``.iloc`` whose rows expose the ``'title'``
            field.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``,
            each holding a batch of one sequence.
        max_seq_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its ids and attention mask."""
        record = self.data.iloc[index]
        document = str(record['title'])
        inputs = self.tokenizer(
            document,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_seq_len,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return {'input_ids': input_ids,
                'attention_mask': attention_mask}

In [None]:
# Use the tokenization result to decide on the sentence max_seq_len:
# tokenize every training title and summarise the token counts.
token_len = tr['title'].apply(tokenizer.tokenize)
token_len.apply(len).describe()

In [None]:
# Training hyper-parameters: number of epochs, mini-batch size and the
# token length each title is padded / truncated to.
epochs, batch_size, max_seq_len = 10, 8, 20

In [None]:
# A single hold-out split is used here; k-fold validation would likely be better.
from sklearn.model_selection import train_test_split

# Hold out 15% of the training table for validation, seeded for repeatability.
train, val = train_test_split(tr, random_state=SEED, test_size=0.15)

In [None]:
# Datasets and loaders for the train / validation / test splits.
shared_loader_opts = dict(batch_size=batch_size, num_workers=0)

# Training batches are shuffled.
train_ds = TrainDataset(train, tokenizer, max_seq_len=max_seq_len)
tr_loader = DataLoader(train_ds, shuffle=True, **shared_loader_opts)

# Validation batches keep their order.
val_ds = TrainDataset(val, tokenizer, max_seq_len=max_seq_len)
val_loader = DataLoader(val_ds, shuffle=False, **shared_loader_opts)

# Test data has no labels; fixed batch size of 8.
test_ds = TestDataset(te, tokenizer, max_seq_len=max_seq_len)
test_loader = DataLoader(test_ds, batch_size=8)

In [None]:
# Optimizer, learning-rate schedule and loss function.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Train for `epochs` epochs. After every epoch the model is scored on the
# validation loader and a deep copy of the best-scoring model is kept in
# `best_model` for inference.
best_score = -float('inf')
best_model = None

for e in range(epochs):
    # ---- training pass ----
    tr_loss = []
    model.train()
    for batch in tr_loader:
        optimizer.zero_grad()
        # The loader already yields tensors, so just cast and move them to
        # the GPU instead of re-wrapping them with torch.tensor().
        ids = batch['input_ids'].long().cuda()
        atts = batch['attention_mask'].float().cuda()
        labels = batch['labels'].long().cuda()
        pred = model(ids, attention_mask=atts)
        # pred[0] holds the logits.
        loss = loss_fn(pred[0], labels)

        loss.backward()
        optimizer.step()
        tr_loss.append(loss.item())

    # Advance the LR schedule once per epoch, AFTER the optimizer updates.
    # Stepping it before the first optimizer.step() skips the first value of
    # the schedule.
    scheduler.step()

    # ---- validation pass ----
    model.eval()
    preds = []
    trues = []
    # No gradients are needed for evaluation; this saves memory and time.
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].long().cuda()
            atts = batch['attention_mask'].float().cuda()
            labels = batch['labels']
            trues += list(labels.numpy())
            pred = model(ids, attention_mask=atts)
            preds += list(np.argmax(pred[0].cpu().numpy(), 1))

    trues = np.array(trues)
    preds = np.array(preds)
    # Validation accuracy: fraction of exactly matching predictions.
    acc = np.sum(trues == preds) / len(trues)

    if best_score < acc:
        best_score = acc
        # Snapshot the whole model so later epochs cannot overwrite it.
        best_model = copy.deepcopy(model)

    print(e, 'tr_loss:', np.mean(tr_loss), 'val_score:', acc)

In [None]:
# Deliberate stop: aborts a "Run All" here so the inference cells below only
# run when executed on purpose. Raising a bare string is itself an error
# (exceptions must derive from BaseException), so raise a real exception.
raise RuntimeError('eo')

In [None]:
# Predict the topic index of every test title with the best validation model.
preds = []
# Put the model that is actually used for prediction into eval mode.
best_model.eval()

# Inference needs no gradients; disabling them saves memory and time.
with torch.no_grad():
    for b in tqdm(test_loader):
        # The loader already yields tensors; cast and move them to the GPU.
        ids = b['input_ids'].long().cuda()
        atts = b['attention_mask'].float().cuda()
        pred = best_model(ids, attention_mask=atts)
        # pred[0] holds the logits; take the arg-max class per row.
        preds += list(np.argmax(pred[0].cpu().numpy(), 1))

In [None]:
# Fill the sample submission with the predicted topic indices and preview it.
sub = (
    pd.read_csv('./data/sample_submission.csv', index_col='index')
    .assign(topic_idx=preds)
)
sub.head(20)

In [None]:
# sub.to_csv('./custom_Distilbert.csv')