# setting

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make the project folder the working directory; later cells read and write with relative paths.
%cd /content/drive/MyDrive/dacon/aitext

/content/drive/MyDrive/dacon/aitext


In [None]:
!pip install -U transformers sentencepiece -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

from keras.utils import pad_sequences

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel

In [None]:
def set_seed(seed = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    import random  # local import so this cell does not depend on the import cell

    random.seed(seed)
    np.random.seed(seed)  # this also affects pandas' `sample` function
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [None]:
# Apply the default seed (42) before any random sampling happens below.
set_seed()

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# load data

In [None]:
# Read the train and test tables from the current working directory.
df_train = pd.read_csv(filepath_or_buffer = './train.csv')
df_test = pd.read_csv(filepath_or_buffer = './test.csv')

# preprocess

In [None]:
# Two independent accumulators: human-written sentences and sampled AI-written sentences.
human_reviews, ai_reviews = [], []

클래스 균형을 맞추기 위해, 각 행에서 AI가 작성한 문장(가짜 데이터) 3개 중 하나만 무작위로 선택(Random Choice)합니다.

In [None]:
# For each training row keep the human-written sentence plus one randomly
# drawn AI-written sentence. `label` is used as the positional column index
# of the human-written sentence.
for row_idx, human_col in enumerate(df_train['label']):

    ai_candidates = list(range(1, 5))
    ai_candidates.remove(human_col)  # drop the human-written column
    ai_col = np.random.choice(ai_candidates)  # pick exactly one AI-written column

    human_reviews.append(df_train.iloc[row_idx, human_col])
    ai_reviews.append(df_train.iloc[row_idx, ai_col])

In [None]:
# Recast the multi-class task as binary classification:
# human-written sentences get label 1, AI-written sentences get label 0.
human_labels = [1 for _ in human_reviews]
ai_labels = [0 for _ in ai_reviews]

In [None]:
# Human-written samples first, AI-written samples second; labels follow the same order.
reviews = [*human_reviews, *ai_reviews]
labels = [*human_labels, *ai_labels]

# tokenizer & model

In [None]:
# Load the pretrained tokenizer published under the 'skt/kogpt2-base-v2' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

In [None]:
class GPTModel(nn.Module):
    """Pretrained 'skt/kogpt2-base-v2' backbone with a bias-free linear classification head.

    The head is applied to every token's hidden vector; the logits taken at
    the last non-padded position of each sequence are returned.
    """

    def __init__(self, num_classes = 2):
        """Load the pretrained backbone and create the classification head.

        Args:
            num_classes: Number of logits produced per sequence. Defaults to 2.
        """
        super(GPTModel, self).__init__()
        self.num_classes = num_classes
        self.gpt = AutoModel.from_pretrained('skt/kogpt2-base-v2')
        # The head is applied to the backbone output, so 768 has to equal
        # the size of the backbone's hidden vectors.
        self.fc = nn.Linear(768, num_classes, bias = False)

    def forward(self, input_ids, attention_mask, labels = None):
        """Return one row of class logits per input sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same layout as ``input_ids``; non-zero marks real
                tokens and zero marks padding. Padding is assumed to sit at
                the end of each sequence (post-padding).
            labels: Unused; accepted for interface compatibility.

        Returns:
            Tensor of shape (batch, num_classes) holding the logits read at
            the last real token of every sequence.
        """
        output = self.gpt(input_ids = input_ids,
                          attention_mask = attention_mask)

        # output[0] is the per-token hidden state; project each token to class logits.
        token_logits = self.fc(output[0])

        # One index per *sample*. The previous code used
        # torch.arange(self.num_classes), which only selected the right rows
        # when the batch size happened to equal num_classes.
        batch_idx = torch.arange(input_ids.size(0), device = token_logits.device)

        # Read the logits at the last real token rather than at position -1,
        # which is a padding position for every sequence shorter than the
        # padded length. clamp keeps an all-padding row at index 0.
        last_idx = attention_mask.long().sum(dim = 1).clamp(min = 1) - 1
        last_idx = last_idx.to(token_logits.device)

        return token_logits[batch_idx, last_idx]

In [None]:
# Build the classifier and move its parameters onto the selected device.
model = GPTModel().to(device)

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

# dataloader

In [None]:
# Split every review into sub-word tokens.
tokenized_texts = [tokenizer.tokenize(text) for text in reviews]

# The longest training sequence defines the padded length.
MAX_LEN = max(len(tokens) for tokens in tokenized_texts)

# Map tokens to vocabulary ids, then pad/truncate at the end of each sequence to MAX_LEN.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen = MAX_LEN,
    dtype = 'long',
    truncating = 'post',
    padding = 'post',
)

# Mask value is 1.0 where the token id is positive and 0.0 elsewhere.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in tqdm(input_ids)]

train_inputs = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)
train_labels = torch.tensor(labels)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Number of sequences per mini-batch.
BATCH_SIZE = 2

# Bundle ids, masks and labels together and draw training batches in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(data_source = train_data)
train_dataloader = DataLoader(
    dataset = train_data,
    batch_size = BATCH_SIZE,
    sampler = train_sampler,
)

# training setting

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of this implementation.
# weight_decay = 0.0 matches the default of the transformers class
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8, weight_decay = 0.0)
# Cross-entropy over the class logits returned by the model.
criterion = nn.CrossEntropyLoss()
# Number of full passes over the training data.
epochs = 8



In [None]:
# accuracy metric
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of integer class ids; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis = 1).ravel()
    expected_classes = labels.ravel()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)

# train

In [None]:
# Fine-tune the classifier for `epochs` passes over the training loader and
# report the mean loss / accuracy of each epoch.
for epoch in range(epochs):
    print('--------------------------------------------------------------------------')
    print(f'Epoch "{epoch+1}"')

    print('Train Mode:', end = " ")
    total_loss, train_accuracy = 0.0, 0.0
    model.train()

    for batch in tqdm(train_dataloader):
        # Move ids, masks and labels onto the same device as the model.
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # Reset gradients for every step. Previously they were cleared only
        # once per epoch, so each optimizer step used the sum of the gradients
        # of all earlier batches in that epoch.
        optimizer.zero_grad()

        outputs = model(input_ids = b_input_ids,
                        attention_mask = b_input_mask)

        # One row of class logits per sample.
        logits = outputs.view(-1, outputs.size(-1))

        loss = criterion(logits, b_labels)

        total_loss += loss.item()

        loss.backward()

        # Accuracy is computed on CPU copies detached from the graph.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        train_accuracy += flat_accuracy(logits, label_ids)

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss/len(train_dataloader)
    avg_train_acc = train_accuracy/len(train_dataloader)

    print('Average training loss: {0:.2f}'.format(avg_train_loss), end = " ")
    print('Average training accuracy: {0:.2f}'.format(avg_train_acc))




--------------------------------------------------------------------------
Epoch "1"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 1.29 Average training accuracy: 0.68
--------------------------------------------------------------------------
Epoch "2"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.90 Average training accuracy: 0.78
--------------------------------------------------------------------------
Epoch "3"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.66 Average training accuracy: 0.91
--------------------------------------------------------------------------
Epoch "4"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.68 Average training accuracy: 0.90
--------------------------------------------------------------------------
Epoch "5"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.75 Average training accuracy: 0.85
--------------------------------------------------------------------------
Epoch "6"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.91 Average training accuracy: 0.88
--------------------------------------------------------------------------
Epoch "7"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.50 Average training accuracy: 0.96
--------------------------------------------------------------------------
Epoch "8"
Train Mode: 

  0%|          | 0/50 [00:00<?, ?it/s]

Average training loss: 0.37 Average training accuracy: 0.94


In [None]:
# Persist only the model parameters (state_dict) next to the data files.
# NOTE(review): despite the file name, this is the state after the final epoch;
# no validation-based checkpoint selection is visible in this notebook.
torch.save(model.state_dict(), './best_model.pth')

# predict

- test 데이터도 train 데이터와 동일한 프로세스로 처리
- train 데이터와 달리 test 데이터는 4개의 문장 중 가장 인간이 작성했을 법한 문장을 예측하는 것이기 때문에 순서대로 배치
- 4개의 문장 중 가장 score가 높은 2개 선택

In [None]:
# Flatten the test table row by row: the four candidate sentences of each row
# (positional columns 1-4) are appended in order, so every consecutive group
# of four entries belongs to one test row.
test_reviews = [
    sentence
    for row_idx in range(df_test.shape[0])
    for sentence in df_test.iloc[row_idx, [1, 2, 3, 4]]
]

In [None]:
# Tokenize the test sentences with the same tokenizer used for training.
tokenized_texts = [tokenizer.tokenize(text) for text in test_reviews]

# Map tokens to ids and pad/truncate at the end to the training MAX_LEN.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen = MAX_LEN,
    dtype = 'long',
    truncating = 'post',
    padding = 'post',
)

# Mask value is 1.0 where the token id is positive and 0.0 elsewhere.
attention_masks = [[float(token_id > 0) for token_id in ids] for ids in input_ids]

test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(attention_masks)

# Sequential sampling keeps the four sentences of each test row adjacent and in order.
test_data = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(data_source = test_data)
test_dataloder = DataLoader(
    dataset = test_data,
    batch_size = BATCH_SIZE,
    sampler = test_sampler,
)

0과 1을 예측하는 것이 아니기 때문에, 클래스 예측값 대신 모델이 출력한 Score(softmax 이전의 logit)를 저장

In [None]:
# Switch to evaluation mode.
model.eval()

preds = []

# Inference only: no_grad stops autograd from recording a graph for every
# test batch, which the previous version did needlessly.
with torch.no_grad():
    for batch in tqdm(test_dataloder):

        batch = tuple(t.to(device) for t in batch) # batch to cuda

        b_input_ids, b_input_mask = batch

        outputs = model(input_ids = b_input_ids,
                        attention_mask = b_input_mask)

        # Keep the raw two-class logits of every sentence, in loader order.
        logits = outputs.view(-1, 2).cpu().numpy()
        preds.extend(logits)

preds = np.array(preds)

  0%|          | 0/2200 [00:00<?, ?it/s]

가장 높은 Score를 가진 2 문장 label 선택

In [None]:
pred_labels = []

# Walk through the predictions four sentences (one test row) at a time and
# pick the two sentences with the highest class-1 ("human") score.
for i in range(0, preds.shape[0], 4):

    # Work on a copy: the slice is a view into `preds`, so masking the winner
    # in place used to overwrite `preds` and made re-running this cell return
    # different labels.
    # NOTE(review): ranking uses the raw class-1 logit only; ranking by
    # logit[1] - logit[0] would match the softmax probability order - confirm intent.
    group_scores = preds[i:i+4, 1].copy()

    label1 = np.argmax(group_scores) # highest score

    group_scores[label1] = -np.inf # exclude the winner from the second pass

    label2 = np.argmax(group_scores) # second-highest score

    # Sentence positions are reported 1-based and concatenated, e.g. "34".
    label = str(label1 + 1) + str(label2 + 1)

    pred_labels.append(label)

# submission

In [None]:
# Load the submission template, set its `label` column to the predictions, and display it.
submit = pd.read_csv('./sample_submission.csv').assign(label = pred_labels)
submit

Unnamed: 0,id,label
0,TEST_0000,34
1,TEST_0001,13
2,TEST_0002,24
3,TEST_0003,23
4,TEST_0004,32
...,...,...
1095,TEST_1095,24
1096,TEST_1096,24
1097,TEST_1097,41
1098,TEST_1098,41


In [None]:
# Write the submission file; index = False keeps the DataFrame index out of the CSV.
submit.to_csv('./binary_clf_gpt_submission.csv', index = False)