In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os

from matplotlib import rcParams, pyplot as plt
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

## 1. 데이터불러오기

In [6]:
# 데이터 불러오기

raw_train = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/train.csv')
raw_test = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/test_x.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/sample_submission.csv')

In [7]:
train = raw_train.copy()
test = raw_test.copy()

## 2. 텍스트전처리(토큰화 + 패딩화)

In [8]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)


def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)


stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [9]:
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)

In [10]:
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values.reshape(-1, 1)
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879, 1)


In [11]:
vocab_size = 20000
padding_type='post'
max_length = 500

In [12]:
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

x_train = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
x_test = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(x_train.shape, x_test.shape)

(54879, 500) (19617, 500)


## 3. LSTM 모델 학습



In [16]:
target_col = 'author'
n_class = 5
seed = 42
embedding_dim = 32

In [19]:
# LSTM 모델 생성

def get_model():

    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dense(n_class, activation = 'softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=.02), metrics = ['accuracy'])

    return model

In [20]:
# 3-Fold 시행
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=3, random_state=123, shuffle=True)

In [21]:
validation_pred = np.zeros((y.shape[0], n_class))
test_pred = np.zeros((test.shape[0], n_class))

i = 0
for train_idx, val_idx in tqdm_notebook(cv.split(x_train, y)):
    print("{}-Fold" .format(i+1))
    X_train = x_train[train_idx]
    y_train = y[train_idx]

    X_validation = x_train[val_idx]
    y_validation = y[val_idx]

    LSTM = get_model()

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

    LSTM.fit(X_train, y_train,
             epochs           = 20,
             callbacks        = [es],
             batch_size       = 32,
             validation_data  = (X_validation, y_validation))
    
    validation_pred[val_idx, :] = LSTM.predict(X_validation)
    test_pred += (LSTM.predict(x_test) / 3)
    print('')

    i += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

1-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
  35/1144 [..............................] - ETA: 1:18 - loss: 1.0829 - accuracy: 0.5518

KeyboardInterrupt: ignored

## 5. Validation 평가

In [None]:
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, confusion_matrix

In [None]:
def model_evaluate(y_pred, y_test):
    print('accuracy_score   : ', accuracy_score(y_pred, y_test))
    print('Confusion Matrix : \n', confusion_matrix(y_pred, y_test))

In [None]:
validation_pred_class = np.argmax(validation_pred, axis=1)

In [None]:
model_evaluate(validation_pred_class, y)

accuracy_score   :  0.6624756282002223
Confusion Matrix : 
 [[ 8497   810   863   945  1070]
 [ 1004  4510   619   595   285]
 [ 1235   789  7190  1428  1157]
 [ 1462   864  1803 11650   784]
 [ 1037   249  1079   445  4509]]


## 6. Test set 예측 및 제출결과저장

In [None]:
sample_submission[['0','1','2','3','4']] = test_pred
sample_submission.to_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/02_나의코드/01_LSTM/Test예측결과/submission_05.csv', index = False, encoding = 'utf-8')