In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os

from matplotlib import rcParams, pyplot as plt
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords



In [3]:
# 데이터 불러오기

raw_train = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/train.csv')
raw_test = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/test_x.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/sample_submission.csv')

In [4]:
train = raw_train.copy()
test = raw_test.copy()

In [6]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)


def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)


stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [7]:
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)

In [8]:
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values.reshape(-1, 1)
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879, 1)


In [9]:
max_length = 500
padding_type='post'
vocab_size= 20000

In [10]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = 'OOV')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
word_count = tokenizer.word_counts

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

x_train = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
x_test = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(x_train.shape, x_test.shape)

(54879, 500) (19617, 500)


In [None]:
target_col = 'author'
n_class = 5
seed = 42
embedding_dim = 32

In [None]:
import tensorflow.keras.backend as K
# Mish 활성화 함수
def mish(x):
    return x * K.tanh(K.softplus(x))

In [None]:
def get_model() :

    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional, Flatten, MaxPool1D, GlobalAveragePooling1D
    import tensorflow as tf


    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    
    model.add(Dropout(0.5))
    
    model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation=mish, strides=1))
    model.add(GlobalAveragePooling1D())

    model.add(Flatten())

    model.add(Dropout(0.5))

    model.add(Dense(128, activation=mish))

    model.add(Dropout(0.5))
    
    model.add(Dense(n_class, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.01))

    return model


In [None]:

# 3-Fold 시행
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

In [None]:
validation_pred = np.zeros((y.shape[0], n_class))
test_pred = np.zeros((test.shape[0], n_class))

i = 0
for train_idx, val_idx in tqdm_notebook(cv.split(x_train, y)):
    print("{}-Fold" .format(i+1))
    X_train = x_train[train_idx]
    y_train = y[train_idx]

    X_validation = x_train[val_idx]
    y_validation = y[val_idx]

    CNN = get_model()

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, restore_best_weights=True)

    CNN.fit(X_train, y_train,
             epochs           = 20,
             callbacks        = [es],
             batch_size       = 64,
             validation_data  = (X_validation, y_validation))
    
    validation_pred[val_idx, :] = CNN.predict(X_validation)
    test_pred += (CNN.predict(x_test) / 5)
    print('')

    i += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

1-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00008: early stopping

2-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping

3-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 00011: early stopping

4-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00008: early stopping

5-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 00009: early stopping




In [None]:
from sklearn.metrics import log_loss, recall_score, confusion_matrix

In [None]:
y_pred = np.argmax(validation_pred, axis=1)
y_real = y.reshape((1, -1))[0]

In [None]:
confusion_matrix(y_pred, y_real)

array([[ 8685,   908,   746,   672,  1120],
       [  708,  4173,   177,   293,   164],
       [ 2098,  1284,  8908,  2637,  2227],
       [ 1178,   747,  1179, 11334,   507],
       [  566,   110,   544,   127,  3787]])

In [None]:
# log_loss

log_loss(pd.get_dummies(y_real), validation_pred)

0.8748929676052163

In [None]:
# 결과 저장
sample_submission[['0','1','2','3','4']] = test_pred
sample_submission.to_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/02_나의코드/01_LSTM/Test예측결과/submission_06.csv', index = False, encoding = 'utf-8')

FileNotFoundError: ignored