In [None]:
import warnings 
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
import nltk
import datetime

from gensim.models.fasttext import FastText
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
# 데이터 불러오기

raw_train = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/train.csv')
raw_test = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/test_x.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data/sample_submission.csv')

In [95]:
train = raw_train.copy()
test = raw_test.copy()

In [96]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)


def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)



stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]

In [97]:
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)

In [98]:
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values.reshape(-1, 1)
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879, 1)


In [99]:
max_length = 500
padding_type='post'
vocab_size= 10000

In [100]:
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

x_train = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
x_test = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(x_train.shape, x_test.shape)

(54879, 500) (19617, 500)


## 1. Glove 사용하기

In [101]:
import numpy as np
embedding_dict = dict()
f = open('/content/drive/MyDrive/Glove/glove.6B.300d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    embedding_dict[word] = word_vector_arr
f.close()
print('%s개의 Embedding vector가 있습니다.' % len(embedding_dict))

400000개의 Embedding vector가 있습니다.


In [102]:
embedding_dim = 300
embedding_matrix = np.zeros( (len(word_index)+1, embedding_dim) )

In [103]:
# 임베딩테이블 만들기
for word, idx in word_index.items():
    embedding_vector = embedding_dict.get(word)

    if embedding_vector is not None :
        embedding_matrix[idx] = embedding_vector
    else :
        embedding_matrix[idx] = np.zeros((1, embedding_dim))

In [104]:
# 임베딩 테이블
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.043343  ,  0.40163001,  0.055433  , ..., -0.037958  ,
        -0.74250001,  0.097346  ],
       [ 0.0083903 ,  0.28769001, -0.23466   , ..., -0.66409999,
         0.10303   ,  0.1219    ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [105]:
target_col = 'author'
n_class = 5
seed = 42

In [106]:
import tensorflow.keras.backend as K
# Mish 활성화 함수
def mish(x):
    return x * K.tanh(K.softplus(x))

In [92]:
def get_model() :

    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D, Dropout, Flatten, MaxPool1D, GlobalAveragePooling1D, Flatten

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix[0:vocab_size]], input_length=max_length))
    model.add(Dropout(0.2))
    model.add(Conv1D(50, 3, padding='same', activation=mish, strides=1))
    model.add(GlobalAveragePooling1D())
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(50, activation=mish))
    model.add(Dropout(0.2))
    model.add(Dense(n_class, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.002))

    return model


## 2. Hold Out

In [107]:
from sklearn.model_selection import train_test_split

In [108]:
X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = train_test_split(x_train, y, test_size=0.2, random_state=123)

In [109]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, restore_best_weights=True)

In [110]:
model = get_model()
history = model.fit(X_train_holdout, y_train_holdout, 
                    epochs           = 20,
                    callbacks        = [es],
                    batch_size       = 16,
                    validation_data = (X_test_holdout, y_test_holdout))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping


## 3. K-Fold

In [93]:

# 5-Fold 시행
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

In [94]:
validation_pred = np.zeros((y.shape[0], n_class))
test_pred = np.zeros((test.shape[0], n_class))

i = 0
for train_idx, val_idx in tqdm_notebook(cv.split(x_train, y)):
    print("{}-Fold" .format(i+1))
    X_train = x_train[train_idx]
    y_train = y[train_idx]

    X_validation = x_train[val_idx]
    y_validation = y[val_idx]

    CNN = get_model()

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, restore_best_weights=True)

    CNN.fit(X_train, y_train,
             epochs           = 20,
             callbacks        = [es],
             batch_size       = 16,
             validation_data  = (X_validation, y_validation))
    
    validation_pred[val_idx, :] = CNN.predict(X_validation)
    test_pred += (CNN.predict(x_test) / 5)
    print('')

    i += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

1-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping

2-Fold
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: ignored

In [66]:
CNN.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 100)          1000000   
_________________________________________________________________
dropout_29 (Dropout)         (None, 500, 100)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 500, 50)           15050     
_________________________________________________________________
global_average_pooling1d_6 ( (None, 50)                0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 50)                0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 50)              

In [67]:
from sklearn.metrics import log_loss, recall_score, confusion_matrix

In [68]:
y_pred = np.argmax(validation_pred, axis=1)
y_real = y.reshape((1, -1))[0]

In [69]:
confusion_matrix(y_pred, y_real)

array([[10122,   688,   921,   881,   960],
       [  535,  5116,   315,   387,   158],
       [ 1039,   765,  8561,  1214,   861],
       [  918,   490,  1013, 12238,   475],
       [  621,   163,   744,   343,  5351]])

In [70]:
# log_loss

log_loss(pd.get_dummies(y_real), validation_pred)

0.6601250055621993

In [71]:
# 결과 저장
sample_submission[['0','1','2','3','4']] = test_pred
sample_submission.to_csv('/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/02_나의코드/01_딥러닝/Test예측결과/submission_14.csv', index = False, encoding = 'utf-8')