In [None]:
import warnings 
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
import nltk
import datetime

from gensim.models.fasttext import FastText
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [None]:
# Load the data: train.csv, test_x.csv and sample_submission.csv from one directory.
DATA_DIR = '/content/drive/MyDrive/[데이콘] 소설 작가 분류 AI 경진대회/data'

raw_train = pd.read_csv(f'{DATA_DIR}/train.csv')
raw_test = pd.read_csv(f'{DATA_DIR}/test_x.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Work on copies so the frames as loaded from disk are not modified by preprocessing.
train, test = raw_train.copy(), raw_test.copy()

In [None]:
def alpha_num(text):
    """Return `text` with every character other than A-Z, a-z, 0-9 and space removed."""
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)


# Lowercase English stop words removed by remove_stopwords().
# NOTE: this assignment rebinds the name `stopwords`, which the import cell also
# binds via `from nltk.corpus import stopwords`; once this cell has run, the name
# refers to the list below.
# Entries containing an apostrophe (e.g. "he's") can only match tokens that still
# carry the apostrophe when remove_stopwords() is called.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


def remove_stopwords(text):
    """Remove stop words from a whitespace-separated string.

    Tokens are compared case-insensitively against the module-level
    `stopwords` collection; surviving tokens keep their original casing and
    are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string ("" when every token is a stop word or `text`
        has no tokens).
    """
    # Build the lookup once per call so each token costs one hash probe instead
    # of a scan over the whole list; reading `stopwords` here (not at definition
    # time) keeps later edits to the list effective.
    stop_set = set(stopwords)
    # str.split() with no argument already discards all surrounding whitespace,
    # so tokens need no further stripping.
    return " ".join(token for token in text.split() if token.lower() not in stop_set)

In [None]:
def _clean_text(series):
    """Lowercase a text Series, strip non-alphanumeric characters, then drop stop words."""
    # NOTE(review): alpha_num removes apostrophes before remove_stopwords runs, so
    # stop-word entries such as "he's" cannot match at that point — confirm the
    # ordering is intended.
    return series.str.lower().apply(alpha_num).apply(remove_stopwords)


train['text'] = _clean_text(train['text'])
test['text'] = _clean_text(test['text'])

In [None]:
# Pull plain arrays out of the frames: cleaned text as input, `author` as an (n, 1) target.
X_train, X_test = train['text'].values, test['text'].values
y = train['author'].values.reshape(-1, 1)
print(*(arr.shape for arr in (X_train, X_test, y)))

(54879,) (19617,) (54879, 1)


In [None]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
vocab_size = 20000      # Tokenizer `num_words` and the Embedding layer's input dimension
max_length = 500        # `maxlen` for pad_sequences and `input_length` for Embedding
padding_type = 'post'   # `padding` argument for pad_sequences

In [None]:
# Fit the tokenizer on the training texts only; test texts reuse the same index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='oov')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert both corpora to integer sequences with the fitted tokenizer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad every sequence to `max_length` so both sets become 2-D arrays.
x_train, x_test = (
    pad_sequences(sequences, maxlen=max_length, padding=padding_type)
    for sequences in (train_sequences, test_sequences)
)
print(x_train.shape, x_test.shape)

(54879, 500) (19617, 500)


# 1. FastText

In [None]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 15.6MB/s eta 0:00:01[K     |█████████▌                      | 20kB 20.2MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 22.2MB/s eta 0:00:01[K     |███████████████████             | 40kB 14.8MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 9.4MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 10.8MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 4.6MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3038347 sha256=1bc347b881c5d19c7aef91aa4ce1d4d65ed07f57edfcc1e3d544be6413416de1
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91

In [None]:
import fasttext
import fasttext.util

In [None]:
# Load the FastText model file, printing a timestamp before and after the load
# (the recorded output shows this step taking roughly a minute and a half).
FASTTEXT_MODEL_PATH = '/content/drive/MyDrive/FastText/cc.en.300.bin'

print(f"== LOAD fasttext START at {datetime.datetime.now()}")
ft = fasttext.load_model(FASTTEXT_MODEL_PATH)
print(f"== LOAD fasttext   END at {datetime.datetime.now()}")

== LOAD fasttext START at 2020-11-29 18:43:10.505883
== LOAD fasttext   END at 2020-11-29 18:44:39.178920




In [None]:
# Zero-filled embedding table: len(word_index) + 1 rows, `embedding_dim` columns.
embedding_dim = 300
embedding_matrix = np.zeros(shape=(len(word_index) + 1, embedding_dim))

In [None]:
# Build the embedding table: row `idx` receives the FastText vector of its word.
for word, idx in word_index.items():
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None:
        # No vector returned: leave the zero row in place.
        continue
    embedding_matrix[idx] = embedding_vector

In [None]:
import tensorflow.keras.backend as K


def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``, computed with Keras backend ops."""
    softplus_x = K.softplus(x)
    return x * K.tanh(softplus_x)


# Number of target classes; used as the width of the model's softmax output layer.
n_class = 5

In [None]:
def get_model() :
    """Build and compile the Conv1D + bidirectional-LSTM text classifier.

    Layers, in order: Embedding initialised from `embedding_matrix`,
    Conv1D (64 filters, kernel size 3, mish activation), AveragePooling1D,
    Bidirectional LSTM with 64 units, Dropout(0.2), and a softmax Dense layer
    with `n_class` outputs.

    Reads the notebook-level names `vocab_size`, `embedding_dim`, `max_length`,
    `embedding_matrix`, `mish`, `n_class` and `Adam`.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        Adam optimizer with learning rate 0.005).
    """
    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import (
        AveragePooling1D,
        Bidirectional,
        Conv1D,
        Dense,
        Dropout,
        Embedding,
        LSTM,
    )

    # The Embedding layer is declared with `vocab_size` rows, so take exactly that
    # many rows from the table instead of a hard-coded 20000; the two stay in sync
    # if `vocab_size` is changed.
    initial_embeddings = embedding_matrix[:vocab_size]

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, weights=[initial_embeddings], input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=3, padding='valid', activation=mish, strides=1))
    model.add(AveragePooling1D())
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.2))
    model.add(Dense(n_class, activation='softmax'))

    # Sparse loss: targets are passed as integer class ids rather than one-hot rows.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.005))

    return model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the padded training sequences for validation, with a fixed seed.
# NOTE(review): the split is not stratified by `y` — confirm that is acceptable.
(X_train_holdout, X_test_holdout,
 y_train_holdout, y_test_holdout) = train_test_split(
    x_train,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Stop once val_loss has failed to improve for 2 epochs and restore the best weights.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Train a freshly built model on the hold-out split; `es` ends training early
# based on the validation loss.
model = get_model()
history = model.fit(
    X_train_holdout,
    y_train_holdout,
    validation_data=(X_test_holdout, y_test_holdout),
    batch_size=16,
    epochs=20,
    callbacks=[es],
)

Epoch 1/20
Epoch 2/20
  69/2744 [..............................] - ETA: 14:24 - loss: 0.7139

KeyboardInterrupt: ignored

In [None]:
# Print the layer-by-layer summary of `model`.
# NOTE(review): the output recorded below this cell lists only Embedding ->
# Bidirectional -> Dense, which does not match the layers added in get_model()
# (Conv1D, AveragePooling1D, Dropout); it appears to come from a different model
# definition — re-run after a kernel restart to refresh it.
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 300)          6000000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 500, 128)          140544    
_________________________________________________________________
dense_6 (Dense)              (None, 500, 5)            645       
Total params: 6,141,189
Trainable params: 6,141,189
Non-trainable params: 0
_________________________________________________________________
