https://dacon.io/competitions/official/235670/codeshare/1771?page=1&dtype=recent&ptype=pub

위 코드를 참고하여 데이콘 대회 데이터를 적용해본 NLP 연습



# 작업환경 설정 및 데이터 불러오기

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
import re
import os
import tqdm

import lightgbm as lgbm
import xgboost as xgb

from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
tf.random.set_seed(42)

from keras.initializers import Constant

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
import string

In [None]:
# Load the competition CSV files (paths are relative to the working directory).
# Later cells read the 'text' column of train/test and the 'author' column of train.
train = pd.read_csv('train.csv', encoding = 'utf-8')
test = pd.read_csv('test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('sample_submission.csv', encoding = 'utf-8')

# 전처리

In [None]:
# Convert the text column to lowercase
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

In [None]:
#일부 접어 전처리
# Ordered (pattern, replacement) rules used by decontraction().
# Order matters: whole-word contractions come before the generic suffix rules,
# and every compound form ("...'ve") comes before its shorter prefix so that
# the compound rule is actually reachable.
_DECONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"’", "'"),  # normalise the typographic apostrophe first
        (r"won't've", " will not have"),
        (r"won't", " will not"),
        (r"can't've", " can not have"),
        (r"can't", " can not"),
        (r"don't", " do not"),
        (r"ma'am", " madam"),
        (r"let's", " let us"),
        (r"ain't", " am not"),
        (r"shan't", " shall not"),
        # The original pattern here was r"sha\n't" (a newline escape), which
        # could never match the intended "sha'n't" spelling.
        (r"sha'n't", " shall not"),
        (r"o'clock", " of the clock"),
        (r"y'all", " you all"),
        (r"n't've", " not have"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d've", " would have"),
        (r"'d", " would"),
        (r"'ll've", " will have"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontraction(text):
    """Expand common English contractions in *text*.

    The rules are applied one after another in a fixed order. Matching is
    case-sensitive and expects lowercase input. Replacements start with a
    space, so an expanded word may be preceded by two spaces.

    Args:
        text: The string to process.

    Returns:
        The string with every matched contraction replaced by its expansion.
    """
    for pattern, replacement in _DECONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

# Expand contractions in every row of the train and test text columns.
train['text'] = train['text'].apply(decontraction)
test['text'] = test['text'].apply(decontraction)

In [None]:
#불용어 1차 처리 및 부호 제거
# Matches any single character that is not an ASCII letter, a digit or a space.
_NON_ALNUM_OR_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return *text* with everything except ASCII letters, digits and spaces removed."""
    return _NON_ALNUM_OR_SPACE.sub('', text)

# First-pass stopword list: common English function words plus a few
# corpus-specific entries at the end ("odin", "said", "mr", "upon", "one").
stopwords_base = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
    "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
    "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
    "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
    "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
    "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
    "odin", "said", "mr", "upon", "one",
]


def remove_stopwords(text):
    """Drop every whitespace-separated word of *text* that is in stopwords_base.

    The comparison is case-insensitive; surviving words keep their original
    casing and are re-joined with single spaces.
    """
    # str.split() with no argument already yields tokens free of surrounding
    # whitespace, so no per-token strip is needed.
    kept = [word for word in text.split() if word.lower() not in stopwords_base]
    return " ".join(kept)



In [None]:
# Strip punctuation BEFORE removing stopwords. remove_stopwords compares whole
# whitespace-separated tokens, so a stopword with punctuation attached
# (e.g. "the," or "said.") is not recognised until the punctuation is gone.
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)

In [None]:
# Load the second stopword list: NLTK's English stopwords, held as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

# Instantiate the Treebank word tokenizer and the Porter stemmer
tokenizer_tb = TreebankWordTokenizer()
pst = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Second-pass stopword handling: tokenize each document with the Treebank
# tokenizer, drop tokens present in stop_words, and Porter-stem the rest.
def _stem_non_stopwords(text):
    """Return the stemmed tokens of *text* that are not in stop_words."""
    return [pst.stem(tok) for tok in tokenizer_tb.tokenize(text) if tok not in stop_words]


tokens_trn = [_stem_non_stopwords(doc) for doc in train['text']]
tokens_tst = [_stem_non_stopwords(doc) for doc in test['text']]

In [None]:
# Plain tokenization: split each document with the Treebank tokenizer only.
# No stopword filtering or stemming is applied in this cell.
# NOTE: tokens_trn / tokens_tst are rebuilt from scratch here, replacing any
# lists of the same name that were created earlier.
tokens_trn = [tokenizer_tb.tokenize(doc) for doc in train['text']]
tokens_tst = [tokenizer_tb.tokenize(doc) for doc in test['text']]

In [None]:
# Replace the raw strings in the text columns with their token lists.
train['text'] = tokens_trn
test['text'] = tokens_tst

In [None]:
# Pull the columns out as 1-D NumPy arrays.
# Each 'text' row is a token list and the lists differ in length. Building an
# array with np.array() from such ragged lists raises ValueError on
# NumPy >= 1.24, whereas Series.to_numpy() returns a 1-D object array whose
# elements are the original lists.
X_train = train['text'].to_numpy()
X_test = test['text'].to_numpy()
y_train = train['author'].to_numpy()

In [None]:
# Fit a Keras Tokenizer on the training documents and keep its word -> index map.
# NOTE: the Tokenizer is created with default arguments, i.e. no num_words limit.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [None]:
# Number of entries in word_index plus one; printed as the vocabulary size.
max_words = len(word_index) + 1
print( 'unique words are : %d' % max_words)

unique words are : 46611


In [None]:
# Inspect the number of tokens per document in train and test
def _length_stats(docs):
    """Return (max, min, mean, median) of len(doc) over *docs*."""
    lengths = [len(doc) for doc in docs]
    return max(lengths), min(lengths), np.mean(lengths), np.median(lengths)


max_length, min_length, mean_length, median_length = _length_stats(X_train)

# %d truncates the mean and median to whole numbers when printing.
print( 'Train Max length: %d ' % max_length)
# A minimum of 0 would mean some document has no tokens left at this point.
print( 'Train Min length: %d ' % min_length)
print( 'Train Mean length: %d ' % mean_length)
print( 'Train Median length: %d ' % median_length)


max_length_tst, min_length_tst, mean_length_tst, median_length_tst = _length_stats(X_test)

print( 'Test Max length: %d ' % max_length_tst)
print( 'Test Min length: %d ' % min_length_tst)
print( 'Test Mean length: %d ' % mean_length_tst)
print( 'Test Median length: %d ' % median_length_tst)

Train Max length: 474 
Train Min length: 0 
Train Mean length: 42 
Train Median length: 22 
Test Max length: 470 
Test Min length: 29 
Test Mean length: 91 
Test Median length: 71 


# 모델링 및 성능평가

LSTM : 0.4196639302	

In [None]:
# Hyperparameter settings
# vocab_size and embedding_dim are the Embedding layer's dimensions in get_model.
vocab_size = 20000
embedding_dim = 100
# Reassigns max_length (previously the longest training document) to the
# fixed length that pad_sequences pads/truncates every sequence to.
max_length = 50
# 'post' makes pad_sequences add padding at the end of each sequence.
padding_type='post'

In [None]:
# Convert the token lists to integer sequences and pad/truncate them to max_length.
#
# The tokenizer was fitted without a num_words limit, so it would otherwise
# emit indices up to len(word_index). The model's Embedding layer is built
# with input_dim=vocab_size, so the tokenizer is capped here: with num_words
# set, texts_to_sequences only emits indices below vocab_size and drops rarer
# words.
tokenizer.num_words = vocab_size

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [None]:
# Build a (vocabulary size + 1) x 100 matrix of word vectors, one row per
# tokenizer index; rows for words without a vector stay all-zero.
# NOTE(review): embedding_dict is not defined in the visible notebook —
# presumably a word -> vector mapping of pretrained embeddings; confirm it is
# loaded before this cell runs. The resulting matrix is not referenced by the
# visible model code.
num_words = 1 + len(word_index)
embedding_matrix = np.zeros((num_words, 100))

for token, row in word_index.items():
    vector = embedding_dict.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Cross-validation setup: number of folds, number of target classes, and a
# shuffled stratified splitter with a fixed seed.
n_fold = 5
n_class = 5
cv = StratifiedKFold(n_splits = n_fold, shuffle = True, random_state = 42)

In [None]:
def get_model():
    """Build and compile a fresh bidirectional-LSTM classifier.

    Reads the notebook-level settings vocab_size, embedding_dim, max_length
    and n_class. The Embedding layer is created with input_dim=vocab_size,
    so the integer inputs are expected to be below vocab_size.

    Returns:
        A compiled Sequential model: Embedding -> BiLSTM(64, full sequence)
        -> BiLSTM(64) -> softmax Dense(n_class), trained with categorical
        cross-entropy and Adam (learning rate 0.01).
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    # The first recurrent layer returns the whole sequence so that the second
    # recurrent layer can consume it.
    net.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
    net.add(Bidirectional(LSTM(64, dropout=0.2)))
    net.add(Dense(n_class, activation='softmax'))

    net.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=.01))
    return net

In [None]:
# Stratified K-fold training.
# p_val collects out-of-fold predictions for the training rows; p_tst
# accumulates the test predictions averaged over the n_fold models.
p_val = np.zeros((train_padded.shape[0], n_class))
p_tst = np.zeros((test_padded.shape[0], n_class))

# One-hot encode the labels once with an explicit class count, so the target
# width always equals the model's n_class outputs regardless of which labels
# happen to fall into a given slice.
y_onehot = to_categorical(y_train, num_classes=n_class)

for i, (i_trn, i_val) in enumerate(cv.split(train_padded, y_train), 1):
    print(f'training model for CV #{i}')
    clf = get_model()

    # Stop when val_loss has not improved by at least 0.001 for 3 epochs and
    # roll back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf.fit(train_padded[i_trn],
            y_onehot[i_trn],
            validation_data=(train_padded[i_val], y_onehot[i_val]),
            epochs=20,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(train_padded[i_val])
    p_tst += clf.predict(test_padded) / n_fold

training model for CV #1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping
training model for CV #2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping
training model for CV #3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping
training model for CV #4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping
training model for CV #5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping


In [None]:
# submission
# Write the fold-averaged test probabilities into the five class columns.
# NOTE(review): no code in this cell saves the frame to disk — confirm it is
# saved elsewhere.
sample_submission[['0','1','2','3','4']] = p_tst
# Bare expression as the last line of the cell so the notebook shows the frame.
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.122850,0.603022,0.197693,0.065256,0.011178
1,1,0.259682,0.622593,0.008344,0.047605,0.061775
2,2,0.844489,0.044937,0.010351,0.003821,0.096402
3,3,0.180283,0.057147,0.546468,0.003079,0.213024
4,4,0.271554,0.442475,0.065065,0.097653,0.123254
...,...,...,...,...,...,...
19612,19612,0.003257,0.996643,0.000025,0.000057,0.000017
19613,19613,0.100828,0.059387,0.375514,0.050895,0.413375
19614,19614,0.011379,0.985739,0.000704,0.001420,0.000758
19615,19615,0.062077,0.496767,0.017962,0.420880,0.002314
