In [1]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
# 데이터 불러오기
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')

# 결측치 처리
train['Description'] = train.Description.fillna("none").values
test['Description'] = test.Description.fillna("none").values

# y 원핫인코딩
target = train['AdoptionSpeed']
y = to_categorical(target)

In [4]:
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 50000 #200000

In [5]:
# function to clean data
import string
import itertools 
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))

def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    txt = str(text)
    
    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")
    txt = txt.replace("--th", " ")
    
    # More cleaning
    txt = re.sub(r"alot", "a lot", txt)
    txt = re.sub(r"what's", "", txt)
    txt = re.sub(r"What's", "", txt)
    
    
    # Remove urls and emails
    txt = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', txt, flags=re.MULTILINE)
    txt = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', txt, flags=re.MULTILINE)
    
    # Replace words like sooooooo with so
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    
    # Remove punctuation from text
    txt = ''.join([c for c in text if c not in punctuation])
    
    # Remove all symbols
    txt = re.sub(r'[^A-Za-z\s]',r' ',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stop_words])
        
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])
    
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split()])

    return txt

In [6]:
# clean comments
train['Description'] = train['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=False, lemmatization = False))
test['Description'] = test['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=False, lemmatization = False))

In [7]:
tokenizer = Tokenizer(lower=False, filters='',num_words = MAX_NB_WORDS) # 설정
tokenizer.fit_on_texts(train['Description']) # token 번호 지정?

sequences = tokenizer.texts_to_sequences(train['Description']) # train : text -> sequence
test_sequences = tokenizer.texts_to_sequences(test['Description']) # test : text -> sequence

train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # train : padding why? 길이 맞출려고

print('Shape of train data tensor:', train_data.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH) # test : padding

print("max sequence length : {:,}" .format(MAX_SEQUENCE_LENGTH))
nb_words = (np.max(train_data) + 1)
print("# of words :{:,}" .format(nb_words))

Shape of train data tensor: (14993, 100)
# of words :24,752


In [14]:
node_n = 32

from keras.layers.recurrent import LSTM, GRU
model = Sequential()
model.add(Embedding(nb_words,50,input_length=MAX_SEQUENCE_LENGTH))
model.add(GlobalAveragePooling1D())
# model.add(Dense(node_n, activation='relu'))
# model.add(Dense(node_n, activation='relu'))
# model.add(Dense(node_n, activation='relu'))
model.add(Dense(5, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           1237600   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 255       
Total params: 1,237,855
Trainable params: 1,237,855
Non-trainable params: 0
_________________________________________________________________


In [15]:
early_stopping = EarlyStopping(patience = 1)

model.fit(train_data, y, validation_split=0.2, nb_epoch=100, batch_size=128, callbacks=[early_stopping])

Train on 11994 samples, validate on 2999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


<keras.callbacks.History at 0x2168c5ac320>

In [16]:
pred = model.predict(test_data)
pred

array([[0.02822204, 0.17540796, 0.2717901 , 0.25085285, 0.27372706],
       [0.02997983, 0.1812263 , 0.26700017, 0.20136054, 0.3204331 ],
       [0.0257531 , 0.14775589, 0.24293004, 0.20147549, 0.38208547],
       ...,
       [0.04051525, 0.30515024, 0.16184403, 0.21062815, 0.28186235],
       [0.03428663, 0.16891043, 0.22220442, 0.23371549, 0.3408831 ],
       [0.02166584, 0.13252798, 0.17451857, 0.12695326, 0.54433435]],
      dtype=float32)

In [17]:
pred = pred.argmax(axis=-1)
pred

array([4, 4, 4, ..., 1, 4, 4], dtype=int64)

In [18]:
pd.Series(pred).value_counts()

4    2035
2    1122
3     462
1     329
dtype: int64