In [5]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

In [6]:
# 데이터 불러오기
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')

# 결측치 처리
train['Description'] = train.Description.fillna("none").values
test['Description'] = test.Description.fillna("none").values

# y 원핫인코딩
target = train['AdoptionSpeed']
y = to_categorical(target)

In [24]:
# test -> token
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['Description'])

# 토큰화 된 text -> sequence
sequences = tokenizer.texts_to_sequences(train['Description'])
test_sequences = tokenizer.texts_to_sequences(test['Description'])

# padding (최대길이로)
train_data = pad_sequences(sequences)
MAX_SEQUENCE_LENGTH = train_data.shape[1]
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of train data tensor:', train_data.shape)
nb_words = (np.max(train_data) + 1)
print("max sequence length : {:,}" .format(MAX_SEQUENCE_LENGTH))
print("# of words :{:,}" .format(nb_words))

Shape of train data tensor: (14993, 1233)
max sequence length : 1,233
# of words :21,808


In [13]:
model = Sequential()
model.add(Embedding(nb_words,50,input_length=MAX_SEQUENCE_LENGTH))
model.add(GlobalAveragePooling1D())
model.add(Dense(5, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1233, 50)          1090400   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 255       
Total params: 1,090,655
Trainable params: 1,090,655
Non-trainable params: 0
_________________________________________________________________


In [14]:
early_stopping = EarlyStopping(patience = 1)

model.fit(train_data, y, validation_split=0.2, nb_epoch=100, batch_size=128, callbacks=[early_stopping])

Train on 11994 samples, validate on 2999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<keras.callbacks.History at 0x2066a640240>

In [15]:
pred = model.predict(test_data)
pred

array([[0.02777462, 0.19761194, 0.2782477 , 0.21996589, 0.27639985],
       [0.02751336, 0.19755906, 0.27823168, 0.21956755, 0.2771284 ],
       [0.02798403, 0.19761641, 0.27765197, 0.22068396, 0.27606362],
       ...,
       [0.02888631, 0.19859985, 0.27712375, 0.22103138, 0.2743587 ],
       [0.02799195, 0.19772357, 0.27735287, 0.22030763, 0.27662393],
       [0.02895666, 0.19801924, 0.2773331 , 0.22158435, 0.27410668]],
      dtype=float32)

In [16]:
pred = pred.argmax(axis=-1)
pred

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [22]:
pd.Series(pred).value_counts()

2    3941
4       7
dtype: int64