In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
#http://machinelearningmechanic.com/keras/2018/03/10/keras-regression-with-categorical-variable-embeddings-md.html
#https://www.kaggle.com/rezas26/simple-keras-starter
#https://medium.com/@satnalikamayank12/on-learning-embeddings-for-categorical-data-using-keras-165ff2773fc9

In [2]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Report (rows, columns) for the training table, then the test table.
for frame in (train, test):
    print(frame.shape)

(913000, 4)
(45000, 4)


In [4]:
def convert_dates(x):
    """Replace the 'date' column of ``x`` with integer calendar features.

    Adds 'month', 'year', 'dayofweek' and 'week' columns (in that order) and
    removes 'date'. The frame is modified in place and is also returned.

    Args:
        x: DataFrame with a 'date' column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, without 'date' and with the new columns.
    """
    x['date'] = pd.to_datetime(x['date'])
    stamps = x['date'].dt
    x['month'] = stamps.month
    x['year'] = stamps.year
    x['dayofweek'] = stamps.dayofweek
    if hasattr(stamps, 'isocalendar'):
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is its replacement. It yields a nullable UInt32
        # column, so cast back to the plain int64 the old accessor produced.
        x['week'] = stamps.isocalendar().week.astype('int64')
    else:
        # pandas older than 1.1 has no isocalendar(); keep the old accessor.
        x['week'] = stamps.week
    x.pop('date')
    return x
# Expand the date column into calendar features for both tables (train first).
train, test = map(convert_dates, (train, test))

In [5]:
# Separate predictors from the target without mutating `train`: popping
# 'sales' inside the call removed the column as a side effect, so running
# this cell a second time on the same kernel raised a KeyError.
features = train.drop('sales', axis=1)
target = train['sales']

# Hold out 20% of the rows for validation; random_state fixes the shuffle.
train_x, val_x, train_y, val_y = train_test_split(
    features, target, random_state=123, test_size=0.2
)

In [6]:
# Preview the first five rows of the training predictors.
train_x.head(n=5)

Unnamed: 0,store,item,month,year,dayofweek,week
434364,8,24,5,2017,0,21
619219,10,34,7,2013,3,30
63814,5,4,9,2017,2,39
147213,1,9,2,2016,0,6
255546,10,14,9,2017,4,39


In [7]:
# Every predictor column is treated as a categorical feature.
cat_features = train_x.columns.tolist()

In [8]:
# Display the list of column names collected as categorical features.
cat_features

['store', 'item', 'month', 'year', 'dayofweek', 'week']

In [9]:
def make_categorical_input(df, cat_list, reference=None):
    """Turn categorical columns into a list of integer code arrays.

    For each column in ``cat_list`` the values are shifted down by the
    column minimum when that minimum is greater than 1 (so e.g. calendar
    years start at 0); columns whose minimum is 1 or less are passed
    through unchanged.

    Args:
        df: DataFrame holding the columns to encode.
        cat_list: iterable of column names, in the order the arrays are
            returned.
        reference: optional DataFrame from which the column minimums are
            taken instead of ``df``. Pass the training frame when encoding
            validation/test frames so every frame uses the same shift; with
            the default (None) each frame is shifted by its own minimum,
            which gives inconsistent codes when the minimums differ.

    Returns:
        List with one 1-D numpy array per entry of ``cat_list``.
    """
    base = df if reference is None else reference
    sample_list = []
    for cat in cat_list:
        # Compute the minimum once; it decides both whether and how far to shift.
        lowest = np.min(base[cat])
        if lowest > 1:
            sample_list.append(df[cat].values - lowest)
        else:
            sample_list.append(df[cat].values)

    return sample_list

In [10]:
# Encode each split as a list of per-feature integer arrays, one array per
# entry of cat_features.
# NOTE(review): every call derives its shift from the frame it is given, so a
# split whose column minimum differs from train_x would get different codes
# for the same raw value - verify against the test data.
x_train, x_val, x_test = (
    make_categorical_input(frame, cat_features)
    for frame in (train_x, val_x, test)
)

In [11]:
# As expected, the result is a list of six 1-D arrays, one per feature.
x_train

[array([ 8, 10,  5, ...,  6,  3,  4]),
 array([24, 34,  4, ...,  2, 16, 43]),
 array([ 5,  7,  9, ..., 10, 11,  5]),
 array([4, 0, 4, ..., 1, 0, 3]),
 array([0, 3, 2, ..., 4, 3, 1]),
 array([21, 30, 39, ..., 40, 46, 20])]

In [13]:
from keras.models import Sequential,Model
from keras.layers import Input, Embedding, Reshape, Concatenate, Dense
import keras.backend as K

Using TensorFlow backend.


In [31]:
def custom_smape(x, x_):
    """Symmetric mean absolute percentage error, written as a Keras loss.

    Args:
        x: tensor of true values (y_true).
        x_: tensor of predicted values (y_pred).

    Returns:
        Scalar tensor: the mean of 2*|x - x_| / (|x| + |x_|).
    """
    # K.epsilon() keeps the denominator non-zero: when the target and the
    # prediction are both 0 the plain ratio is 0/0 and the loss becomes NaN.
    denominator = K.abs(x) + K.abs(x_) + K.epsilon()
    return K.mean(2 * K.abs(x - x_) / denominator)

In [37]:
def _embedding_branch(n_levels):
    """Build one integer input and its flattened embedding.

    The embedding width is half the number of levels (rounded up), capped
    at 50; the embedding table has ``n_levels + 1`` rows.
    """
    width = int(min(np.ceil(n_levels / 2), 50))
    branch_input = Input(shape=(1,), dtype='int32')
    embedded = Embedding(n_levels + 1, width, input_length=1)(branch_input)
    # Drop the length-1 sequence axis so the branches can be concatenated.
    return branch_input, Reshape((width,))(embedded)


# One input/embedding branch per categorical feature, in cat_features order.
input_list, models = [], []
for cat in cat_features:
    branch_input, branch_output = _embedding_branch(train_x[cat].nunique())
    input_list.append(branch_input)
    models.append(branch_output)

In [38]:
# Layers of the regression head, created in the order they are applied.
merge_layer = Concatenate()
hidden_layer = Dense(100, activation='relu')
output_layer = Dense(1)

# Join all embedding branches, pass them through one hidden layer and
# finish with a single linear output unit.
full_model = merge_layer(models)
dense1 = hidden_layer(full_model)
predictions = output_layer(dense1)

In [39]:
# Tie the per-feature inputs to the single regression output.
model = Model(input_list, predictions)
model.summary()

# Optimise the SMAPE loss with Adam.
model.compile(optimizer='adam', loss=custom_smape)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_29 (

In [41]:
# Train for 5 epochs; validation_split holds 20% of x_train out of training
# and reports the loss on it after each epoch.
hist = model.fit(
    x=x_train,
    y=train_y,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

Train on 584320 samples, validate on 146080 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
# Compute the compiled loss on the validation split set aside earlier.
model.evaluate(x=x_val, y=val_y)



0.12820987020499866

In [None]:
#Is a loss of 50 measured with MSE the same thing as a loss of 5 measured with MAE, or are they different?