## This model has the following characteristics:
* No feature engineering
* Applying bidirectional Conv1D to raw transactions

In [2]:
#
# Setting for obtaining reproducible results
#

import numpy as np
# import tensorflow as tf
import random as rn

# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(1234)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

#rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.

# session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, 
#                               inter_op_parallelism_threads=1) #원코드: tf.ConfigProto 

# import tensorflow.python.keras.backend as K
# #원코드: from keras import backend as K

# # The below tf.set_random_seed() will make random number generation
# # in the TensorFlow backend have a well-defined initial state.
# # For further details, 
# # see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

# tf.random.set_seed(1234) #원코드: tf.set_random_seed(1234)

# sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf) #원코드: tf.Session, tf.get_default_graph()
# K.set_session(sess)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# import keras

# from keras.preprocessing import sequence
# from keras.preprocessing.text import *

### Read Data

In [4]:
df_train = pd.read_csv('X_train.csv', encoding='cp949')
df_test = pd.read_csv('X_test.csv', encoding='cp949')
y_train = pd.read_csv('y_train.csv').gender
IDtest = df_test.cust_id.unique()

df_train.head(10)

Unnamed: 0,cust_id,tran_date,store_nm,goods_id,gds_grp_nm,gds_grp_mclas_nm,amount
0,0,2007-01-19 00:00:00,강남점,127105,기초 화장품,화장품,850000
1,0,2007-03-30 00:00:00,강남점,342220,니 트,시티웨어,480000
2,0,2007-03-30 00:00:00,강남점,127105,기초 화장품,화장품,3000000
3,0,2007-03-30 00:00:00,강남점,342205,니 트,시티웨어,840000
4,0,2007-03-30 00:00:00,강남점,342220,상품군미지정,기타,20000
5,0,2007-05-13 00:00:00,강남점,127105,기초 화장품,화장품,3000000
6,0,2007-05-13 00:00:00,강남점,321101,디자이너부틱,디자이너,11264000
7,0,2007-05-13 00:00:00,강남점,443204,비지니스군,셔츠,975000
8,0,2007-06-09 00:00:00,강남점,9112,유제품,축산가공,85000
9,0,2007-06-09 00:00:00,강남점,72196,건강식품,건강식품,2340000


In [5]:
df_train.shape, df_test.shape, y_train.shape

((232004, 7), (163558, 7), (3500,))

### Transform Data

In [7]:
df_train.head()

Unnamed: 0,cust_id,tran_date,store_nm,goods_id,gds_grp_nm,gds_grp_mclas_nm,amount
0,0,2007-01-19 00:00:00,강남점,127105,기초 화장품,화장품,850000
1,0,2007-03-30 00:00:00,강남점,342220,니 트,시티웨어,480000
2,0,2007-03-30 00:00:00,강남점,127105,기초 화장품,화장품,3000000
3,0,2007-03-30 00:00:00,강남점,342205,니 트,시티웨어,840000
4,0,2007-03-30 00:00:00,강남점,342220,상품군미지정,기타,20000


In [8]:
df_test.head()

Unnamed: 0,cust_id,tran_date,store_nm,goods_id,gds_grp_nm,gds_grp_mclas_nm,amount
0,3500,2007-01-16 00:00:00,부산본점,735126,기초A,화장품,2465000
1,3500,2007-01-20 00:00:00,부산본점,443227,비지니스군,셔츠,675000
2,3500,2007-03-02 00:00:00,부산본점,461257,트래디셔널Ⅱ,트래디셔널,1840000
3,3500,2007-03-17 00:00:00,부산본점,735126,기초A,화장품,350000
4,3500,2007-03-17 00:00:00,부산본점,245219,트래디셔널,트래디셔널,5014000


In [9]:
max_features = 100000
max_len = 100
emb_dim = 128

# Converts a "gds_grp_nm" to a sequence of indexes in a fixed-size hashing space

# tf.one_hot() applied
X_train = df_train.groupby('cust_id')['gds_grp_nm'].apply(lambda x: [one_hot(i, max_features)[0] for i in x]).values
X_test = df_test.groupby('cust_id')['gds_grp_nm'].apply(lambda x: [one_hot(i, max_features)[0] for i in x]).values

for i in range(X_train.shape[0]):
    x = np.unique(X_train[i])
    y = np.array([])
    for j in range(5):
        y = np.append(y, np.random.choice(x, len(x), replace=False))
    X_train[i] = y    
#    X_train[i] = np.random.choice(x, len(x), replace=False)

for i in range(X_test.shape[0]):
    x = np.unique(X_test[i])
    y = np.array([])
    for j in range(5):
        y = np.append(y, np.random.choice(x, len(x), replace=False))
    X_test[i] = y    
#    X_test[i] = np.random.choice(x, len(x)*5)    
    
# Pads sequences to the same length
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

X_train.shape, X_test.shape

NameError: name 'one_hot' is not defined

In [6]:
y_train.shape

(3500,)

### Build Models

In [None]:
from keras.models import Model
from keras import Input
from keras import layers
from keras.optimizers import RMSprop
from keras.constraints import max_norm
from keras.callbacks import EarlyStopping

# Define the Model & its Architecture
in_f = Input(shape=(max_len,), dtype='int32', name='forward')
x = layers.Embedding(max_features, emb_dim)(in_f)
x = layers.Conv1D(32, 3, activation='elu')(x)
x = layers.MaxPooling1D(3)(x)
x = layers.Conv1D(32, 3, activation='elu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
out = layers.Dense(1, activation='sigmoid')(x)

model = Model(in_f, out)
model.summary()

# Choose the Optimizer and the Cost function
model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['acc'])

# Train the Model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, 
                    validation_split=0.2, callbacks=[EarlyStopping(patience=5)])

plt.plot(history.history["loss"], label="train loss")
plt.plot(history.history["val_loss"], label="validation loss")
plt.legend()
plt.title("Loss")
plt.show()

### Make Submissions

In [None]:
pred = model.predict(X_test)[:,0]
fname = 'submissions.csv'
submissions = pd.concat([pd.Series(IDtest, name="cust_id"), pd.Series(pred, name="gender")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

## End