In [None]:
import pylab
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras import backend as K

import IPython.core.display as di
# Inject a small jQuery snippet: when the page body does not carry the
# `notebook_app` class, toggle the visibility of the `.input_area` and
# `.prompt` elements (presumably to hide code in exported HTML -- confirm).
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

In [None]:
# Load the pipe-separated item table and discard the campaignIndex column.
df_items0 = pd.read_csv("DATA/DMC_2017_task/items.csv", sep="|").drop("campaignIndex", axis=1)
# Upper-case pharmForm so differently-cased spellings compare equal.
df_items0["pharmForm"] = df_items0["pharmForm"].str.upper()

In [None]:
def content(df):
    """Replace the textual ``content`` column by a numeric ``Content`` column.

    Each ``content`` value is split on ``'X'`` into at most three factors
    (e.g. ``'2X3X4'``); missing factors count as 1 and the factors are
    multiplied together. Factors that cannot be parsed as numbers make the
    product NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``content``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``content`` and with a trailing ``Content``
        column, on the same index as ``df``.
    """
    # expand=True keeps df's index, so rows stay aligned even when the index
    # is not 0..n-1; `n` must be passed by keyword on current pandas.
    parts = df.content.str.split('X', n=2, expand=True)
    # The number of columns produced depends on the data; always force three.
    parts = parts.reindex(columns=range(3))
    parts.columns = ['x', 'y', 'z']
    parts = parts.fillna('1')
    parts = parts.apply(pd.to_numeric, errors='coerce')

    result = df.drop(["content"], axis=1)
    result['Content'] = parts.x * parts.y * parts.z
    return result

In [None]:
# Derive the numeric Content column, then fill the remaining gaps in
# category and Content with the respective column mean.
df_items1 = content(df_items0)
df_items1["category"] = df_items1["category"].fillna(df_items1["category"].mean())
df_items1["Content"] = df_items1["Content"].fillna(df_items1["Content"].mean())
df_items1.head()

In [None]:
def missing(df):
    """Print row counts before/after dropping NaN rows and show NaN counts per column."""
    print('Size of testing dataset : %d' % len(df))
    complete_rows = df.dropna()
    print('Size of testing dataset DropNA : %d' % len(complete_rows))
    # Per-column count of missing values, rendered through IPython's display.
    null_counts = df.isnull().sum()
    display(null_counts)
    
def onehot2(df):
    """One-hot encode integer codes with ``max(df) + 1`` classes.

    Returns the encoded DataFrame together with its number of columns.
    """
    n_classes = max(df) + 1
    encoded = pd.DataFrame(np_utils.to_categorical(df, n_classes))
    original_dim = encoded.shape[1]
    print('original dimension : %d' % original_dim)
    return encoded, original_dim

def pre_item(df, columns=None):
    """Build the numeric feature matrix for the item table.

    Steps: min-max scale the continuous columns to [0, 1], one-hot encode
    ``manufacturer`` and ``salesIndex`` with ``onehot2``, and dummy-encode
    every remaining non-numeric column with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Item frame containing ``manufacturer``, ``salesIndex`` and the
        continuous columns.
    columns : list of str, optional
        Continuous columns to scale. Defaults to the notebook-level
        ``column_conti`` so existing ``pre_item(df)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Encoded frame carrying ``df``'s index. The one-hot columns are named
        ``manufacturer_<k>`` / ``salesIndex_<k>`` so the two blocks no longer
        share duplicate integer labels.
    """
    if columns is None:
        columns = column_conti
    columns = list(columns)

    # Scale the continuous columns to [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    scaled = pd.DataFrame(scaler.fit_transform(df[columns]), columns=columns)

    # Everything below is combined positionally, hence the index reset.
    remaining = df.drop(columns, axis=1).reset_index(drop=True)
    combined = remaining.join(scaled, how='inner')

    # The second return value (encoding width) is not needed here.
    manufacturer_oh, _ = onehot2(df.manufacturer)
    sales_oh, _ = onehot2(df.salesIndex)
    manufacturer_oh = manufacturer_oh.add_prefix("manufacturer_")
    sales_oh = sales_oh.add_prefix("salesIndex_")

    # `join_axes` no longer exists in pandas; all pieces are in df's row
    # order, so concatenate and put df's index back afterwards.
    merged = pd.concat([combined, manufacturer_oh, sales_oh], axis=1)
    merged.index = df.index
    merged = merged.drop(["manufacturer", "salesIndex"], axis=1)

    encoded = pd.get_dummies(merged)
    print(encoded.shape)
    return encoded

In [None]:
# Continuous item columns that pre_item rescales to [0, 1].
column_conti = ["category", "rrp", "Content"]

# Report remaining missing values, then build the encoded item matrix.
missing(df_items1)
df_items2 = pre_item(df_items1)

In [None]:
def auto(df, dim, epochs=None, batch_size=256, validation_split=0.2):
    """Train a dense autoencoder on ``df`` and return the bottleneck encoding.

    The encoder is Dense 2000 -> 1000 -> 500 -> ``dim`` (all ReLU); the
    decoder mirrors it and ends in a sigmoid layer as wide as the input. The
    model is fitted to reproduce its input with binary cross-entropy.

    Parameters
    ----------
    df : numpy.ndarray
        2-D feature matrix (rows are samples).
    dim : int
        Width of the bottleneck layer, i.e. the encoding dimension.
    epochs : int, optional
        Number of training epochs. Defaults to the notebook-level
        ``nb_epoch`` so existing ``auto(df, dim)`` calls behave as before.
    batch_size : int, optional
        Mini-batch size passed to ``fit``.
    validation_split : float, optional
        Fraction of ``df`` passed to ``fit`` as validation split.

    Returns
    -------
    (pandas.DataFrame, History)
        The bottleneck activations for every row of ``df`` and the object
        returned by ``fit``.
    """
    from keras.layers import Input, Dense
    from keras.models import Model
    from keras import backend as K

    n_epochs = nb_epoch if epochs is None else epochs
    n_features = df.shape[1]
    hidden_widths = (2000, 1000, 500)

    input_img = Input(shape=(n_features,))

    # Encoder: progressively narrower layers ending in the bottleneck.
    encoded = input_img
    for width in hidden_widths:
        encoded = Dense(width, activation='relu')(encoded)
    encoded = Dense(dim, activation='relu')(encoded)

    # Decoder: mirror image of the encoder.
    decoded = encoded
    for width in reversed(hidden_widths):
        decoded = Dense(width, activation='relu')(decoded)
    decoded = Dense(n_features, activation='sigmoid')(decoded)

    autoencoder = Model(input = input_img, output = decoded)
    autoencoder.compile(optimizer = 'adadelta', loss = 'binary_crossentropy')
    history = autoencoder.fit(df, df,
                              nb_epoch = n_epochs,
                              batch_size = batch_size,
                              shuffle = True,
                              validation_split = validation_split)

    # Read the bottleneck tensor directly instead of relying on its position
    # in `autoencoder.layers`.
    encode = K.function([input_img], [encoded])
    df_auto = pd.DataFrame(np.array(encode([df])[0]))
    print('encoding dimension : %d' % (df_auto.shape[1]))
    return df_auto, history

def Plot(train_value, test_value, value_is_loss_or_acc):
    """Plot per-epoch training and validation curves of one metric.

    ``train_value`` and ``test_value`` are lists with one entry per epoch;
    ``value_is_loss_or_acc`` is the metric name used in the labels.
    """
    metric = value_is_loss_or_acc
    fig, axis = plt.subplots()
    # A leading None leaves position 0 empty, so the first epoch is drawn at x = 1.
    for series, fmt in ((train_value, 'o-'), (test_value, 'x-')):
        axis.plot([None] + series, fmt)
    axis.legend(['Train ' + metric, 'Validation ' + metric], loc = 0)
    axis.set_title('Training/Validation ' + metric + ' per Epoch')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(metric)
    plt.show()

In [None]:
# Feature matrix for the autoencoder: every encoded item column except pid,
# relabelled 0..n-1 and converted to a plain array.
X = df_items2.drop(["pid"], axis=1)
X.columns = list(range(X.shape[1]))
X1 = np.array(X)

# Compress the item features to 185 dimensions, then re-attach pid so the
# encoding can be joined to other tables.
nb_epoch = 200
df_items3, his = auto(X1, 185)
df_items3["pid"] = df_items2["pid"]

## training

In [None]:
def preprocessing(df):
    """Derive features and the binary class label for the transaction log.

    Adds ``number`` (revenue / price) and ``dif`` (price - competitorPrice),
    labels rows with ``click == 1`` or ``basket == 1`` as class 0 and rows
    with ``order == 1`` as class 1, and replaces ``adFlag`` and
    ``availability`` by their one-hot encodings from ``onehot2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transaction frame. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Labelled rows sorted by ``lineID``. One-hot columns are named
        ``adFlag_<k>`` / ``availability_<k>`` rather than bare integers, so
        they clash neither with each other nor with integer-named columns
        of a frame merged in later.
    """
    # Copy first so the caller's frame does not silently gain columns.
    df = df.copy()
    df["number"] = df["revenue"] / df["price"]
    df["dif"] = df["price"] - df["competitorPrice"]

    # assign() returns new frames, which avoids writing into filtered slices.
    click = df[df.click == 1].assign(class_label=0)
    basket = df[df.basket == 1].assign(class_label=0)
    order = df[df.order == 1].assign(class_label=1)
    labelled = pd.concat([click, basket, order])

    # onehot2 returns frames in df's row order with a fresh 0..n-1 index;
    # give them df's index so they can be aligned by label.
    ad, _ = onehot2(df.adFlag)
    av, _ = onehot2(df.availability)
    ad = ad.add_prefix("adFlag_")
    av = av.add_prefix("availability_")
    ad.index = df.index
    av.index = df.index

    # Replacement for the removed `join_axes=[labelled.index]` argument.
    combined = pd.concat(
        [labelled, ad.reindex(labelled.index), av.reindex(labelled.index)],
        axis=1)
    combined = combined.drop(["adFlag", "availability"], axis=1)
    # DataFrame.sort no longer exists; sort_values is the replacement.
    return combined.sort_values(["lineID"], ascending=True)

In [None]:
# Load the pipe-separated training log and derive the labelled feature frame.
df_train0 = pd.read_csv("DATA/DMC_2017_task/train.csv", sep = "|")
df_train = preprocessing(df_train0)
df_train.head()

In [None]:
# Report row counts and per-column missing values of the preprocessed training frame.
missing(df_train)

In [None]:
def inner(df, df_item, trainingday):
    """Inner-join the rows of ``df`` with ``day < trainingday`` to ``df_item`` on ``pid``.

    The result is sorted by ``lineID`` then ``pid``; its shape is printed.
    """
    sort_keys = ['lineID', "pid"]
    ordered = df.sort_values(sort_keys, ascending=True)
    before_cutoff = ordered[ordered.day < trainingday]
    df_inner = before_cutoff.merge(df_item, how='inner', on=['pid'])
    df_inner = df_inner.sort_values(sort_keys, ascending=True)
    print(df_inner.shape)
    return df_inner

In [None]:
# Join the training rows with day < 95 to the autoencoded item features.
# The encoded item frame is named `df_items3`; the previous `df_item3`
# was never defined and raised a NameError.
result0 = inner(df_train, df_items3, 95)

In [None]:
# ---- Hold out the last month: rows with day > 65 are set aside ----
hold_result = result0[result0.day > 65].dropna().reset_index(drop=True)

# Rows up to and including day 65 form the training pool.
result = result0[result0.day <= 65].dropna().reset_index(drop=True)

print('Number of hold_result: %d' % len(hold_result))
print('Number of result training: %d' % len(result))
print(result['class_label'].value_counts())
result.head()

In [None]:
def std(df):
    """Drop bookkeeping/outcome columns and min-max scale the ``column_conti`` columns.

    Reads the notebook-level ``column_conti`` list. Returns a frame on a fresh
    0..n-1 index: the untouched columns first, then the scaled ones.
    """
    excluded = ["lineID", "number", "click", "basket", "order", "revenue"]
    df1 = df.drop(excluded, axis=1)

    # Rescale the continuous columns to the [0, 1] range.
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    scaled = pd.DataFrame(scaler.fit_transform(df[column_conti]))
    scaled.columns = column_conti

    # Reset the row index so the positional join with `scaled` lines up.
    untouched = df1.drop(column_conti, axis=1).reset_index(drop=True)
    return untouched.join(scaled, how='inner')

In [None]:
# NOTE(review): this rebinds the notebook-level `column_conti` that an earlier
# cell set for the item table; `std` reads it as a global, and re-running the
# earlier `pre_item` cell after this one would pick up these columns instead.
column_conti = ["day", "pid", "competitorPrice", "price", "dif"]

# Scale the training pool and drop the columns listed inside `std`.
df_st = std(result)

In [None]:
# Preview the first rows of the scaled training frame.
df_st.head()

In [None]:
def split(df):
    """Split ``df`` 80/20 into train/test features and one-hot (2-class) labels.

    Every column except ``class_label`` is a feature. Prints the feature count,
    both partition sizes and the value counts of the first label column of the
    test partition.
    """
    features = np.array(df.drop(["class_label"], axis=1))
    labels = np_utils.to_categorical(np.array(df[["class_label"]]), 2)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0)

    print('Number of Features: %d' % X_train.shape[1])
    print('Number of training: %d' % X_train.shape[0])
    print('Number of testing: %d' % X_test.shape[0])

    # Column "a" is the first column of the one-hot test labels.
    test_labels = pd.DataFrame(y_test, columns=["a", "b"])
    print("===testing dataset label===")
    print(test_labels['a'].value_counts())
    return X_train, X_test, y_train, y_test

In [None]:
# Create the train/test partitions from the scaled training frame.
X_train, X_test, y_train, y_test = split(df_st)

In [None]:
# --- Training hyper-parameters ---
batch_size = 128
nb_classes = 2
nb_epoch = 200

# --- "Image" layout of one sample: img_rows * img_cols values ---
img_rows = 11
img_cols = 18

# --- Convolution settings ---
nb_filters = 32
pool_size = (2, 2)
kernel_size = (3, 3)

In [None]:
# Reshape each flat sample into a single-channel img_rows x img_cols grid.
# The channel axis goes first for the 'th' ordering and last otherwise.
if K.image_dim_ordering() == 'th':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

X_train = X_train.reshape((X_train.shape[0],) + input_shape)
X_test = X_test.reshape((X_test.shape[0],) + input_shape)

In [None]:
# Two 3x3 convolutions -> max-pooling -> dense classifier with softmax output.
model = Sequential([
    # Convolutional feature extractor
    Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                  border_mode='valid',
                  input_shape=input_shape),
    Activation('relu'),
    Convolution2D(nb_filters, kernel_size[0], kernel_size[1]),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    # Classifier head
    Flatten(),
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

In [None]:
# Train the CNN and report loss/accuracy on the test partition.
# NOTE(review): the same (X_test, y_test) arrays serve as validation data
# during fit and as the evaluation set below, so the printed "Test" numbers
# are not from data unseen during training monitoring; `hold_result` is
# not used anywhere in the visible cells.
model.fit(X_train, y_train,
          batch_size = batch_size,
          nb_epoch = nb_epoch,
          verbose = 1,
          validation_data = (X_test, y_test))
# score[0] is the loss, score[1] the accuracy metric requested at compile time.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])