In [None]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the products whose predicted probability of
# reorder is at least a threshold


import numpy as np
import pandas as pd
import h5py
import gc
IDIR = '../input/'
FEATURES_PATH = './features3/'

In [None]:
# Run a garbage-collection pass before the feature tables are loaded in the cells below.
gc.collect()

## Data Load

In [None]:
import pickle
# Load the object stored in dtypes.pickle (presumably a column -> dtype mapping,
# judging by the commented-out read_csv call in the next cell).
# NOTE(review): pickle.load can execute arbitrary code; only open a dtypes.pickle
# file that was produced by a trusted run of the feature pipeline.
with open(FEATURES_PATH + 'dtypes.pickle', 'rb') as f:
    dtype_dict = pickle.load(f)
# Show the loaded object as the cell output.
dtype_dict

In [None]:
#data = pd.read_csv(FEATURES_PATH + "data.csv", dtype= dtype_dict)

In [None]:
# Read the "data" table from data.h5 and turn its index back into ordinary columns.
data = pd.read_hdf(FEATURES_PATH + "data.h5", "data").reset_index()
# Total bytes used by all columns (and the index), divided by one million.
print("memory = ", data.memory_usage().sum()/1000000)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
#data[['user_id', 'product_id']].to_hdf(FEATURES_PATH + "ann_data.h5", "user_product_list", mode = "a")

# MODEL

In [None]:
# Explicit list of feature column names.
# NOTE(review): `features` is reassigned a few cells below from the columns of
# data_train, so this list has no effect on the matrices built afterwards.
features = ['user_dep_ratio',
 'up_orders',
 'user_order_size_mean',
 'up_add_to_cart_order_mean',
 'up_last_order',
 'up_order_hour_of_day_mean',
 'up_order_rate_since_first_order',
 'user_aisle_reordered_ratio',
 'user_total_order',
 'user_dep_reordered_ratio',
 'user_order_hour_of_day',
 'up_order_rate',
 'user_days_since_prior_mean',
 'up_first_order',
 'product_reorder_probability',
 'up_order_dow_mean',
 'up_add_to_cart_order_relative_mean',
 'user_reorder_rate',
 'user_days_since_prior_order',
 'dep_reorder_ratio',
 'up_days_since_prior_order_mean',
 'up_days_since_last_order',
 'aisle_reorder_ratio',
 'product_reorder_ratio',
 'user_aisle_ratio',
 'up_orders_since_last_order',
 'user_order_dow']


In [None]:
# Read the three stored splits from lgb_data.h5, in train / valid / test order.
data_train, data_valid, data_test = (
    pd.read_hdf(FEATURES_PATH + "lgb_data.h5", split_key)
    for split_key in ("data_train", "data_valid", "data_test")
)

In [None]:
# Store the (user_id, product_id) keys of the train and validation rows as
# index-only frames, appended to lgb_data.h5.
key_columns = ['user_id', 'product_id']
train_keys = data_train[key_columns].set_index(key_columns)
valid_keys = data_valid[key_columns].set_index(key_columns)
train_keys.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_train_index", mode="a")
valid_keys.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_valid_index", mode="a")

In [None]:
# Every column of data_train except the two identifiers and the target
# ('up_reordered', used as y below) is treated as a feature.
columns = list(data_train.columns)
not_features = ['user_id', 'product_id', 'up_reordered']
# Filter in the frame's own column order. A set difference would yield an order
# that depends on string hashing and may differ between kernel sessions, which
# silently permutes the columns of the plain arrays saved for the network.
features = [col for col in columns if col not in not_features]
print("number of features", len(features))
features

In [None]:
# Separate each labelled split into its feature matrix and its reorder target.
target_column = 'up_reordered'
X_train, y_train = data_train[features], data_train[target_column]
X_valid, y_valid = data_valid[features], data_valid[target_column]
# Only the feature columns are taken from the test split.
X_test = data_test[features]

In [None]:
# Append the model inputs and targets to ann_data.h5, one key per object.
for store_key, frame in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_valid", X_valid),
    ("y_valid", y_valid),
    ("X_test", X_test),
):
    frame.to_hdf(FEATURES_PATH + "ann_data.h5", store_key, mode = "a")

In [None]:
# Reload the same five objects from ann_data.h5, in the order they were written.
X_train, y_train, X_valid, y_valid, X_test = (
    pd.read_hdf(FEATURES_PATH + "ann_data.h5", store_key)
    for store_key in ("X_train", "y_train", "X_valid", "y_valid", "X_test")
)

In [None]:
# Number of training rows per target value (class balance).
y_train.value_counts()

## Generate predictions

In [None]:
# Only the three columns needed to map test users to their order ids are read.
orders = pd.read_csv(
    IDIR + 'orders.csv',
    usecols=["order_id", "user_id", "eval_set"],
    dtype={'order_id': np.int32, 'user_id': np.int32},
)

# Rows of orders.csv whose eval_set is 'test'.
test_orders = orders.loc[orders['eval_set'] == 'test']

In [None]:
# Assemble one row per (user_id, product_id) candidate of the test set together
# with its predicted probability.
# NOTE(review): `user_product_list` and `pred_test` are not defined anywhere in
# the visible notebook (the line that would compute `pred_test` in the training
# cell is commented out), so this cell fails on a fresh kernel - confirm where
# both are meant to come from.
prediction = pd.DataFrame()
prediction[['user_id', 'product_id']] = user_product_list.loc[X_test.index]
prediction['proba'] = pred_test
# Within each user, list the most probable products first.
prediction.sort_values(by=['user_id', 'proba'], ascending=[True, False], inplace=True)
# Attach the test order_id of each user (left join, so every prediction row is kept).
prediction = pd.merge(prediction, test_orders[['order_id', 'user_id']], on="user_id", how='left')

In [None]:
# Display the (user, product) rows selected by the test index.
# NOTE(review): `user_product_list` is not defined in the visible notebook.
user_product_list.loc[X_test.index]

### Recommendation using threshold

In [None]:
# Keep the rows whose probability reaches the threshold, then collect the
# remaining product ids of each order into a list.
threshold = 0.20
above_threshold = prediction.loc[prediction['proba'] >= threshold]
recommend = above_threshold.groupby('order_id')['product_id'].apply(list)

In [None]:
# One row per order: number of candidate rows, the retained product list, and
# the space-separated string form of that list.
recommend_df = pd.DataFrame()
recommend_df["count"] = prediction.groupby('order_id').size()
recommend_df['product_list'] = recommend
# Orders with no product above the threshold have no list here and get 'None'.
recommend_df['products'] = [
    ' '.join(str(product) for product in basket) if type(basket) == list else 'None'
    for basket in recommend_df['product_list']
]

In [None]:
# Preview the first recommendation rows.
recommend_df.head()

# Feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

In [None]:
# Scaler with default settings; it is fitted on the training features in the next cell.
min_max_scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only and scale them in the same call.
X_train_minmax = min_max_scaler.fit_transform(X_train)

In [None]:
# Apply the scaling learned on the training set (no refit on validation data).
X_valid_minmax = min_max_scaler.transform(X_valid)

In [None]:
# Apply the scaling learned on the training set (no refit on test data).
X_test_minmax = min_max_scaler.transform(X_test)

In [None]:
# Mode 'w' recreates ann_data_np.h5, then each scaled matrix is stored under its own name.
with h5py.File(FEATURES_PATH + 'ann_data_np.h5', 'w') as hf:
    for dataset_name, scaled in (
        ("X_train_minmax", X_train_minmax),
        ("X_valid_minmax", X_valid_minmax),
        ("X_test_minmax", X_test_minmax),
    ):
        hf.create_dataset(dataset_name,  data=scaled)

In [None]:
# Mode 'a' keeps the datasets already in the file and adds the two target arrays.
with h5py.File(FEATURES_PATH + 'ann_data_np.h5', 'a') as hf:
    for dataset_name, labels in (("y_train_value", y_train), ("y_valid_value", y_valid)):
        hf.create_dataset(dataset_name,  data=labels.values)

### Load

In [None]:
# Read the scaled train/validation features and their targets fully into memory
# (the [:] slice copies each dataset out of the file before it is closed).
with h5py.File(FEATURES_PATH+ 'ann_data_np.h5', 'r') as hf:
    X_train_values, X_valid_values, y_train_values, y_valid_values = (
        hf[dataset_name][:]
        for dataset_name in ('X_train_minmax', 'X_valid_minmax', 'y_train_value', 'y_valid_value')
    )

In [None]:
# Reload the stored validation index frame and turn its index levels back into columns.
data_valid_index = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_valid_index").reset_index()

# ANN

In [None]:
import tensorflow as tf

In [None]:
batch_size = 248
# Ceiling division: the last batch may be shorter than batch_size but is never
# empty. Adding 1 to the floored quotient would create an extra, empty batch
# whenever the sample count is an exact multiple of batch_size, and the
# batch-mean cost of an empty batch is undefined.
total_batch = (len(X_train_values) + batch_size - 1) // batch_size
print("total batch", total_batch)

In [None]:
#y_train_onehot = pd.DataFrame(y_train)
#y_train_onehot["not_up_reordered"] = 1. - y_train_onehot["up_reordered"]
#y_valid_onehot = pd.DataFrame(y_valid)
#y_valid_onehot["not_up_reordered"] = 1. - y_valid_onehot["up_reordered"]

In [None]:
# old version, load from pandas

#train_indices = np.array(X_train.index)
#def get_batch(indices, batch_number, batch_size):
#    i = batch_number
#    x_batch = X_train.loc[indices[i*batch_size: (i+1)*batch_size]]
#    y_batch = y_train_onehot.loc[indices[i*batch_size: (i+1)*batch_size]]
#    return x_batch, y_batch

In [None]:
def one_hot(y):
    """Encode 0/1 labels as a two-column float array.

    Column 0 holds the label itself and column 1 holds ``1 - label``, so a
    positive label becomes ``[1, 0]`` and a negative one ``[0, 1]``.

    Parameters
    ----------
    y : 1-D numpy array of 0/1 labels.

    Returns
    -------
    numpy.ndarray of shape ``(len(y), 2)`` and dtype float64.
    """
    labels = np.asarray(y, dtype=np.float64)
    return np.stack((labels, 1 - labels), axis=1)

In [None]:
# Two-column targets: column 0 is the reorder label, column 1 its complement.
y_train_onehot_values = one_hot(y_train_values)
y_valid_onehot_values = one_hot(y_valid_values)

In [None]:
# Per-column multipliers for the one-hot targets: 0.8 for column 0 (reordered),
# 0.2 for column 1 (not reordered).
class_weights = [0.8, 0.2]

In [None]:
# Scale the one-hot targets column-wise by class_weights; these scaled targets
# are what the training loop feeds to the cost.
# The targets are rebuilt from the raw labels on every run, so executing this
# cell again does not apply the weights a second time.
y_train_onehot_values = one_hot(y_train_values)*class_weights
y_valid_onehot_values = one_hot(y_valid_values)*class_weights

In [None]:
def shuffle_data():
    """Return the training features and targets in a new random order.

    Reads the module-level ``X_train_values`` and ``y_train_onehot_values`` and
    indexes both with the same random permutation, so rows stay paired.

    Returns
    -------
    tuple ``(X_, y_)`` of the permuted feature and target arrays.
    """
    order = np.random.permutation(len(X_train_values))
    return X_train_values[order], y_train_onehot_values[order]

In [None]:
def get_batch(i, batch_size, X, y):
    """Return the i-th consecutive slice of ``batch_size`` rows from X and y.

    Parameters
    ----------
    i : zero-based batch number.
    batch_size : number of rows per batch.
    X, y : sliceable sequences of equal length.

    Returns
    -------
    tuple ``(x_batch, y_batch)``; shorter (possibly empty) when the slice runs
    past the end of the data.
    """
    window = slice(i * batch_size, (i + 1) * batch_size)
    return X[window], y[window]

In [None]:
# Create model
def create_model(x, weights, biases):
    """Build a feed-forward network with two ReLU hidden layers.

    Parameters
    ----------
    x : input tensor.
    weights : dict with the weight matrices under 'h1', 'h2' and 'out'.
    biases : dict with the bias vectors under 'b1', 'b2' and 'out'.

    Returns
    -------
    The output-layer tensor without any activation (logits).
    """
    hidden = x
    # Each hidden layer is an affine transform followed by ReLU.
    for weight_key, bias_key in (('h1', 'b1'), ('h2', 'b2')):
        affine = tf.add(tf.matmul(hidden, weights[weight_key]), biases[bias_key])
        hidden = tf.nn.relu(affine)
    # Output layer with linear activation
    return tf.matmul(hidden, weights['out']) + biases['out']

In [None]:
# Build the computation graph: placeholders, trainable variables, cost,
# accuracy and the optimizer step.
# NOTE(review): tf.placeholder / tf.random_normal / tf.train.AdamOptimizer look
# like the TensorFlow 1.x graph API - confirm the installed version.
d = X_train_values.shape[1]
print("number of features = ", d)
n_input = d # Number of input features
n_hidden_1 = 20 # 1st layer number of units
n_hidden_2 = 20 # 2nd layer number of units

n_classes = 2 # Number of classes to predict
# Scalar placeholder, so the learning rate is supplied through feed_dict on each run.
learning_rate = tf.placeholder(tf.float32, shape=[])

# tf Graph input
X = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])

# Trainable weight matrices, initialised from tf.random_normal.
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}

# Trainable bias vectors, initialised the same way.
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}


# Construct model
logits = create_model(X, weights, biases)

# Class probabilities; column 0 corresponds to the reorder label (see one_hot).
proba = tf.nn.softmax(logits)

#manual cross_entropy
#coefficients = tf.constant([1.0, 1.0])
#eps= tf.constant(value=1e-12)
#cost_weighted =  tf.reduce_mean(-tf.reduce_sum( y*tf.log(proba + eps), reduction_indices=[1])) 

#weight_coeff = tf.constant([0.8, 0.2])
#y_weighted = tf.multiply(weight_coeff,  y)
#cost_weighted = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels = y_weighted))

# Define loss and optimizer
# The labels fed through `y` are the class-weighted one-hot targets built earlier.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels = y))
# A row counts as correct when the largest probability and the largest target
# entry fall in the same column.
is_correct =  tf.equal(tf.argmax(proba,1), tf.argmax(y,1))
accuracy =  tf.reduce_mean(tf.cast(is_correct, tf.float32))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()


In [None]:
gc.collect()

number_batches = total_batch
training_epochs = 20
display_step = 1          # print the training cost every `display_step` epochs
display_valid_step = 5    # print the validation cost every `display_valid_step` epochs


# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    lr = 0.0001

    # Training cycle
    for epoch in range(training_epochs):
        # Decay before logging so the printed value is exactly the rate fed to
        # the optimizer during this epoch.
        lr = lr*0.98
        print("lr = ", lr)
        avg_cost = 0.

        # New sample order for every epoch.
        X_, y_ = shuffle_data()
        # One collection per epoch; collecting on every batch only slows training.
        gc.collect()

        # Loop over all batches
        for i in range(number_batches):
            batch_x, batch_y = get_batch(i, batch_size, X_, y_)
            if len(batch_x) == 0:
                # The batch count may include one empty trailing slice when the
                # sample count is an exact multiple of batch_size; the mean
                # cost of an empty batch is undefined, so skip it.
                continue

            # Run optimization op (backprop) and cost op (to get loss value)
            _,  c = sess.run([optimizer, cost], feed_dict={X: batch_x, y: batch_y, learning_rate: lr})
            # Compute average loss
            avg_cost += c / number_batches

        # Display logs per epoch step
        if epoch % display_step == 0:
            print ("Epoch:", epoch+1, "cost= ", avg_cost)
        if (epoch+1) % display_valid_step == 0:
            print("Valid cost:", sess.run( cost, feed_dict={X: X_valid_values, y: y_valid_onehot_values}) )
    print("Optimization Finished!")

    # Validation
    print("Accuracy:", sess.run( accuracy, feed_dict={X: X_valid_values, y: y_valid_onehot_values}) )
    # Predicted class probabilities on the validation set; the name stays
    # available to later cells because this runs at notebook top level.
    pred_valid = sess.run( proba, feed_dict={X: X_valid_values})
    # NOTE(review): the prediction cell earlier in the notebook expects
    # `pred_test`, which is never computed while the line below stays commented
    # out; `X_test_values` is also not loaded anywhere in the visible notebook.
    #pred_test = sess.run( proba, feed_dict={X: X_test_values})

In [None]:
# Display the validation probabilities produced by the training cell.
pred_valid

## Validation

In [1]:
def precision(y, y_, correct):
    """Fraction of predicted items that are correct.

    Parameters
    ----------
    y : number of true items (not used here; kept for a uniform signature).
    y_ : number of predicted items.
    correct : number of predicted items that are also true.

    Returns
    -------
    ``correct / y_``, or 1.0 when nothing was predicted.
    """
    return correct/y_ if y_ > 0 else 1.0
        
def recall(y, y_, correct):
    """Fraction of true items that were predicted.

    Parameters
    ----------
    y : number of true items.
    y_ : number of predicted items (not used here; kept for a uniform signature).
    correct : number of predicted items that are also true.

    Returns
    -------
    ``correct / y``, or 1.0 when there is no true item.
    """
    if not y > 0:
        return 1.0
    return correct/y

def f1(y,y_, correct):
    """Harmonic mean of precision and recall for one group of items.

    Parameters
    ----------
    y : number of true items.
    y_ : number of predicted items.
    correct : number of predicted items that are also true.

    Returns
    -------
    ``2*p*r/(p+r)``, or 0. when both precision and recall are zero.
    """
    prec = precision(y, y_, correct)
    rec = recall(y, y_, correct)
    # The harmonic mean is only defined when at least one of the two is non-zero.
    if prec != 0 or rec != 0:
        return 2*prec*rec/(prec+rec)
    return 0.

def compute_f1(valid_df, threshold):
    """Mean per-user F1 score at a given probability threshold.

    Parameters
    ----------
    valid_df : DataFrame with the columns 'user_id', 'y' (true 0/1 label) and
        'pred' (predicted probability). It is not modified.
    threshold : a row counts as predicted when ``pred >= threshold``, the same
        comparison the notebook uses when it builds the recommendations.

    Returns
    -------
    The mean over users of ``f1(true count, predicted count, correct count)``.
    """
    # Work on a private copy so the caller's frame keeps its own columns.
    scored = valid_df[['user_id', 'y', 'pred']].copy()
    scored['y_'] = scored['pred'] >= threshold
    # Correct = predicted and actually reordered.
    scored['correct'] = (scored['y'] == scored['y_']) & (scored['y_'])
    # Per-user counts of true, predicted and correct items.
    result = scored.groupby('user_id')[['y', 'y_', 'correct']].sum()
    result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)
    return result['f1'].mean()

In [2]:
# Sanity check: with no true items and no predicted items, precision and recall
# both default to 1.0, so the score is 1.0.
f1(0,0,0)

1.0

In [None]:
# Reload the test split (same call as in the MODEL section), so this part can
# run without re-executing the earlier cells.
data_test = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_test")

In [None]:
# Reload the validation split (same call as in the MODEL section); its
# user_id / product_id columns are used to label the predictions below.
data_valid = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_valid")

In [None]:
# One row per validation (user, product) pair: true label, predicted reorder
# probability (column 0 of the softmax output), thresholded prediction and hit flag.
# The arrays are attached by position, so they must be in data_valid's row order.
valid_df = (
    data_valid[['user_id', 'product_id']]
    .assign(y=y_valid_values)
    .assign(pred=pred_valid[:,0])
    .assign(y_=lambda frame: frame["pred"]  >= 0.20)
    .assign(correct=lambda frame: (frame['y'] == frame['y_']) & (frame['y_']))
    .sort_values(['user_id', 'pred'], ascending=[True, False])
)
#print("valid log loss = ", -((valid_df["y"]*np.log(valid_df["pred"])+ (1.-valid_df["y"])* np.log(1.- valid_df["pred"]))).mean())

In [None]:
print("valid f1 = ", compute_f1(valid_df, 0.20))

In [None]:
#result = valid_df.groupby('user_id').sum()
#result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)