In [1]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
IDIR = '../input/'
FEATURES_PATH = './features3/'

In [156]:
gc.collect()

7547

## Data Load

In [None]:
import pickle
with open(FEATURES_PATH + 'dtypes.pickle', 'rb') as f:
    dtype_dict = pickle.load(f)
dtype_dict

In [None]:
#data = pd.read_csv(FEATURES_PATH + "data.csv", dtype= dtype_dict)

In [16]:
data = pd.read_hdf(FEATURES_PATH + "data.h5", "data")
data.reset_index(inplace=True)
print("memory = ", data.memory_usage().sum()/1000000)

memory =  1211.023803


In [17]:
data.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order,up_days_since_last_order,up_add_to_cart_order_mean,...,user_order_size_mean,product_reorder_ratio,aisle_id,department_id,up_in_same_day_previous_order,up_reordered,aisle_reorder_rate,dep_reorder_rate,user_aisle_reorder_rate,user_dep_reorder_rate
0,1,196,10,1,10,1.0,0.0,1.0,14,1.4,...,5.9,0.77648,77,7,0,1.0,0.685758,0.696882,0.916667,0.916667
1,1,10258,9,2,10,0.9,0.0,1.0,14,3.333333,...,5.9,0.713772,117,19,0,1.0,0.552527,0.612557,0.888889,0.9
2,1,10326,1,5,5,0.1,5.0,0.166667,97,5.0,...,5.9,0.652009,24,4,0,0.0,0.763352,0.692611,0.2,0.2
3,1,12427,10,1,10,1.0,0.0,1.0,14,3.3,...,5.9,0.740735,23,19,0,0.0,0.633501,0.612557,1.0,0.9
4,1,13032,3,2,10,0.3,0.0,0.333333,14,6.333333,...,5.9,0.657158,121,14,0,1.0,0.613882,0.600948,0.666667,0.666667


In [18]:
#data[['user_id', 'product_id']].to_hdf(FEATURES_PATH + "ann_data.h5", "user_product_list", mode = "a")

# MODEL

In [10]:
features = [    
    'up_orders',
    'up_add_to_cart_order_mean',
    'up_order_rate',
    'up_order_rate_since_first_order',
    'up_orders_since_last_order',
    'up_days_since_last_order',
    'up_in_same_day_previous_order',
    
    'user_total_order',
    'user_order_size_mean',
    'user_reorder_rate',
    'user_days_since_last_order',   
    
    'product_reorder_ratio',
    
    'aisle_reorder_rate',    
    'dep_reorder_rate',
]

In [None]:
data_train = data[data.user_eval_set == "train"][['user_id', 'product_id']+ features + ["up_reordered"]]
tmp = data_train.user_id.unique()
user_train = tmp[0:120000]
user_test = tmp[120000:]

X_train= data_train[data_train.user_id.isin(user_train)][features]
y_train= data_train[data_train.user_id.isin(user_train)]['up_reordered']
X_valid= data_train[data_train.user_id.isin(user_test)][features]
y_valid= data_train[data_train.user_id.isin(user_test)]['up_reordered']

In [None]:
data_test = data[data.user_eval_set == "test"][['user_id', 'product_id'] + features]
X_test = data_test[features]

In [None]:
# save to h5
X_train.to_hdf(FEATURES_PATH + "ann_data.h5", "X_train", mode = "w")
y_train.to_hdf(FEATURES_PATH + "ann_data.h5", "y_train", mode = "a")
X_valid.to_hdf(FEATURES_PATH + "ann_data.h5", "X_valid", mode = "a")
y_valid.to_hdf(FEATURES_PATH + "ann_data.h5", "y_valid", mode = "a")
X_test.to_hdf(FEATURES_PATH + "ann_data.h5", "X_test", mode = "a")

In [2]:
# read from h5
X_train = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "X_train")
y_train = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "y_train")
X_valid = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "X_valid")
y_valid = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "y_valid")
X_test = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "X_test")

In [19]:
user_product_list = pd.read_hdf(FEATURES_PATH + "ann_data.h5", "user_product_list")

In [3]:
y_train.value_counts()

0.0    6998578
1.0     759329
Name: up_reordered, dtype: int64

## Generate predictions

In [167]:
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32},
        usecols=["order_id", "user_id", "eval_set"])

test_orders= orders[orders.eval_set == 'test']

In [174]:
prediction = pd.DataFrame()
prediction[['user_id', 'product_id']] = user_product_list.loc[X_test.index]
prediction['proba'] = pred_test
prediction.sort_values(by=['user_id', 'proba'], ascending=[True, False], inplace=True)
prediction = pd.merge(prediction, test_orders[['order_id', 'user_id']], on="user_id", how='left')

ValueError: Wrong number of items passed 2, placement implies 1

In [172]:
user_product_list.loc[X_test.index]

Unnamed: 0,user_id,product_id
120,3,248
121,3,1005
122,3,1819
123,3,7503
124,3,8021
125,3,9387
126,3,12845
127,3,14992
128,3,15143
129,3,16797


### Recommendation using threshold

In [None]:
threshold = 0.20
recommend = prediction[prediction.proba >= threshold].groupby('order_id').product_id.apply(list)

In [None]:
recommend_df = pd.DataFrame()
recommend_df["count"] = prediction.groupby('order_id').size()
recommend_df['product_list'] = recommend
recommend_df['products']= recommend_df.product_list.apply(lambda p: ' '.join([str(x)  for x in p]) if type(p) == list else 'None' ) 

In [None]:
recommend_df.head()

# ANN

In [4]:
import tensorflow as tf

In [95]:
batch_size = 248
total_batch = int(len(X_train)/batch_size)+1
print("total batch", total_batch)

total batch 31282


In [6]:
y_train_onehot = pd.DataFrame(y_train)
y_train_onehot["not_up_reordered"] = 1. - y_train_onehot["up_reordered"]
y_valid_onehot = pd.DataFrame(y_valid)
y_valid_onehot["not_up_reordered"] = 1. - y_valid_onehot["up_reordered"]

In [7]:
X_batches = np.array_split(X_train, total_batch)
Y_batches = np.array_split(y_train_onehot, total_batch)

In [67]:
train_indices = np.array(X_train.index)

In [101]:
def get_batch(indices, batch_number, batch_size):
    i = batch_number
    x_batch = X_train.loc[indices[i*batch_size: (i+1)*batch_size]]
    y_batch = y_train_onehot.loc[indices[i*batch_size: (i+1)*batch_size]]
    return x_batch, y_batch

In [161]:
# Create model
def create_model(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer

In [162]:
d = len(features)
n_input = d # Number of feature
n_hidden_1 = 10 # 1st layer number of features
n_hidden_2 = 10 # 1st layer number of features
n_classes = 2 # Number of classes to predict
learning_rate = 0.001

# tf Graph input
X = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),   
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}

biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}



# Construct model
logits = create_model(X, weights, biases)

#proba
proba = tf.nn.softmax(logits)

#manual cross_entropy
#coefficients = tf.constant([1.0, 1.0])
#eps= tf.constant(value=1e-12)
#cost_weighted =  tf.reduce_mean(-tf.reduce_sum( y*tf.log(proba + eps), reduction_indices=[1])) 

weight_coeff = tf.constant([2.0, 1.0])
y_weighted = tf.multiply(weight_coeff,  y)
cost_weighted = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels = y_weighted))

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels = y))
is_correct =  tf.equal(tf.argmax(proba,1), tf.argmax(y,1))
accuracy =  tf.reduce_mean(tf.cast(is_correct, tf.float32))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()


In [163]:
number_batches = total_batch
training_epochs = 20
display_step = 1
display_valid_step = 5


# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0. 
        avg_cost_weighted = 0.        
        indices = np.random.permutation(train_indices) 

        # Loop over all batches
        for i in range(number_batches):
            batch_x, batch_y = get_batch(indices, i, batch_size)                
                
            # batch_y.shape = (batch_y.shape[0], 1)
            # Run optimization op (backprop) and cost op (to get loss value)
            _,  c = sess.run([optimizer, cost], feed_dict={X: batch_x, y: batch_y})
            # Compute average loss
            avg_cost += c / number_batches
            #avg_cost_weighted += c_weighted/number_batches

        # Display logs per epoch step
        if epoch % display_step == 0:
            print ("Epoch:", epoch+1, "cost= ", avg_cost)
        if (epoch+1) % display_valid_step == 0:
            print("Valid cost:", sess.run( cost, feed_dict={X: X_valid, y: y_valid_onehot}) )
            #print("Valid cost_weighted:", sess.run( cost_weighted, feed_dict={X: X_valid, y: y_valid_onehot}) )
    print("Optimization Finished!")

    # Validation
   
    print("Accuracy:", sess.run( accuracy, feed_dict={X: X_valid, y: y_valid_onehot}) )
    global pred_valid 
    pred_valid = sess.run( proba, feed_dict={X: X_valid}) 
    pred_test = sess.run( proba, feed_dict={X: X_test})

Epoch: 1 cost=  0.0
Epoch: 2 cost=  0.0
Epoch: 3 cost=  0.0
Epoch: 4 cost=  0.0
Epoch: 5 cost=  0.0
Valid cost: 0.247759
Epoch: 6 cost=  0.0
Epoch: 7 cost=  0.0
Epoch: 8 cost=  0.0
Epoch: 9 cost=  0.0
Epoch: 10 cost=  0.0
Valid cost: 0.24698
Epoch: 11 cost=  0.0
Epoch: 12 cost=  0.0
Epoch: 13 cost=  0.0
Epoch: 14 cost=  0.0
Epoch: 15 cost=  0.0
Valid cost: 0.246731
Epoch: 16 cost=  0.0
Epoch: 17 cost=  0.0
Epoch: 18 cost=  0.0
Epoch: 19 cost=  0.0
Epoch: 20 cost=  0.0
Valid cost: 0.24628
Optimization Finished!
Accuracy: 0.909781


In [None]:
pred_valid

## Validation

In [13]:
def precision(y, y_, correct):
    if y_>0:
        return correct/y_
    else:
        return 1.0
        
def recall(y, y_, correct):
    if y>0:
        return correct/y
    else:
        return 1.0

def f1(y,y_, correct):
    p = precision(y, y_, correct)
    r = recall(y, y_, correct)
    if (p == 0) and (r ==0):
        return 0.
    f1 = 2*p*r/(p+r)
    return f1

def compute_f1(valid_df, threshold):
    valid_df['y_'] = valid_df['pred'] > threshold
    valid_df['correct'] = (valid_df['y'] == valid_df['y_']) & (valid_df['y_'])
    result = valid_df.groupby('user_id').sum()
    result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)
    return result['f1'].mean()

In [164]:
valid_df = user_product_list.iloc[X_valid.index].copy()
valid_df["y"] = y_valid
valid_df["pred"] = pred_valid[:,0]
valid_df["y_"] = valid_df["pred"]  >= 0.20
valid_df['correct'] = (valid_df['y'] == valid_df['y_']) & (valid_df['y_'])
valid_df.sort_values(['user_id', 'pred'], ascending=[True, False], inplace = True)
#print("valid log loss = ", -((valid_df["y"]*np.log(valid_df["pred"])+ (1.-valid_df["y"])* np.log(1.- valid_df["pred"]))).mean())

In [166]:
print("valid f1 = ", compute_f1(valid_df, 0.20))

valid f1 =  0.3742306619427898


In [None]:
#result = valid_df.groupby('user_id').sum()
#result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)