In [1]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
from helper import *
IDIR = '../input/'
FEATURES_PATH = './features3/'

In [None]:
gc.collect()

## Data Load

In [None]:
#data = pd.read_csv(FEATURES_PATH + "data.csv", dtype= dtype_dict)

In [None]:
data = pd.read_hdf(FEATURES_PATH + "data.h5", "data")
data.reset_index(inplace=True)
print("memory = ", data.memory_usage().sum()/1000000)

In [None]:
data.head()

In [None]:
columns = list(data.columns)
not_features = ['user_id', 'product_id', 'aisle_id',  'user_eval_set', 'up_reordered']
features = list(set(columns) - set(not_features))
print("number of features", len(features))

features

## Product embedding

In [2]:
product_embedding = pd.read_csv(IDIR+"product_embeddings.csv")
product_embedding.drop(['product_name','aisle_id','department_id'], axis = 1, inplace = True)
columns_dict = dict([(str(i), 'pe_'+str(i)) for i in range(32)])
product_embedding.rename(columns=columns_dict, inplace = True)
pe_columns = ['pe_'+str(i) for i in range(32)]
product_embedding[pe_columns] = product_embedding[pe_columns].astype(np.float32)

In [None]:
data = data.merge(product_embedding, on="product_id", how = "left")

In [None]:
print("memory = ", data.memory_usage().sum()/1000000)

In [None]:
data.set_index(['user_id', 'product_id'], inplace = True)

In [None]:
data.to_hdf(FEATURES_PATH + "data_pe.h5", "data", mode = 'a')

# MODEL

In [None]:
#data_features = data[data.user_eval_set == "train"][['user_id', 'product_id']+ features + ["up_reordered"]]
data_features = data[data.user_eval_set == "train"]

tmp = data_features.user_id.unique()
user_train = tmp[0:120000]
user_valid = tmp[120000:]

data_train = data_features[data_features.user_id.isin(user_train)]
data_valid = data_features[data_features.user_id.isin(user_valid)]
print(len(data_train))
print(len(data_valid))

In [None]:
len(data_features)

In [None]:
#data_features.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_features", mode="a")
data_train.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_train", mode="a")
data_valid.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_valid", mode="a")

In [None]:
data_test = data[data.user_eval_set == "test"]
data_test.to_hdf(FEATURES_PATH + "lgb_data.h5", "data_test", mode="a")

## Load data_train, data_valid

In [7]:
features = ['up_first_order',
 'user_reorder_rate',
 'department_id',
 'product_reorder_probability',
 'product_reorder_ratio',
 'up_order_dow_mean',
 'user_total_order',
 'up_add_to_cart_order_relative_mean',
 'dep_reorder_ratio',
 'up_orders_since_last_order',
 'up_orders',
 'user_days_since_prior_order',
 'up_add_to_cart_order_mean',
 'up_order_rate',
 'user_days_since_prior_mean',
 'aisle_reorder_ratio',
 'user_dep_reordered_ratio',
 'user_aisle_reordered_ratio',
 'up_last_order',
 'up_days_since_prior_order_mean',
 'user_dep_ratio',
 'up_order_hour_of_day_mean',
 'user_order_dow',
 'user_order_hour_of_day',
 'is_organic',
 'user_order_size_mean',
 'up_order_rate_since_first_order',
 'up_days_since_last_order',
 'user_aisle_ratio']

features = features + pe_columns

print(len(features))

61


In [3]:
data_train = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_train")
data_valid = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_valid")
print("data_train len", len(data_train))
print("data_valid len", len(data_valid))

#columns = list(data_train.columns)
#not_features = ['user_id', 'product_id', 'up_reordered']
#features = list(set(columns) - set(not_features))
#print("number of features", len(features))
#features

data_train len 7757907
data_valid len 716754


In [4]:
data_train = data_train.merge(product_embedding, on='product_id', how = 'left')
data_valid = data_valid.merge(product_embedding, on='product_id', how = 'left')

In [None]:
gc.collect()

In [8]:
X_train= data_train[features].values
y_train= data_train['up_reordered'].values
X_valid= data_valid[features].values
y_valid= data_valid['up_reordered'].values

In [9]:
del data_train
del data_valid
gc.collect()

89

In [13]:
import h5py
with h5py.File(FEATURES_PATH+ 'data_train_product_embedding.h5', 'w') as hf:
    hf.create_dataset("X_train",  data=X_train)
    hf.create_dataset("y_train",  data=y_train)

with h5py.File(FEATURES_PATH+ 'data_train_product_embedding.h5', 'a') as hf:
    hf.create_dataset("X_valid",  data=X_valid)
    hf.create_dataset("y_valid",  data=y_valid)
   

In [None]:
weight_train = 1 - (1- y_train)*0.8
weight_valid = 1 - (1- y_valid)*0.8

In [15]:
print('formating for lgb')
d_train = lgb.Dataset(X_train, label=y_train, feature_name = features, categorical_feature='department_id')
d_valid = lgb.Dataset(X_valid, label=y_valid, feature_name = features, categorical_feature='department_id', reference=d_train)
#d_train.save_binary(FEATURES_PATH +  'train.bin')
#d_valid.save_binary(FEATURES_PATH +  'valid.bin')

formating for lgb


In [25]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 500,
    'max_depth': 20,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'learning_rate':0.1
}
ROUNDS = 300

In [26]:
print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS)

light GBM train :-)


In [27]:
bst.save_model(FEATURES_PATH + 'lgb/trained_model_pe_2.txt')

In [None]:
#bst = lgb.Booster(model_file=FEATURES_PATH+ 'lgb/trained_model_28_features_categorical.txt')

In [30]:
bst.add_valid(d_valid, "valid")

In [28]:
bst.eval_train()

[('training', 'binary_logloss', 0.22149595873829106, False)]

In [31]:
bst.eval_valid()

[('valid', 'binary_logloss', 0.24388096216685964, False)]

## Train on all dataset

In [None]:
X_all = data_features[features]
y_all = data_features['up_reordered']

In [None]:
dataset_all = lgb.Dataset(X_all, label=y_all)

In [None]:
print('light GBM train :-)')
bst_all = lgb.train(params, dataset_all, ROUNDS)

In [None]:
bst_all.save_model(FEATURES_PATH + 'lgb/trained_model_num_leaves500_ALL.txt')

In [None]:
bst_all.eval_train()

### Validation

In [None]:
def precision(y, y_, correct, has_none=False):
    if y_ > 0:
        if not has_none:
            return correct / y_
        else:
            if y > 0:
                return correct / (y_ + 1)
            else:
                return 1. / (y_ + 1)
    else:
        return 1.0


def recall(y, y_, correct, has_none=False):
    if y > 0:
        return correct / y
    else:
        if has_none or (y_ == 0):
            return 1.0
        else:
            return 0.


def f1(y, y_, correct, has_none=False):
    p = precision(y, y_, correct, has_none)
    r = recall(y, y_, correct, has_none)
    if (p == 0) and (r == 0):
        return 0.
    f1 = 2 * p * r / (p + r)
    return f1


def compute_f1(valid_df, threshold):
    valid_df['y_'] = valid_df['pred'] > threshold
    valid_df['correct'] = (valid_df['y'] == valid_df['y_']) & (valid_df['y_'])
    result = valid_df.groupby('user_id').sum()
    result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)
    return result['f1'].mean()

In [None]:
#bst = lgb.Booster(model_file=FEATURES_PATH+ 'lgb/trained_model_27_features_3.txt')

In [None]:
pred_valid = bst.predict(X_valid)
pred_valid

In [None]:
valid_df = data_valid[['user_id', 'product_id']].copy()
valid_df["y"] = y_valid
valid_df["pred"] = pred_valid
valid_df["y_"] = valid_df["pred"]  >= 0.2
valid_df['correct'] = (valid_df['y'] == valid_df['y_']) & (valid_df['y_'])
valid_df.sort_values(['user_id', 'pred'], ascending=[True, False], inplace = True)
#print("valid log loss = ", -((valid_df["y"]*np.log(valid_df["pred"])+ (1.-valid_df["y"])* np.log(1.- valid_df["pred"]))).mean())

In [None]:
compute_f1(valid_df, 0.20)

In [None]:
valid_df

In [None]:
result = valid_df.groupby('user_id').sum()
result['precision'] = result.apply(lambda row: precision(row['y'], row['y_'], row['correct']), axis=1)
result['recall'] = result.apply(lambda row: recall(row['y'], row['y_'], row['correct']), axis=1)
result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)

print("precision mean = ", result.precision.mean())
print("recall mean = ", result.recall.mean())
print("f1 mean = ", result.f1.mean())

In [None]:
result

In [None]:
#valid_df.to_csv(FEATURES_PATH+ "valid_df.csv")
valid_df.to_hdf(FEATURES_PATH+ "results.h5", "valid_df", mode = "a")

In [None]:
valid_df.head()

## Generate predictions

In [None]:
#bst_all = lgb.Booster(model_file=FEATURES_PATH+ 'trained_model.txt')

In [None]:
data_test = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", "data_test")

In [None]:
print('light GBM predict')
pred_test = bst.predict(data_test[features])
pred_test

In [None]:
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32},
        usecols=["order_id", "user_id", "eval_set"])

test_orders= orders[orders.eval_set == 'test']

In [None]:
prediction = pd.DataFrame()
prediction[['user_id', 'product_id']] = data_test[['user_id', 'product_id']]
prediction['proba'] = pred_test
prediction.sort_values(by=['user_id', 'proba'], ascending=[True, False], inplace=True)
prediction = pd.merge(prediction, test_orders[['order_id', 'user_id']], on="user_id", how='left')

In [None]:
prediction

## Feature importance

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
print('Plot feature importances...')
ax = lgb.plot_importance(bst, max_num_features=50)
plt.show()


### Recommendation using threshold

In [None]:
threshold = 0.20
recommend = prediction[prediction.proba >= threshold].groupby('order_id').product_id.apply(list)

In [None]:
recommend.head()

In [None]:
add_none_df = prediction[prediction.proba >= threshold].groupby('order_id').proba.agg([np.size, np.max])
add_none_df['None'] = (add_none_df['size'] > 0) & (add_none_df['size'] < 4) & (add_none_df['amax'] < 0.46)
add_none_df.head()

In [None]:
recommend_df = pd.DataFrame()
recommend_df["count"] = prediction.groupby('order_id').size()
recommend_df['product_list'] = recommend
recommend_df['none'] = add_none_df['None']


In [None]:
recommend_df.head()

In [None]:
def generate_prediction(row):
    p = row.product_list
    if type(p) == list:
        result = ' '.join([str(x)  for x in p])
        
        if row.none:
            result = 'None ' + result
        
        return result
    else:
        return 'None'

In [None]:
recommend_df['products']= recommend_df.apply(lambda row:  generate_prediction(row) , axis = 1) 

In [None]:
recommend_df['products']

In [None]:
recommend_df['products'].to_csv(FEATURES_PATH +  'lgb/recommend5_none.csv', header = True)

### Recommendation using average user basket

In [None]:
products = []
count = 0
for _,row in test_orders[['user_id', 'order_id']].iterrows():
    count += 1
    if (count)%10000 == 0:
        print(count)    
    
    user_id, order_id = row['user_id'], row['order_id']
    n = int(user_basket_avg.ix[user_id].basket_size_avg)+1
    products.append(list(prediction[prediction.user_id == user_id].product_id[:n]))
    
# create submission
submission = pd.DataFrame()
submission['order_id'] = test_orders['order_id']
submission['products'] = [' '.join([str(x) for x in p]) for p in products]
submission.sort_values(by='order_id', inplace = True)
submission.to_csv(FEATURES_PATH + 'submission.csv', index=False)

## Investigate result

In [None]:
valid_df.head()

In [None]:
tmp

In [None]:
users_none_incorrect = tmp[(tmp.y == 0) &  (tmp.y_ > 0)].user_id

In [None]:
valid_df[valid_df.user_id.isin(users_none_incorrect) & valid_df.y_].pred.mean()

In [None]:
valid_df[valid_df.y_].pred.mean()

In [None]:
print("None predicted correctly = ", len(tmp[(tmp.y == 0) &  (tmp.y_ == 0)]) / len(tmp[tmp.y == 0]))

In [None]:
print("precision mean = ", result.precision.mean())
print("recall mean = ", result.recall.mean())
print("f1 mean = ", result.f1.mean())

In [None]:
p = 0.3934358303379222
r = 0.5387833182445639
2*p*r/(p+r)

In [None]:
print("Not None but predict None = ", len(tmp[(tmp.y > 0) &  (tmp.y_ == 0)]) / len(tmp[tmp.y > 0]))
print("None but predicted not None = ", len(tmp[(tmp.y == 0) &  (tmp.y_ > 0)]) / len(tmp[tmp.y == 0]))

In [None]:
users_valid_not_reorder = tmp[(tmp.y == 0)].reset_index().user_id

In [None]:
data_valid_user_not_reorder = data_valid[data_valid.user_id.isin(users_valid_not_reorder)]

In [None]:
data_valid_user_not_reorder.head()

In [None]:
user_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_info").reset_index()

In [None]:
user_info_user_not_reorder = user_info[user_info.user_id.isin(users_valid_not_reorder)]

In [None]:
user_info_user_not_reorder

In [None]:
trains = pd.read_hdf(IDIR + "input.h5", "trains")
trains = trains.merge(orders, on= "order_id", how="left")

In [None]:
trains[trains.user_id == 188603]

In [None]:
valid_df[valid_df.user_id == 188603]

In [None]:
def f1_score(p,q):
    return 2*p*q/(p+q)

In [None]:
f1_score(1, 0.5)

In [None]:
tmp[tmp.y == 0].y_.mean()

### When there is None, we predict on average 2.8542780748663104 products ==> can be improved

In [None]:
nb_none_false =  len(tmp[(tmp.y == 0) &  (tmp.y_ > 0)]) 
nb_none_true = len(tmp[(tmp.y == 0) &  (tmp.y_ ==  0)]) 

In [None]:
nb_none_false

In [None]:
nb_none_true

In [None]:
nb_none = len(tmp[(tmp.y == 0)])
nb_none

In [None]:
nb_valid=  len(tmp)
nb_valid

In [None]:
total_f1 = 0.38*11209
total_f1

In [None]:
total_f1_improve = nb_none_false*0.67*0.
total_f1_improve

In [None]:
new_f1 = (total_f1 + total_f1_improve)/nb_valid
new_f1

In [None]:
tmp[(tmp.y_ == 1) & (tmp.y == 1) & (tmp.correct == 1)]

In [None]:
tmp[(tmp.y_ == 1) & (tmp.correct == 1)]

In [None]:
tmp[(tmp.y_ == 1) & (tmp.y == 0)]

In [None]:
f1_score(1,1) - f1_score(1/2,1)

In [None]:
f1_score(1,1/2) - f1_score(1/2,1/2)

In [None]:
f1_score(1,1/3) - f1_score(1/2,1/3)

In [None]:
f1_score(1/2,1) - 0

In [None]:
valid_df = data_valid[['user_id', 'product_id']].copy()
valid_df["y"] = y_valid
valid_df["pred"] = pred_valid

In [None]:
valid_df["y_"] = valid_df["pred"]  >= 0.2
valid_df['correct'] = (valid_df['y'] == valid_df['y_']) & (valid_df['y_'])
valid_df.sort_values(['user_id', 'pred'], ascending=[True, False], inplace = True)
valid_df.head()

In [None]:
threshold_none = 0.45
result = valid_df.groupby('user_id').agg({'y':np.sum,'y_':np.sum, 'correct':np.sum, 'pred':np.max}).reset_index()
result['None'] = (result['y_'] > 0) & (result['y_'] < 4) &(result['pred'] < threshold_none)
result['precision'] = result.apply(lambda row: precision(row['y'], row['y_'], row['correct'], row['None']), axis=1)
result['recall'] = result.apply(lambda row: recall(row['y'], row['y_'], row['correct'], row['None']), axis=1)
result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct'], row['None']), axis=1)
print(result.f1.mean())

In [None]:
result.head()

In [None]:
result[(result.y > 0) & (result.correct == 0)]

In [None]:
result[(result.y == 0) & (result.y_ == 0)]

In [None]:
plt.hist(result.f1)

In [None]:
result[(result.y ==0) & (result.y_ > 0) & (result['None'] == False)]

In [None]:
459 + 1777 

In [None]:
len(result[(result.f1 == 0)])

In [None]:
valid_df[valid_df.user_id == 188562]

In [None]:
data_valid[(data_valid.user_id == 188562) & (data_valid.product_id  == 43409)].T