In [None]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the products whose predicted probability of
# reorder is at least a threshold


import gc
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
# Directory the raw input CSVs are read from (orders.csv is loaded from here).
IDIR = '../input/'
# Directory holding the feature store (data.h5) and every artefact this
# notebook writes: lgb_data.h5, the trained model and the submission files.
FEATURES_PATH = './features3/'

In [None]:
# Force a garbage-collection pass before the large feature table is loaded.
gc.collect()

## Data Load

In [None]:
#data = pd.read_csv(FEATURES_PATH + "data.csv", dtype= dtype_dict)

In [None]:
# Read the feature table from the HDF5 store and turn its index back into
# ordinary columns, then report the in-memory size (bytes / 1e6).
data = pd.read_hdf(FEATURES_PATH + "data.h5", "data").reset_index()
print("memory = ", data.memory_usage().sum()/1000000)

In [None]:
# Preview the first rows of the loaded feature table.
data.head()

In [None]:
# Count missing values per column.
data.isnull().sum()

# MODEL

In [None]:
# Columns fed to the model. The list order is the column order of the training
# matrix, so it must stay the same between training and prediction.
features = [    
    # presumably user x product ("up_") features - TODO confirm naming convention
    'up_orders',
    'up_add_to_cart_order_mean',
    'up_order_rate',
    'up_order_rate_since_first_order',
    'up_orders_since_last_order',
    'up_days_since_last_order',
    'up_in_same_day_previous_order',
    
    # user-level features
    'user_total_order',
    'user_order_size_mean',
    'user_reorder_rate',
    'user_days_since_last_order',   
    
    # product-level feature
    'product_reorder_ratio',   
    
    # aisle / department ("dep") features, global and per user
    'aisle_reorder_ratio',
    'user_aisle_reordered_ratio',    
    'dep_reorder_ratio',
    'user_dep_reordered_ratio'
]



In [None]:
# Rows of the "train" users only, restricted to ids, model features and label.
data_features = data.loc[
    data.user_eval_set == "train",
    ['user_id', 'product_id'] + features + ["up_reordered"],
]

# Split by user, not by row, so a user never appears on both sides:
# the first 120000 distinct users train the model, the rest validate it.
tmp = data_features.user_id.unique()
user_train, user_valid = tmp[:120000], tmp[120000:]

data_train = data_features[data_features.user_id.isin(user_train)]
data_valid = data_features[data_features.user_id.isin(user_valid)]

In [None]:
# Cache both splits in one HDF5 file so the modelling cells can restart from
# here. `key` is passed by keyword: recent pandas deprecates (and later
# rejects) positional arguments after the path in to_hdf.
data_train.to_hdf(FEATURES_PATH + "lgb_data.h5", key="data_train", mode="a")
data_valid.to_hdf(FEATURES_PATH + "lgb_data.h5", key="data_valid", mode="a")

In [None]:
# Feature rows of the "test" users (no label column), cached next to the
# train/valid splits. `key` is passed by keyword: recent pandas deprecates
# (and later rejects) positional arguments after the path in to_hdf.
data_test = data[data.user_eval_set == "test"][['user_id', 'product_id']+ features]
data_test.to_hdf(FEATURES_PATH + "lgb_data.h5", key="data_test", mode="a")

## Load data_train, data_valid

In [None]:
# Reload the cached splits from the shared HDF5 store.
data_train = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", key="data_train")
data_valid = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", key="data_valid")

In [None]:
# Row counts (one row per user/product pair) of the training and validation splits.
print(len(data_train))
print(len(data_valid))

In [None]:
# Separate the feature matrix from the label column for each split.
X_train, y_train = data_train[features], data_train['up_reordered']
X_valid, y_valid = data_valid[features], data_valid['up_reordered']

In [None]:
# Wrap both splits as LightGBM Datasets. The validation set is created with
# the training set as its `reference`.
print('formating for lgb')
d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, label=y_valid, reference=d_train)
# Optional: dump the Datasets to binary files (currently disabled).
#d_train.save_binary(FEATURES_PATH +  'train.bin')
#d_valid.save_binary(FEATURES_PATH +  'valid.bin')

In [None]:
# Force a garbage-collection pass before training starts.
gc.collect()

In [None]:
# LightGBM settings: gradient-boosted trees on a binary objective, evaluated
# with log loss; column sub-sampling per tree and row bagging every 5 rounds.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=200,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)
# Number of boosting rounds.
ROUNDS = 100

print('light GBM train :-)')
bst = lgb.train(params, d_train, num_boost_round=ROUNDS)

In [None]:
# Persist the trained booster; the prediction section reloads it from this file.
bst.save_model(FEATURES_PATH+ 'trained_model.txt')

In [None]:
# Attach the validation Dataset under the name "valid1" so eval_valid() can score it.
bst.add_valid(d_valid, "valid1")

In [None]:
# Evaluate the configured metric on the training data.
bst.eval_train()

In [None]:
# Evaluate the configured metric on the attached validation data.
bst.eval_valid()

### Validation

In [None]:
def precision(y, y_, correct):
    """Share of predicted positives that were right.

    y_      -- number of predicted positives
    correct -- number of predicted positives that are true positives
    y       -- number of actual positives (unused; kept so that precision,
               recall and f1 share one signature)

    Returns 1.0 when nothing was predicted (y_ not greater than 0).
    """
    return correct/y_ if y_ > 0 else 1.0
        
def recall(y, y_, correct):
    """Share of actual positives that were predicted.

    y       -- number of actual positives
    correct -- number of actual positives that were predicted
    y_      -- number of predicted positives (unused; kept so that precision,
               recall and f1 share one signature)

    Returns 1.0 when there was nothing to find (y not greater than 0).
    """
    return correct/y if y > 0 else 1.0

def f1(y,y_, correct):
    """Harmonic mean of precision and recall for one set of counts.

    y       -- number of actual positives
    y_      -- number of predicted positives
    correct -- number of true positives

    Returns 0. when precision and recall are both zero (the harmonic mean
    would otherwise be 0/0).
    """
    p, r = precision(y, y_, correct), recall(y, y_, correct)
    if p == 0 and r == 0:
        return 0.
    return 2*p*r/(p+r)

def compute_f1(valid_df, threshold):
    """Return the mean per-user F1 score of thresholded predictions.

    Parameters
    ----------
    valid_df : pandas.DataFrame
        Needs the columns 'user_id', 'y' (0/1 label) and 'pred' (predicted
        probability). The frame is left untouched; any other columns are
        ignored.
    threshold : float
        A row counts as predicted when ``pred >= threshold``, the same rule
        the notebook applies when it builds the recommendations.

    Returns
    -------
    float
        Mean over users of the per-user F1. A user with no predicted rows
        gets precision 1.0 and a user with no positive labels gets recall
        1.0, so "nothing predicted, nothing reordered" scores 1.0. NaN is
        returned for an empty frame.
    """
    predicted = valid_df['pred'] >= threshold
    hits = (valid_df['y'] == predicted) & predicted

    # Work on a private copy so the caller's frame gains no helper columns.
    scored = valid_df[['user_id', 'y']].copy()
    scored['y_'] = predicted
    scored['correct'] = hits

    counts = scored.groupby('user_id')[['y', 'y_', 'correct']].sum()
    if len(counts) == 0:
        return float('nan')

    # Explicit float arrays keep the divisions true divisions everywhere.
    n_true = counts['y'].values.astype(float)
    n_pred = counts['y_'].values.astype(float)
    n_hit = counts['correct'].values.astype(float)

    # Precision / recall default to 1.0 where their denominator is zero.
    p = np.ones_like(n_hit)
    np.divide(n_hit, n_pred, out=p, where=n_pred > 0)
    r = np.ones_like(n_hit)
    np.divide(n_hit, n_true, out=r, where=n_true > 0)

    # F1 is 0 where precision and recall are both 0 (p + r == 0).
    denom = p + r
    scores = np.zeros_like(n_hit)
    np.divide(2 * p * r, denom, out=scores, where=denom > 0)
    return scores.mean()

In [None]:
# Model scores for every validation row (binary objective); shown as the cell output.
pred_valid = bst.predict(X_valid)
pred_valid

In [None]:
# One row per validation user/product pair with the label ("y"), the model
# score ("pred"), the thresholded decision ("y_") and a true-positive flag.
valid_df = data_valid[['user_id', 'product_id']].copy()
valid_df["y"] = y_valid
valid_df["pred"] = pred_valid
valid_df["y_"] = valid_df["pred"].ge(0.20)
valid_df['correct'] = valid_df['y_'] & valid_df['y'].eq(valid_df['y_'])
# Order each user's products from most to least likely.
valid_df = valid_df.sort_values(['user_id', 'pred'], ascending=[True, False])
#print("valid log loss = ", -((valid_df["y"]*np.log(valid_df["pred"])+ (1.-valid_df["y"])* np.log(1.- valid_df["pred"]))).mean())

In [None]:
# Per-user totals: summing the columns turns "y", "y_" and "correct" into
# counts of actual positives, predicted positives and true positives.
result = valid_df.groupby('user_id').sum()
# Per-user F1 computed from those three counts.
result['f1'] = result.apply(lambda row: f1(row['y'], row['y_'], row['correct']), axis=1)

In [None]:
# Users for whom no product reached the threshold (zero predicted positives).
result[result.y_ == 0]

In [None]:
# Mean per-user F1 on the validation users at the 0.20 cut-off.
compute_f1(valid_df, 0.20)

## Generate predictions

In [None]:
# Reload the booster from the model file written by save_model above.
bst = lgb.Booster(model_file=FEATURES_PATH+ 'trained_model.txt')

In [None]:
# Reload the cached test-user feature rows from the shared HDF5 store.
data_test = pd.read_hdf(FEATURES_PATH + "lgb_data.h5", key="data_test")

In [None]:
# Score every test user/product row with the reloaded model; the score
# array is shown as the cell output.
print('light GBM predict')
pred_test = bst.predict(data_test[features])
pred_test

In [None]:
# Only the three columns needed to map each test user to their order_id.
orders = pd.read_csv(
    IDIR + 'orders.csv',
    usecols=["order_id", "user_id", "eval_set"],
    dtype={'order_id': np.int32, 'user_id': np.int32},
)

test_orders = orders.loc[orders.eval_set == 'test']

In [None]:
# Attach the model score to each test user/product pair, rank each user's
# products from most to least likely, then add the user's test order_id.
prediction = data_test[['user_id', 'product_id']].copy()
prediction['proba'] = pred_test
prediction = prediction.sort_values(by=['user_id', 'proba'], ascending=[True, False])
prediction = prediction.merge(test_orders[['order_id', 'user_id']], on="user_id", how='left')

In [None]:
# Display the ranked predictions (user_id, product_id, proba, order_id).
prediction

## Feature importance

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Bar chart of the booster's feature importances (at most 20 features shown).
print('Plot feature importances...')
ax = lgb.plot_importance(bst, max_num_features=20)
plt.show()


### Recommendation using threshold

In [None]:
# Keep every product whose score reaches the cut-off and collect them into
# one list per order. Orders with no qualifying product are absent from the
# result.
threshold = 0.20
recommend = (
    prediction.loc[prediction.proba >= threshold]
    .groupby('order_id')['product_id']
    .apply(list)
)

In [None]:
# One row per test order: number of candidate products, the recommended
# product list, and its submission string. Orders without any recommended
# product have no list and are written as the literal 'None'.
recommend_df = pd.DataFrame()
recommend_df["count"] = prediction.groupby('order_id').size()
recommend_df['product_list'] = recommend
recommend_df['products'] = recommend_df.product_list.map(
    lambda p: ' '.join(map(str, p)) if isinstance(p, list) else 'None'
)

In [None]:
# Preview the first recommendation rows.
recommend_df.head()

In [None]:
# Write the threshold-based submission (order_id index + 'products' column).
# to_csv does not create missing directories and nothing else in this
# notebook creates the 'lgb/' sub-folder, so make sure it exists first.
recommend_dir = FEATURES_PATH + 'lgb/'
if not os.path.isdir(recommend_dir):
    os.makedirs(recommend_dir)
recommend_df['products'].to_csv(recommend_dir + 'recommend.csv', header = True)

### Recommendation using average user basket

In [None]:
# For every test order, recommend the user's top-n products by predicted
# probability, where n is the user's average basket size rounded down, plus one.
#
# NOTE(review): `user_basket_avg` is not defined anywhere in the visible
# notebook. It is expected to be indexed by user_id and to expose a
# `basket_size_avg` column - TODO confirm where it comes from and load it
# before this cell, otherwise this cell fails on a fresh kernel.

# `prediction` is already sorted by descending probability within each user,
# and groupby keeps that row order, so each list is ranked best-first.
# Building the lists once avoids re-filtering the whole frame for every order.
ranked_products = prediction.groupby('user_id')['product_id'].apply(list).to_dict()

products = []
for count, user_id in enumerate(test_orders['user_id'], 1):
    if count % 10000 == 0:
        print(count)

    # .loc replaces the removed .ix indexer (label lookup by user_id).
    n = int(user_basket_avg.loc[user_id].basket_size_avg) + 1
    products.append(ranked_products.get(user_id, [])[:n])

# create submission
submission = pd.DataFrame()
submission['order_id'] = test_orders['order_id']
# An empty basket is written as 'None', like in the threshold-based submission.
submission['products'] = [' '.join([str(x) for x in p]) if p else 'None' for p in products]
submission = submission.sort_values(by='order_id')
submission.to_csv(FEATURES_PATH + 'submission.csv', index=False)