In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the orders table, forcing compact dtypes on the numeric columns.
print('Reading the orders dataset...')
orders_dtypes = dict(
    order_id=np.uint32,
    user_id=np.uint32,
    order_number=np.uint8,
    order_dow=np.uint8,
    order_hour_of_day=np.uint8,
    days_since_prior_order=np.float16,
)
df_orders = pd.read_csv('data/orders.csv', dtype=orders_dtypes)

Reading the orders dataset...


In [3]:
# Load the products of the prior orders with compact dtypes.
print('Reading the prior products dataset...')
# `reordered` uses the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and is missing from NumPy 1.24-1.26, where this cell would raise
# AttributeError. The builtin is what the alias pointed to, so parsing is unchanged.
df_products_prior = pd.read_csv('data/order_products__prior.csv',dtype={'order_id':np.uint32,
                                                                        'product_id':np.uint32,
                                                                        'add_to_cart_order':np.uint8,
                                                                        'reordered':bool})

Reading the prior products dataset...


In [4]:
# Load the products of the train orders with compact dtypes.
print('Reading the train products dataset...')
# `reordered` uses the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and is missing from NumPy 1.24-1.26, where this cell would raise
# AttributeError. The builtin is what the alias pointed to, so parsing is unchanged.
df_products_train = pd.read_csv('data/order_products__train.csv',dtype={'order_id':np.uint32,
                                                                        'product_id':np.uint32,
                                                                        'add_to_cart_order':np.uint8,
                                                                        'reordered':bool})

Reading the train products dataset...


In [9]:
print('Selecting prior orders associated with the users in the train dataframe...')
# NOTE: this cell needs `df_users`, which is built by a cell that appears
# further down in the notebook (the one that creates `df_orders_test`).
is_selected_user = df_orders['user_id'].isin(df_users.index.values)
is_prior_order = df_orders['eval_set'] == 'prior'
# Keep the matching rows; `eval_set` is constant after the filter, so drop it.
df_orders_prior = df_orders.loc[is_selected_user & is_prior_order].drop(columns=['eval_set'])

Selecting prior orders associated with the users in the train dataframe...


In [12]:
# Load the per-product aisle/department category codes with compact dtypes.
prod_cat_dtypes = dict(
    product_id=np.uint32,
    aisle_cat=np.uint8,
    department_cat=np.uint8,
)
df_prod_cat = pd.read_csv('data/products_categorized.csv', dtype=prod_cat_dtypes)


## Prediction of test dataset

In [5]:
# Load the previously saved Keras model from 'model_file.h5'; it is the
# object whose `predict` is called on the feature matrix later on.
from keras.models import load_model
model = load_model('model_file.h5')

Using TensorFlow backend.


In [7]:
# NOTE: the message says "train", but the rows kept are those with eval_set == 'test'.
print('Selecting orders to train on...')
df_orders_test = df_orders.loc[df_orders['eval_set'].eq('test')]
print('Creating a user dedicated dataframe...')
# One row per selected order, indexed by user: the order number (renamed to
# `train_order_number`) and the days elapsed since the user's previous order.
df_users = (
    df_orders_test[['user_id', 'order_number', 'days_since_prior_order']]
    .set_index('user_id')
    .rename(columns={'order_number': 'train_order_number'})
)

Selecting orders to train on...
Creating a user dedicated dataframe...


In [10]:
# Attach the order-level columns to every prior product row, then discard the
# join key and the `reordered` flag.
# NOTE: this rebinds `df_products_prior`, so the cell cannot be re-run without
# reloading the CSV first (the `order_id` column is gone after the first run).
df_products_prior = (
    df_orders_prior
    .merge(df_products_prior, on='order_id')
    .drop(columns=['order_id', 'reordered'])
)

In [13]:
# Add the columns of df_prod_cat to each prior product row, matching on
# product_id (pd.merge defaults to an inner join).
df_products_prior = pd.merge(df_products_prior,df_prod_cat, on='product_id')

In [14]:
# Fold day-of-week and hour-of-day into a single "hour of the week" feature
# (24 * day + hour), then drop the two source columns.
hour_of_week = 24 * df_products_prior['order_dow'] + df_products_prior['order_hour_of_day']
df_products_prior['order_hour_of_week'] = hour_of_week
df_products_prior.drop(columns=['order_dow', 'order_hour_of_day'], inplace=True)

In [15]:
# Aggregate the prior product rows into one feature row per (user, product).
merged = df_products_prior.groupby(['user_id','product_id'])
# Aggregations are given by name ('max', 'mean') rather than as NumPy
# callables: passing np.max / np.mean to groupby.agg is deprecated in recent
# pandas (FutureWarning), and the string names select the same reductions.
df_products_prior3 = merged.agg({'order_number':'max',
                                 'days_since_prior_order':'mean',
                                 'add_to_cart_order':'mean',
                                 'department_cat':'last',
                                 'aisle_cat':'last',
                                 'order_hour_of_week':'mean'})
# NOTE: `days_since_last_order` holds the *mean* of days_since_prior_order
# over the pair's rows, despite its name.
df_products_prior3 = df_products_prior3.rename(columns={'add_to_cart_order':'mean_add_to_cart_order',
                                                        'days_since_prior_order':'days_since_last_order',
                                                        'order_number':'last_order_number'})
# Number of prior rows (non-null order_number) for each (user, product) pair.
df_products_prior3['times_ordered']=merged['order_number'].count()

In [16]:
# Bring the per-user columns of df_users (train_order_number,
# days_since_prior_order) onto every (user, product) feature row; columns
# whose names collide would get the '_test' suffix on the df_users side.
df_final = df_products_prior3.join(df_users,rsuffix='_test')

In [17]:
# proba = (times_ordered - 1) / (train_order_number - 2)
numerator = df_final['times_ordered'] - 1.
denominator = df_final['train_order_number'] - 2.
df_final['proba'] = numerator / denominator
# Gap, in order numbers, between the selected order and the last prior order
# that contained the product.
df_final['orders_since_last_order'] = df_final['train_order_number'] - df_final['last_order_number']
# Columns that only served to derive the two features above.
intermediate_columns = [
    'days_since_last_order',
    'times_ordered',
    'train_order_number',
    'last_order_number',
    'days_since_prior_order',
]
df_final2 = df_final.drop(columns=intermediate_columns)
#df_final2 = pd.get_dummies(df_final2)

In [18]:
#X = df_final2[['mean_add_to_cart_order','proba','orders_since_last_order',
#              'aisle_cat_very_low','aisle_cat_low','aisle_cat_middle','aisle_cat_high','aisle_cat_very_high',
#              'department_cat_very_low','department_cat_low','department_cat_middle','department_cat_high','department_cat_very_high']].values
# Model inputs, in the column order used to build the matrix.
feature_columns = [
    'mean_add_to_cart_order',
    'proba',
    'orders_since_last_order',
    'aisle_cat',
    'department_cat',
    'order_hour_of_week',
]
X = df_final2[feature_columns].values

print(X.shape)

(4833292, 6)


In [229]:
# Run the loaded model on every row of the feature matrix X.
resultat = model.predict(X)

In [245]:
# Wrap the raw predictions in a DataFrame that reuses df_final2's index, so
# each prediction row keeps the index labels of its feature row. Columns get
# default integer labels; column 1 is the score read by the functions below.
df_resultats = pd.DataFrame(resultat,index=df_final2.index)
#df_resultats.head(200)

In [246]:
def tirage_alea(row):
    """Return how many products to predict for one row of per-user statistics.

    ``row`` must provide ``'std'`` and ``'mean'`` entries. The result is 0
    when ``row['std']`` equals 0; otherwise it is ``row['mean']`` truncated
    by ``int`` and floored at 0.
    """
    # A random draw with np.random.normal(mean, std) was tried here before and
    # is disabled, so despite the function's name the result is deterministic.
    if row['std'] != 0:
        return np.maximum(0, int(row['mean']))
    return 0
                            
# Per-user statistics; `tirage_alea` turns each row into the number of
# products to predict, and the frame is then indexed by user_id for lookups.
df_number_of_product = pd.read_csv('data/number_of_products_per_user.csv')
df_number_of_product = df_number_of_product.assign(
    nombre_produits_alea=df_number_of_product.apply(tirage_alea, axis=1)
).set_index('user_id')
#df_number_of_product.head()

In [247]:
def choix_produits(group):
    """Pick the highest-scored products of one user as a space-separated string.

    ``group`` is one user's slice of the predictions: column ``1`` holds the
    score, ``'product_id'`` the product, and ``group.name`` is the user id
    used to look up ``nombre_produits_alea`` in the module-level
    ``df_number_of_product``. At most that many products are kept (capped at
    the group size), best score first. Returns the string ``'None'`` when
    nothing is kept.
    """
    ranking = np.argsort(group[1].values)  # ascending by score
    quota = df_number_of_product.loc[group.name, 'nombre_produits_alea']
    keep = int(np.minimum(len(ranking), quota))
    # Reverse to get best-first, then keep the leading `keep` positions.
    best_first = ranking[::-1][:keep]

    if len(best_first) > 0:
        chosen = group['product_id'].values[best_first]
        return ' '.join(chosen.astype(str))
    return 'None'

In [248]:
def output_pred2(df_resultats,decision_threshold,output_path='soumission.csv'):
    """Build the submission by keeping a per-user number of top-scored products.

    Parameters
    ----------
    df_resultats : DataFrame
        Predictions indexed by (user_id, product_id); the per-user selection
        is delegated to ``choix_produits``.
    decision_threshold : float
        Unused here; kept so the signature mirrors ``output_pred``.
    output_path : str or None, default 'soumission.csv'
        Where the CSV is written. Pass ``None`` to skip writing and only get
        the frame back.

    Returns
    -------
    DataFrame
        Columns ``order_id`` and ``product_id`` (space-separated product ids,
        or ``'None'``), sorted by ``order_id``, one row per row of the
        module-level ``df_orders_test``.
    """
    per_user = df_resultats.reset_index(level='product_id').groupby(level='user_id')
    chosen = pd.DataFrame({'product_id': per_user.apply(choix_produits)})

    df_a_soumettre = df_orders_test.join(chosen,on='user_id')[['order_id','product_id']].sort_values('order_id')
    # Orders whose user has no prediction come out of the join as NaN. Use the
    # same 'None' marker as output_pred / choix_produits so downstream string
    # handling (f1_score_perso) and the CSV stay consistent.
    df_a_soumettre = df_a_soumettre.fillna('None')

    if output_path is not None:
        # quoting=False equals csv.QUOTE_MINIMAL (both are 0).
        df_a_soumettre.to_csv(output_path,sep=',',
                              index=False,
                              header=['order_id','products'],
                              quoting=False)
    return df_a_soumettre

In [249]:
def output_pred(df_resultats,decision_threshold,output_path='soumission.csv'):
    """Build the submission by keeping products scored above a threshold.

    Parameters
    ----------
    df_resultats : DataFrame
        Predictions indexed by (user_id, product_id); column ``1`` is the
        score compared with the threshold.
    decision_threshold : float
        A product is kept when its score is strictly greater than this value.
    output_path : str or None, default 'soumission.csv'
        Where the CSV is written. Pass ``None`` to skip writing, e.g. when
        sweeping thresholds, so the submission file is not overwritten.

    Returns
    -------
    DataFrame
        Columns ``order_id`` and ``product_id`` (space-separated product ids,
        or ``'None'`` when the user has no product above the threshold),
        sorted by ``order_id``, one row per row of the module-level
        ``df_orders_test``.
    """
    above = df_resultats.loc[df_resultats[1]>decision_threshold].reset_index()
    per_user = above.groupby(['user_id'])['product_id'].apply(lambda x : ' '.join(x.astype(str)))

    df_a_soumettre = df_orders_test.join(per_user,on='user_id')[['order_id','product_id']].sort_values('order_id')
    # Users without any product above the threshold are NaN after the join.
    df_a_soumettre = df_a_soumettre.fillna('None')

    if output_path is not None:
        # quoting=False equals csv.QUOTE_MINIMAL (both are 0).
        df_a_soumettre.to_csv(output_path,sep=',',
                              index=False,
                              header=['order_id','products'],
                              quoting=False)
    return df_a_soumettre

In [250]:
def test_pred(df_resultats, decision_threshold, sample_size=100):
    """Score the thresholded predictions with a mean per-order F1.

    Parameters
    ----------
    df_resultats : DataFrame
        Predictions, passed through to ``output_pred``.
    decision_threshold : float
        Threshold passed through to ``output_pred``.
    sample_size : int or None, default 100
        Number of leading orders (after sorting by ``order_id``) that are
        scored. The default keeps the previous behaviour of scoring only the
        first 100 orders; pass ``None`` to score every order.

    Returns
    -------
    tuple
        ``(decision_threshold, mean_f1)``; ``mean_f1`` is NaN when there is
        nothing to score.

    Notes
    -----
    ``output_pred`` is called with its defaults, so the submission CSV is
    written on every call.
    NOTE(review): the ground truth is read from ``df_products_train``
    restricted to the order ids of ``df_orders_test``; this only finds rows
    if those orders are present in the train products file. Confirm which
    eval_set ``df_orders_test`` held when the recorded scores were produced.
    """
    df_a_soumettre = output_pred(df_resultats,decision_threshold)

    # Ground truth: reordered products of the evaluated orders, one
    # space-separated string per order.
    is_eval_order = df_products_train['order_id'].isin(df_orders_test['order_id'])
    is_reordered = df_products_train['reordered']==1
    truth = df_products_train.loc[is_eval_order & is_reordered, ['order_id','product_id']]
    truth = truth.groupby('order_id')['product_id'].apply(lambda x : ' '.join(x.astype(str)))

    # The truth column collides with the prediction column and is suffixed.
    df_true = df_a_soumettre.join(truth,on='order_id',rsuffix='_true')
    df_true['product_id_true'] = df_true['product_id_true'].fillna('None')

    scored = df_true if sample_size is None else df_true.iloc[:sample_size]
    if scored.empty:
        return decision_threshold, float('nan')
    # Score only the selected rows instead of storing a mostly-NaN column.
    f1_scores = scored.apply(f1_score_perso,axis=1)
    return decision_threshold,np.mean(f1_scores)

In [237]:
# Build (and write) the submission with the per-user top-N selection; the
# threshold argument is not used by output_pred2.
df_a_soumettre = output_pred2(df_resultats,1.)
# Rows of the train products table that belong to the evaluated orders.
df_true = df_products_train[df_products_train['order_id'].isin(df_orders_test['order_id'])]
# The remaining ground-truth steps are disabled here; test_pred performs them.
#df_true = df_true[df_true['reordered']==1][['order_id','product_id']]
#df_true = df_true.groupby('order_id')['product_id'].apply(lambda x : ' '.join(x.astype(str)))
#df_true = df_a_soumettre.join(df_true,on='order_id',rsuffix='_true')
#df_true['product_id_true'] = df_true['product_id_true'].fillna('None')
df_true.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
92,170,18394,1,True
93,170,37766,2,True
94,170,13176,3,True
95,170,6236,4,True
96,170,5077,5,True


In [238]:
# From Wikipedia: https://en.wikipedia.org/wiki/F1_score
# f1 = 2 * precision * recall / (precision + recall)
# precision = number of true positives / number of predicted positives
# recall    = number of true positives / number of actual positives

def f1_score_perso(row):
    """Compute the F1 score between predicted and true products of one order.

    ``row['product_id']`` (prediction) and ``row['product_id_true']`` (truth)
    are space-separated product ids, or the string ``'None'`` for an empty
    basket. An empty basket is scored as the single pseudo-product ``0``, so
    predicting ``'None'`` for a truly empty order yields 1.0.

    Missing values (NaN/None left by a join), empty strings and repeated
    whitespace are treated as an empty basket instead of raising, and
    duplicate ids are counted once.

    Returns
    -------
    float
        F1 in [0, 1]; 0.0 when prediction and truth share no product.
    """
    def _parse(cell):
        # Anything that is not a usable id string counts as the empty basket.
        if isinstance(cell, str) and cell != 'None':
            ids = {int(token) for token in cell.split()}
            if ids:
                return ids
        return {0}

    y_pred = _parse(row['product_id'])
    y_true = _parse(row['product_id_true'])

    n_true_pos = len(y_pred & y_true)
    if n_true_pos == 0:
        # precision + recall would be 0; define F1 as 0 in that case.
        return 0.
    precision = n_true_pos / float(len(y_pred))
    recall = n_true_pos / float(len(y_true))
    return 2 * precision * recall / (precision + recall)

In [239]:
#test_pred(df_resultats, 1.)

In [252]:
#decision_thresholds = np.arange(0.1,0.95,0.1)
# Sweep 0.55 .. 0.75 in steps of 0.01. np.linspace with an explicit count is
# used because the length of np.arange with a float step depends on rounding,
# so the end point may or may not be included.
decision_thresholds = np.linspace(0.55, 0.75, num=21)
#decision_thresholds = np.array([0.73])
#decision_thresholds = np.arange(0.05,0.26,0.01)
#decision_thresholds = []
# (threshold, mean F1) pairs, one per threshold tried.
f1_sco = []
for decision_threshold in decision_thresholds:
    var_tmp = test_pred(df_resultats, decision_threshold)
    f1_sco.append(var_tmp)
    print (var_tmp)

(0.55000000000000004, 0.33404581105701397)
(0.56000000000000005, 0.33912644595165131)
(0.57000000000000006, 0.34181339128458688)
(0.58000000000000007, 0.3471330839784656)
(0.59000000000000008, 0.34896709080881644)
(0.60000000000000009, 0.34967902234963932)
(0.6100000000000001, 0.34763971562149243)
(0.62000000000000011, 0.35467422405636179)
(0.63000000000000012, 0.35660348698161926)
(0.64000000000000012, 0.35995923816366926)
(0.65000000000000013, 0.36419954399652077)
(0.66000000000000014, 0.36841483111575157)
(0.67000000000000015, 0.36407769944517038)
(0.68000000000000016, 0.36952428232044471)
(0.69000000000000017, 0.36935495523228395)
(0.70000000000000018, 0.36785692952147181)
(0.71000000000000019, 0.3685372000071564)
(0.7200000000000002, 0.36844049277984908)
(0.7300000000000002, 0.37430126061541374)
(0.74000000000000021, 0.36914326587205926)
(0.75000000000000022, 0.37305693200226914)


In [253]:
#decision_thresholds = np.arange(0.1,0.95,0.1)
# Sweep 0.76 .. 0.80 in steps of 0.01 (five points). The previous
# np.arange(0.76, 0.80, 0.01) returned five values including 0.80, as the
# recorded output shows, only because of floating-point rounding in its
# length computation. np.linspace makes the five points explicit.
decision_thresholds = np.linspace(0.76, 0.80, num=5)
#decision_thresholds = np.array([0.73])
#decision_thresholds = np.arange(0.05,0.26,0.01)
#decision_thresholds = []
# (threshold, mean F1) pairs, one per threshold tried; this replaces the list
# built by the previous sweep.
f1_sco = []
for decision_threshold in decision_thresholds:
    var_tmp = test_pred(df_resultats, decision_threshold)
    f1_sco.append(var_tmp)
    print (var_tmp)

(0.76000000000000001, 0.36942014561847963)
(0.77000000000000002, 0.36263139197050942)
(0.78000000000000003, 0.36100079861166301)
(0.79000000000000004, 0.36222383927546842)
(0.80000000000000004, 0.36815403069293451)


In [241]:
#df = output_pred2(df_resultats,0.73)
#df.shape