## First Attempt

For this first, basic attempt, we estimate the probability that a user reorders a product from how often that product appears in the user's previous orders, and then randomly sample the predicted basket from those probabilities.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
print('Reading the orders dataset...')
df_orders = pd.read_csv('data/orders.csv')

# The test orders give, for each order to predict, its user_id and order_number.
is_test_order = df_orders['eval_set'] == 'test'
df_test = df_orders.loc[is_test_order, ['order_id', 'user_id', 'order_number']]
user_ids = df_test['user_id'].values
order_ids = df_test['order_id'].values

# Restrict the orders table to users that have a test order.
df_orders = df_orders.loc[df_orders['user_id'].isin(user_ids)]

print('Reading the prior products dataset...')
df_products_prior = pd.read_csv('data/order_products__prior.csv')

# Inner join on order_id: every prior product line gains the columns of its
# order (user_id included); orders without prior product lines drop out.
print('Merging the 2 datasets to get user id for every each order...')
merged = df_orders.merge(df_products_prior, on='order_id')

Reading the orders dataset...
Reading the prior products dataset...
Merging the 2 datasets to get user id for every each order...


In [3]:
print('Computing the number of times each product has been ordered by an user.')
# Count the non-null `reordered` entries of every (user_id, product_id) pair.
per_user_product = merged.groupby(['user_id', 'product_id'])
grouped = per_user_product['reordered'].count()
grouped.head()

Computing the number of times each product has been ordered by an user.


user_id  product_id
3        248           1
         1005          1
         1819          3
         7503          1
         8021          1
Name: reordered, dtype: int64

In [4]:
# Move the (user_id, product_id) index levels back into ordinary columns so
# the counts can be merged with df_test on user_id.
grouped = grouped.reset_index()

In [5]:
# Fixed seed so the random draws below (and therefore the submission) are
# identical on every run. Requires `import numpy as np` in the imports cell.
np.random.seed(343)
print('Merging dataframes...')
# Right join: every row of df_test is kept, repeated once per product found
# for that user in `grouped`.
merged = pd.merge(grouped, df_test, on='user_id', how='right')
print('Computing the probabilities to order per product and per user...')
# `reordered` is the per-(user, product) count computed earlier; it is divided
# by order_number - 1 (assumes order_number is the 1-based rank of the test
# order, so this is the number of orders placed before it -- TODO confirm).
merged['proba_reorder'] = merged['reordered'] / (merged['order_number'] - 1.)
print('Randomly sampling the uniform distribution...')
# One independent uniform draw in [0, 1) per row.
merged['alea'] = np.random.uniform(size=merged.shape[0])
print('Will the product be ordered again or not??? Depends on the proba_reorder and the random number...')
# Vectorized comparison instead of a row-wise apply: same booleans, computed
# in one pass. Rows whose probability is NaN compare as False.
merged['will_be_reordered'] = merged['alea'] < merged['proba_reorder']
merged.head()


Merging dataframes...
Computing the probabilities to order per product and per user...
Randomly sampling the uniform distribution...
Will the product be ordered again or not??? Depends on the proba_reorder and the random number...


Unnamed: 0,user_id,product_id,reordered,order_id,order_number,proba_reorder,alea,will_be_reordered
0,3,248,1,2774568,13,0.083333,0.075056,True
1,3,1005,1,2774568,13,0.083333,0.646019,False
2,3,1819,3,2774568,13,0.25,0.656301,False
3,3,7503,1,2774568,13,0.083333,0.420031,False
4,3,8021,1,2774568,13,0.083333,0.239017,False


In [6]:
# Keep only the rows flagged as reordered by the random draw.
merged = merged.loc[merged['will_be_reordered']]
# Build one space-separated string of product ids per order_id.
products_by_order = merged.groupby('order_id')['product_id']
next_orders = products_by_order.apply(lambda ids: ' '.join(ids.astype(str)))
next_orders.head()

order_id
17                    6291 13107 13535 31964 44056 48896
34     2596 6317 10132 12456 18441 31533 39180 44663 ...
137    2326 5134 8357 9515 25890 26209 41787 43068 44...
182    1244 5479 6972 13629 28800 32537 37642 37687 3...
257    1025 2063 11140 15438 24852 24964 27104 28476 ...
Name: product_id, dtype: object

In [7]:
shape_report = 'next_orders entries : {}, df_test entrie : {}'
print(shape_report.format(next_orders.shape, df_test.shape))
# Orders for which no product was kept are missing from next_orders;
# reindexing on every test order_id brings them back as NaN entries.
print('Sorting index...')
df_test = df_test.sort_values(by='order_id')
next_orders = next_orders.reindex(df_test['order_id'])
print('Filling NA with None')
# Orders without any predicted product get the literal string 'None'.
next_orders = next_orders.fillna(value='None')

next_orders entries : (74553,), df_test entrie : (75000, 3)
Sorting index...
Filling NA with None


In [8]:
# Turn the order_id index back into a column so both fields are written out.
dataframe_a_soumettre = next_orders.reset_index()
# `quoting` expects one of the csv.QUOTE_* constants, not a boolean. The
# previous `quoting=False` only worked because False == 0 == csv.QUOTE_MINIMAL,
# which is already the default, so the argument is dropped; the file written
# is unchanged.
dataframe_a_soumettre.to_csv('soumission.csv',
                             sep=',',
                             index=False,
                             header=['order_id', 'products'])


Kaggle F1 Score on Public LeaderBoard : 0.2382460 