In [1]:
import pandas as pd
import matplotlib.pyplot as plt
#from IPython.display import Audio
#sound_file = '/home/thomas/Musique/Modem-KP-551027942.wav'

In [2]:
# Load the three raw CSV extracts used by the rest of the notebook. A message
# is printed before each read so progress is visible while the files parse.
print ('Reading the orders dataset...')
df_orders = pd.read_csv(filepath_or_buffer='data/orders.csv')

print('Reading the prior products dataset...')
df_products_prior = pd.read_csv(filepath_or_buffer='data/order_products__prior.csv')

print('Reading the train products dataset...')
df_products_train = pd.read_csv(filepath_or_buffer='data/order_products__train.csv')

Reading the orders dataset...
Reading the prior products dataset...
Reading the train products dataset...


In [3]:
# Preview the first rows of the raw orders table.
df_orders.head(12)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


## Selection of orders

In [62]:
# Keep only the orders flagged as 'train'.
print('Selecting orders to train on...')
df_orders_train = df_orders[df_orders['eval_set'].eq('train')]
print(df_orders_train.shape)

# Hold out a reproducible 30% of those orders (seed 43) for later scoring and
# remove them from the training frame.
print('Keeping some data for calculating the f1 score' )
df_orders_test = (df_orders_train
                  .sample(frac=0.3, replace=False, random_state=43)
                  .sort_index())
inds_test = df_orders_test.index
df_orders_train = df_orders_train.drop(inds_test)

# One row per remaining training order, indexed by user: the order number of
# the train order and the days elapsed since the user's previous order.
print('Creating a user dedicated dataframe...')
df_users = (df_orders_train[['user_id','order_number','days_since_prior_order']]
            .set_index('user_id')
            .rename(columns={'order_number':'train_order_number'}))

Selecting orders to train on...
(131209, 7)
Keeping some data for calculating the f1 score
Creating a user dedicated dataframe...


In [63]:
# Sanity check: the training and hold-out row counts must add up to the
# number of 'train' orders printed before the split.
print(df_orders_train.shape)
print(df_orders_test.shape)
print(len(df_orders_train) + len(df_orders_test))

(91846, 7)
(39363, 7)
131209


In [66]:
# Preview the training orders that remain after the hold-out rows were removed.
df_orders_train.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,11,4,8,14.0
49,2196797,5,train,5,0,11,6.0
74,525192,7,train,21,2,11,6.0
78,880375,8,train,4,1,14,10.0
82,1094988,9,train,4,6,10,30.0


In [6]:
# Order history ('prior' rows) restricted to the users present in df_users;
# the columns that are not used as features are dropped right away.
print('Selecting prior orders associated with the users in the train dataframe...')
df_orders_prior = (
    df_orders[(df_orders['eval_set']=='prior')
              & df_orders['user_id'].isin(df_users.index.values)]
    .drop(['eval_set','order_dow','order_hour_of_day'], axis=1)
)

Selecting prior orders associated with the users in the train dataframe...


In [7]:
# Preview the prior orders kept for the selected users.
df_orders_prior.head(20)

Unnamed: 0,order_id,user_id,order_number,days_since_prior_order
0,2539329,1,1,
1,2398795,1,2,15.0
2,473747,1,3,21.0
3,2254736,1,4,29.0
4,431534,1,5,28.0
5,3367565,1,6,19.0
6,550135,1,7,20.0
7,3108588,1,8,14.0
8,2295261,1,9,0.0
9,2550362,1,10,30.0


## Selection of products

In [8]:
# Only rows with reordered == 1 are kept from the train products table.
print('Products in the training set that have not been reordered are removed from the analysis: they cannot be predicted and they should not...')
df_products_train = df_products_train[df_products_train['reordered'].eq(1)]

Products in the training set that have not been reordered are removed from the analysis: they cannot be predicted and they should not...


In [9]:
# Attach the products of every prior order to its order-level columns, then
# discard the join key and the 'reordered' flag of the prior rows.
df_products_prior2 = (df_orders_prior
                      .merge(df_products_prior, on='order_id')
                      .drop(['order_id','reordered'], axis=1))

In [10]:
# Show 20 rows starting at position 50 of the merged prior-products table.
df_products_prior2[50:].head(20)

Unnamed: 0,user_id,order_number,days_since_prior_order,product_id,add_to_cart_order
50,1,10,30.0,196,1
51,1,10,30.0,46149,2
52,1,10,30.0,39657,3
53,1,10,30.0,38928,4
54,1,10,30.0,25133,5
55,1,10,30.0,10258,6
56,1,10,30.0,35951,7
57,1,10,30.0,13032,8
58,1,10,30.0,12427,9
59,2,1,,32792,1


In [11]:
# Aggregate the order history to one row per (user_id, product_id).
#
# The aggregations are given as string names ('max', 'mean') rather than
# np.max / np.mean: numpy is never imported in this notebook, so the np.*
# form raises NameError on a fresh kernel. The string names select the same
# pandas group-by reductions.
merged = df_products_prior2.groupby(['user_id','product_id'])
df_products_prior3 = merged.agg({'order_number':'max',
                                 'days_since_prior_order':'mean',
                                 'add_to_cart_order':'mean'})

# Rename to feature names. Note that 'days_since_last_order' is the MEAN of
# days_since_prior_order over the orders containing the product, not the
# delay since the most recent purchase; the name is kept because later cells
# drop the column by this name.
df_products_prior3 = df_products_prior3.rename(columns={'add_to_cart_order':'mean_add_to_cart_order',
                                                        'days_since_prior_order':'days_since_last_order',
                                                        'order_number':'last_order_number'})

# Number of prior order lines in which the user bought the product.
df_products_prior3['times_ordered']=merged['order_number'].count()

In [12]:
# Preview the per (user_id, product_id) aggregated features.
df_products_prior3.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_add_to_cart_order,days_since_last_order,last_order_number,times_ordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,196,1.4,19.555556,10,10
1,10258,3.333333,19.555556,10,9
1,10326,5.0,28.0,5,1
1,12427,3.3,19.555556,10,10
1,13032,6.333333,21.666667,10,3
1,13176,6.0,21.5,5,2
1,14084,2.0,,1,1
1,17122,6.0,28.0,5,1
1,25133,4.0,20.125,10,8
1,26088,4.5,15.0,2,2


In [13]:
# Products of the training orders, keyed by (user_id, product_id) so they can
# be joined onto the per-user/product feature table. Columns that are not
# needed for the label are dropped before indexing.
df_products_train2 = (
    df_orders_train
    .merge(df_products_train, on='order_id')
    .drop(['order_id','eval_set',
           'order_dow','order_hour_of_day',
           'add_to_cart_order'], axis=1)
    .set_index(['user_id','product_id'])
)
df_products_train2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,order_number,days_since_prior_order,reordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,196,11,14.0,1
1,25133,11,14.0,1
1,38928,11,14.0,1
1,26405,11,14.0,1
1,39657,11,14.0,1


In [14]:
# Add the user-level columns (train_order_number, days_since_prior_order) to
# every (user_id, product_id) feature row.
df_final = df_products_prior3.join(df_users,rsuffix='_train')

# Left-join the label: pairs present in the training order get reordered == 1,
# every other pair gets NaN.
df_final = df_final.join(df_products_train2['reordered'])

# Pairs absent from the training order were not reordered -> label 0.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: in-place mutation through a column selection is chained
# assignment and is not guaranteed to update df_final.
df_final['reordered'] = df_final['reordered'].fillna(0)

In [15]:
# Share of the user's previous orders that contained the product:
# times_ordered divided by the number of orders before the train order.
df_final['proba'] = df_final['times_ordered'].div(df_final['train_order_number'].sub(1.))

# Keep only the columns used for modelling (plus the label).
df_final2 = df_final.drop(
    ['times_ordered',
     'last_order_number',
     'train_order_number',
     'days_since_last_order',
     'days_since_prior_order'],
    axis=1,
)

In [16]:
# Preview the modelling table: two features and the 0/1 'reordered' label.
df_final2.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_add_to_cart_order,reordered,proba
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,196,1.4,1.0,1.0
1,10258,3.333333,1.0,0.9
1,10326,5.0,0.0,0.1
1,12427,3.3,0.0,1.0
1,13032,6.333333,1.0,0.3
1,13176,6.0,0.0,0.2
1,14084,2.0,0.0,0.1
1,17122,6.0,0.0,0.1
1,25133,4.0,1.0,0.8
1,26088,4.5,1.0,0.2


In [22]:
# Build a roughly balanced training table: keep 10% of the negative rows
# (reordered == 0) together with every positive row (reordered == 1).
# Both sampling steps are seeded so that Restart & Run All reproduces the
# same rows in the same order; 43 is the seed already used for the hold-out
# split of the training orders.
df_final3 = pd.concat([
    df_final2.loc[df_final2['reordered']==0].sample(frac=0.1, random_state=43),
    df_final2.loc[df_final2['reordered']==1],
])

# Shuffle so positives and negatives are interleaved.
df_final3 = df_final3.sample(frac=1., random_state=43)

In [23]:
# Count positive / negative rows of the down-sampled table and print both
# counts followed by the positive fraction.
pos = float((df_final3['reordered']==1).sum())
neg = float((df_final3['reordered']==0).sum())
print(pos,neg,pos/(pos+neg))

(828824.0, 764584.0, 0.5201580511708238)


In [24]:
# Feature matrix (two columns) and label vector as plain numpy arrays.
# NOTE(review): these are taken from df_final2 (the full table), not from the
# down-sampled and shuffled df_final3 built above -- confirm this is intended.
X = df_final2.loc[:, ['mean_add_to_cart_order','proba']].values
y = df_final2.loc[:, 'reordered'].values
print(X.shape)
print(y.shape)

(8474661, 2)
(8474661,)


In [25]:
# Display the label vector (truncated repr) as a quick sanity check.
y
# Disabled: audio notification at the end of the cell; it relies on the Audio
# import and sound_file definition that are commented out in the first cell.
#Audio(url=sound_file, autoplay=True)

array([ 1.,  1.,  0., ...,  0.,  0.,  0.])

In [34]:
# cPickle only exists on Python 2; fall back to the standard pickle module so
# the cell also runs on Python 3 (where the C implementation is used
# automatically).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Cache the feature matrix and the labels on disk so a later session can
# reload them without rebuilding the whole feature table. The highest
# protocol is a compact binary format; Python 2's default (protocol 0) is
# text-based and very slow and large for arrays of this size. pickle.load
# detects the protocol automatically, so readers need no change.
with open('data_X.pkl','wb') as fout:
    pickle.dump(X,fout,pickle.HIGHEST_PROTOCOL)

with open('data_y.pkl','wb') as fout:
    pickle.dump(y,fout,pickle.HIGHEST_PROTOCOL)

In [1]:
# cPickle only exists on Python 2; fall back to the standard pickle module so
# the cell also runs on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Reload the cached feature matrix and labels written by the dump cell.
# SECURITY: unpickling executes arbitrary code embedded in the file -- only
# load data_X.pkl / data_y.pkl if they were produced by this notebook.
with open('data_X.pkl','rb') as fin:
    X=pickle.load(fin)
with open('data_y.pkl','rb') as fin:
    y=pickle.load(fin)

In [None]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the SVC is only instantiated here; nothing in this cell fits it.
clf = svm.SVC(verbose=True)

# 6-nearest-neighbours classifier trained on the first 10000 rows only.
# fit() returns the estimator itself, so it can be chained; the bare name on
# the last line displays the fitted estimator as the cell output.
knn = KNeighborsClassifier(n_neighbors=6).fit(X[:10000], y[:10000])
knn

In [None]:
# cPickle only exists on Python 2; fall back to the standard pickle module so
# the cell also runs on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Write the classifier to disk. pickle.dump(obj, file) is the file-writing
# call; pickle.dumps(obj, protocol) returns a byte string, so passing the
# file handle as its second argument raised TypeError and wrote nothing.
# NOTE(review): no visible cell fits `clf` before it is saved -- confirm that
# an unfitted SVC is really what should be persisted.
with open('svc.pkl', 'wb') as fid:
    pickle.dump(clf,fid)

# Read it back to verify the round trip.
# SECURITY: only unpickle files produced by this notebook.
with open('svc.pkl', 'rb') as fid:
    clf = pickle.load(fid)
#clf.predict(X[0:1])

## Prediction of test dataset

In [54]:
# NOTE(review): despite the variable name and the printed message, this cell
# selects the eval_set == 'test' orders and overwrites the training frame
# defined earlier in the notebook.
print('Selecting orders to train on...')
df_orders_train = df_orders[df_orders['eval_set'].eq('test')]

# One row per test order, indexed by user: order number of the order to
# predict and days elapsed since the user's previous order.
print('Creating a user dedicated dataframe...')
df_users = (df_orders_train[['user_id','order_number','days_since_prior_order']]
            .set_index('user_id')
            .rename(columns={'order_number':'train_order_number'}))

Selecting orders to train on...
Creating a user dedicated dataframe...


In [55]:
# Preview the user-level table built from the 'test' orders.
df_users.head()

Unnamed: 0_level_0,train_order_number,days_since_prior_order
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,13,11.0
4,6,30.0
6,4,22.0
11,8,8.0
12,6,30.0


In [56]:
# Order history ('prior' rows) restricted to the users present in df_users
# (at this point: the users of the 'test' orders); unused columns are dropped.
print('Selecting prior orders associated with the users in the train dataframe...')
df_orders_prior = (
    df_orders[(df_orders['eval_set']=='prior')
              & df_orders['user_id'].isin(df_users.index.values)]
    .drop(['eval_set','order_dow','order_hour_of_day'], axis=1)
)

Selecting prior orders associated with the users in the train dataframe...


In [57]:
# Attach the products of every prior order to its order-level columns, then
# discard the join key and the 'reordered' flag of the prior rows.
df_products_prior2 = (df_orders_prior
                      .merge(df_products_prior, on='order_id')
                      .drop(['order_id','reordered'], axis=1))

In [58]:
# Aggregate the order history to one row per (user_id, product_id).
#
# The aggregations are given as string names ('max', 'mean') rather than
# np.max / np.mean: numpy is never imported in this notebook, so the np.*
# form raises NameError on a fresh kernel. The string names select the same
# pandas group-by reductions.
merged = df_products_prior2.groupby(['user_id','product_id'])
df_products_prior3 = merged.agg({'order_number':'max',
                                 'days_since_prior_order':'mean',
                                 'add_to_cart_order':'mean'})

# Rename to feature names. Note that 'days_since_last_order' is the MEAN of
# days_since_prior_order over the orders containing the product, not the
# delay since the most recent purchase; the name is kept because later cells
# drop the column by this name.
df_products_prior3 = df_products_prior3.rename(columns={'add_to_cart_order':'mean_add_to_cart_order',
                                                        'days_since_prior_order':'days_since_last_order',
                                                        'order_number':'last_order_number'})

# Number of prior order lines in which the user bought the product.
df_products_prior3['times_ordered']=merged['order_number'].count()

In [59]:
# Preview the per (user_id, product_id) aggregated features.
df_products_prior3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_add_to_cart_order,days_since_last_order,last_order_number,times_ordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,248,3.0,9.0,2,1
3,1005,5.0,17.0,10,1
3,1819,2.666667,11.333333,7,3
3,7503,6.0,21.0,3,1
3,8021,5.0,9.0,2,1


In [60]:
# Left-join (the default, spelled out here) the user-level columns onto every
# (user_id, product_id) feature row.
df_final = df_products_prior3.join(df_users, how='left', rsuffix='_test')

In [61]:
# Preview the joined feature table before the 'proba' feature is derived.
df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_add_to_cart_order,days_since_last_order,last_order_number,times_ordered,train_order_number,days_since_prior_order
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,248,3.0,9.0,2,1,13,11.0
3,1005,5.0,17.0,10,1,13,11.0
3,1819,2.666667,11.333333,7,3,13,11.0
3,7503,6.0,21.0,3,1,13,11.0
3,8021,5.0,9.0,2,1,13,11.0


In [62]:
# Share of the user's previous orders that contained the product:
# times_ordered divided by the number of orders before the one to predict.
df_final['proba'] = df_final['times_ordered'].div(df_final['train_order_number'].sub(1.))

# Keep only the columns used for modelling.
df_final2 = df_final.drop(
    ['times_ordered',
     'last_order_number',
     'train_order_number',
     'days_since_last_order',
     'days_since_prior_order'],
    axis=1,
)

In [None]:
# Feature matrix for prediction, with the same two columns in the same order
# as the matrix used to fit the classifier.
X = df_final2.loc[:, ['mean_add_to_cart_order','proba']].values
print(X.shape)

(4833292, 3)
(4833292,)


In [None]:
# Predict the reorder label for every (user_id, product_id) row.
# The original line called `model.predict`, but no `model` is defined anywhere
# in the notebook (NameError). `knn` is the only estimator that is fitted, so
# it is used here.
resultat = knn.predict(X)

In [None]:
# Re-attach the (user_id, product_id) index to the raw predictions and show
# the first 200 rows.
df_resultats = pd.DataFrame(data=resultat, index=df_final2.index)
df_resultats.head(n=200)