In [1]:
import pandas as pd
import operator

# reading data
order_products_prior = pd.read_csv('../input/order_products__prior.csv')
order_products_train = pd.read_csv('../input/order_products__train.csv')
orders = pd.read_csv('../input/orders.csv')

## Split out train dataset

In [2]:
train_users = orders[orders['eval_set'] == 'train' ].user_id.unique()
test_users = orders[orders['eval_set'] == 'test' ].user_id.unique()
print("train users ", len(train_users))
print("test users ", len(test_users))

train users  131209
test users  75000


In [3]:
orders_prior = orders[orders.eval_set == "prior"]
orders_train = orders[orders.eval_set == "train"]

- 131209 users in train dataset
- orders_train contains last order of those users
- orders_priors contains previous order of those users

## Count number of products in order

In [4]:
orders_prior_count = order_products_prior.groupby('order_id')['product_id'].count()
orders_prior_count.head()

order_id
2     9
3     8
4    13
5    26
6     3
Name: product_id, dtype: int64

In [5]:
orders_prior['item_count'] = orders_prior['order_id'].map(orders_prior_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
# SAVE TO FILE
#
#orders_prior.to_csv("./save/orders_prior.csv")
#orders_prior[['order_id', 'item_count']].to_csv("./orders_prior_itemcount.csv")


## Train user basket average

In [7]:
user_basket_avg = orders_prior.groupby('user_id')['item_count'].mean()
user_basket_avg.head()

user_id
1     5.900000
2    13.928571
3     7.333333
4     3.600000
5     9.250000
Name: item_count, dtype: float64

In [8]:
# SAVE TO FILE
#train_user_basket_avg.to_csv("./save/train_user_basket_avg.csv")

## User product count

In [9]:
order_products_prior = pd.merge(order_products_prior, orders_prior[['order_id', 'user_id', 'order_number']], on='order_id', how='left')

In [10]:
order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number
0,2,33120,1,1,202279,3
1,2,28985,2,1,202279,3
2,2,9327,3,0,202279,3
3,2,45918,4,1,202279,3
4,2,30035,5,0,202279,3


In [11]:
user_products_prior_count = order_products_prior[['user_id', 'product_id']].groupby('user_id')['product_id'].value_counts()
user_products_prior_count.head()

user_id  product_id
1        196           10
         12427         10
         10258          9
         25133          8
         13032          3
Name: product_id, dtype: int64

In [12]:
#user_products_prior_count.to_csv("./save/user_products_prior_count.csv")

In [13]:
user_orders_prior_count = orders_prior.groupby('user_id')['order_id'].count()
user_orders_prior_count.head()

user_id
1    10
2    14
3    12
4     5
5     4
Name: order_id, dtype: int64

In [None]:
features = order_products_prior.groupby(['user_id', 'product_id'])[['order_id']].count()
features.columns = ['product_count']
features = features.reset_index()

In [13]:
# percentage of time an item is bought by the user
features['product_part'] = features['product_count']/features['user_id'].map(user_orders_prior_count)

In [16]:
def save_features(path = './save/features.csv'):
    features.to_csv(path)

In [24]:
#save_features()

In [14]:
features.head()

Unnamed: 0,user_id,product_id,product_count,product_part
0,1,196,10,1.0
1,1,10258,9,0.9
2,1,10326,1,0.1
3,1,12427,10,1.0
4,1,13032,3,0.3


In [30]:
features_train = features[features.user_id.isin(train_users)]

In [56]:
features_train.head(100)

Unnamed: 0,user_id,product_id,product_count,product_part,add_to_cart_order,reorder
0,1,196,10,1.000000,1.400000,1.0
1,1,10258,9,0.900000,3.333333,1.0
2,1,10326,1,0.100000,5.000000,0.0
3,1,12427,10,1.000000,3.300000,0.0
4,1,13032,3,0.300000,6.333333,1.0
5,1,13176,2,0.200000,6.000000,0.0
6,1,14084,1,0.100000,2.000000,0.0
7,1,17122,1,0.100000,6.000000,0.0
8,1,25133,8,0.800000,4.000000,1.0
9,1,26088,2,0.200000,4.500000,1.0


In [32]:
order_products_train = pd.merge(order_products_train, orders_train[['order_id', 'user_id']], on='order_id', how='left')
order_products_train['reorder'] = 1

In [33]:
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,reorder
0,1,49302,1,1,112108,1
1,1,11109,2,1,112108,1
2,1,10246,3,0,112108,1
3,1,49683,4,0,112108,1
4,1,43633,5,1,112108,1


In [34]:
features_train = pd.merge(features_train, order_products_train[['user_id', 'product_id', 'reorder']], 
                    on=['user_id', 'product_id'], how='left')
features_train['reorder'] = features_train['reorder'].fillna(0)

In [35]:
features_train.head()

Unnamed: 0,user_id,product_id,product_count,product_part,add_to_cart_order,reorder
0,1,196,10,1.0,1.4,1.0
1,1,10258,9,0.9,3.333333,1.0
2,1,10326,1,0.1,5.0,0.0
3,1,12427,10,1.0,3.3,0.0
4,1,13032,3,0.3,6.333333,1.0


In [37]:
features_test = features[features.user_id.isin(test_users)]
features_test.head()

Unnamed: 0,user_id,product_id,product_count,product_part,add_to_cart_order
120,3,248,1,0.083333,3.0
121,3,1005,1,0.083333,5.0
122,3,1819,3,0.25,2.666667
123,3,7503,1,0.083333,6.0
124,3,8021,1,0.083333,5.0


## model

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
clf = LogisticRegression()

In [40]:
clf.fit(features_train[['product_part', 'add_to_cart_order']], features_train['reorder'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
clf.predict_proba([[1.0,1.0]])[:,1]

array([ 0.83486949])

In [63]:
features_test['proba'] = clf.predict_proba(features_test[['product_part']])[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [64]:
features_test.head()

Unnamed: 0,user_id,product_id,product_count,product_part,proba
120,3,248,1,0.083333,0.0552
121,3,1005,1,0.083333,0.0552
122,3,1819,3,0.25,0.116276
123,3,7503,1,0.083333,0.0552
124,3,8021,1,0.083333,0.0552


In [73]:
features_test = features_test.sort_values(by=['user_id', 'proba'], ascending = [True,False])

In [74]:
features_test.head(20)

Unnamed: 0,user_id,product_id,product_count,product_part,proba
143,3,39190,10,0.833333,0.692799
150,3,47766,9,0.75,0.600446
135,3,21903,8,0.666667,0.500352
125,3,9387,5,0.416667,0.228581
131,3,17668,5,0.416667,0.228581
133,3,18599,4,0.333333,0.164893
148,3,43961,4,0.333333,0.164893
122,3,1819,3,0.25,0.116276
129,3,16797,3,0.25,0.116276
136,3,22035,3,0.25,0.116276


In [77]:
list(features_test[features_test.user_id == 3].product_id[:5])

[39190, 47766, 21903, 9387, 17668]

## Build product list

In [28]:
orders_test = orders[orders.eval_set == "test"]

In [82]:
products = []
count = 0
for _,row in orders_test[['user_id', 'order_id']].iterrows():
    count += 1
    if (count)%10000 == 0:
        print(count)    
    
    user_id, order_id = row['user_id'], row['order_id']
    n = int(user_basket_avg[user_id])+1
    products.append(list(features_test[features_test.user_id == user_id].product_id[:n]))

10000
20000
30000
40000
50000
60000
70000


In [83]:
# create submission
submission = pd.DataFrame()
submission['order_id'] = orders_test['order_id']
submission['products'] = [' '.join([str(x) for x in p]) for p in products]
submission.to_csv('submission_3.csv', index=False)

In [84]:
submission.head()

Unnamed: 0,order_id,products
38,2774568,39190 47766 21903 9387 17668 18599 43961 1819
44,329954,35469 1200 2707 7160
53,1528013,21903 38293 8424 10644 11068
96,1376945,8309 27959 14947 35948 8670 28465 34658 35640 ...
102,1356845,7076 10863 13176 14992 5746 8239 20350 21616 2...


In [15]:
order_products_prior['add_to_cart_order']/order_products_prior['order_id'].map

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
0,2,33120,1,1,202279
1,2,28985,2,1,202279
2,2,9327,3,0,202279
3,2,45918,4,1,202279
4,2,30035,5,0,202279


In [20]:
add_to_cart_order_avg = order_products_prior.groupby(['user_id', 'product_id'])[['add_to_cart_order']].mean()

In [22]:
#add_to_cart_order_avg.to_csv('./save/add_to_cart_order_avg.csv')

In [23]:
add_to_cart_order_avg.reset_index(inplace=True)

In [25]:
len(add_to_cart_order_avg)

13307953

In [26]:
len(features)

13307953

In [27]:
features = pd.merge(features, add_to_cart_order_avg, on=['user_id', 'product_id'], how='left')

In [28]:
features.head()

Unnamed: 0,user_id,product_id,product_count,product_part,add_to_cart_order
0,1,196,10,1.0,1.4
1,1,10258,9,0.9,3.333333
2,1,10326,1,0.1,5.0
3,1,12427,10,1.0,3.3
4,1,13032,3,0.3,6.333333


In [29]:
features.to_csv('./save/features_2.csv')

In [63]:
features[['product_part', 'add_to_cart_order']].corr()

Unnamed: 0,product_part,add_to_cart_order
product_part,1.0,-0.121252
add_to_cart_order,-0.121252,1.0


In [64]:
orders_prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,item_count
0,2539329,1,prior,1,2,8,,5
1,2398795,1,prior,2,3,7,15.0,6
2,473747,1,prior,3,3,12,21.0,5
3,2254736,1,prior,4,4,7,29.0,5
4,431534,1,prior,5,4,15,28.0,8


In [23]:
order_number_total = order_products_prior.groupby(['user_id', 'product_id'])[['order_number']].aggregate('sum')

In [24]:
order_number_total.reset_index(inplace = True)

In [33]:
order_number_total['recent'] = order_number_total['order_number']/order_number_total['user_id'].map(user_orders_prior_count)

In [37]:
order_number_total.head()

Unnamed: 0,user_id,product_id,recent
0,1,196,5.5
1,1,10258,5.4
2,1,10326,0.5
3,1,12427,5.5
4,1,13032,1.9


In [38]:
features.head()

NameError: name 'features' is not defined