In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.float_format', lambda x: '%.4f' % x)


In [2]:
print('loading prior')
priors = pd.read_csv('order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv('order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv('orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))


loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [3]:
### product features

print('computing products: (count_orders, count_reorders, reorder_rate)')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
products.head()

computing products: (count_orders, count_reorders, reorder_rate)


Unnamed: 0_level_0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,61,19,1852.0,1136.0,0.6134
2,2,104,13,90.0,12.0,0.1333
3,3,94,7,277.0,203.0,0.7329
4,4,38,1,329.0,147.0,0.4468
5,5,5,13,15.0,9.0,0.6


In [4]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)
priors.head()

add order info to priors


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [5]:
### user features

print('computing user: (average_days_between_orders, nb_orders)')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

print('computing user: (total_items, all_products, total_distinct_items)')
users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
del usr
print('computing user: (average_basket)')
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('Shape of user :', users.shape)
users.head()

computing user: (average_days_between_orders, nb_orders)
computing user: (total_items, all_products, total_distinct_items)
computing user: (average_basket)
Shape of user : (206209, 6)


Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.0,11,5.3636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.2857,15,13.0
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.0,13,6.7692
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.0,6,3.0
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.5,5,7.4


In [6]:
### user X product features

print('compute userXproduct f - this is long...')
priors['user_product'] = priors.product_id + priors.user_id * 100000
priors.head()

compute userXproduct f - this is long...


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
0,2,33120,1,1,202279,prior,3,5,9,8.0,20227933120
1,2,28985,2,1,202279,prior,3,5,9,8.0,20227928985
2,2,9327,3,0,202279,prior,3,5,9,8.0,20227909327
3,2,45918,4,1,202279,prior,3,5,9,8.0,20227945918
4,2,30035,5,0,202279,prior,3,5,9,8.0,20227930035


In [7]:
d= dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1, (row.order_number, row.order_id), row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1, max(d[z][1], (row.order_number, row.order_id)), d[z][2] + row.add_to_cart_order)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient = 'index')
userXproduct.head(10)

to dataframe (less memory)


Unnamed: 0,0,1,2
20227933120,5,"(8, 104690)",10
20227928985,5,"(6, 132412)",16
20227909327,1,"(3, 2)",3
20227945918,5,"(7, 2382766)",24
20227930035,3,"(7, 2382766)",14
20227917794,7,"(7, 2382766)",25
20227940141,5,"(6, 132412)",29
20227901819,2,"(3, 2)",19
20227943668,3,"(6, 132412)",20
20597033754,17,"(25, 368699)",86


In [8]:
# This was to slow !!
#def last_order(order_group):
#    ix = order_group.order_number.idxmax
#    return order_group.shape[0], order_group.order_id[ix],  order_group.add_to_cart_order.mean()
#userXproduct = pd.DataFrame()
#userXproduct['tmp'] = df.groupby('user_product').apply(last_order)

d= dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1, (row.order_number, row.order_id), row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1, max(d[z][1], (row.order_number, row.order_id)), d[z][2] + row.add_to_cart_order)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient = 'index')
del d

userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart.astype(np.int16)
print('user X product f', len(userXproduct))

del priors
userXproduct.head()

to dataframe (less memory)
user X product f 13307953


Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart
20227933120,5,104690,10
20227928985,5,132412,16
20227909327,1,2,3
20227945918,5,2382766,24
20227930035,3,2382766,14


In [9]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)


split orders : train, test


In [10]:
def features(selected_orders, labels_given=False):
    print('build candidate list: (order_list, product_list)')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%20000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id] ###
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    print("Size of candidate list :", df.shape[0])
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
                                                  df.order_id.map(orders.order_dow)
    df.UP_same_dow_as_last_order = df.UP_same_dow_as_last_order.astype("category").cat.codes

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
#    print(df.dtypes)
#    print(df.memory_usage())
    print("Shape of training set :", df.shape)
    return (df, labels)


In [11]:
df_train, labels = features(train_orders, labels_given = True) 
df_train.head()

build candidate list: (order_list, product_list)
order row 20000
order row 40000
order row 60000
order row 80000
order row 100000
order row 120000
Size of candidate list : 8474661
user related features
order related features
product related features
user_X_product related features
Shape of training set : (8474661, 23)


Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,dow,order_hour_of_day,days_since_prior_order,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,UP_same_dow_as_last_order
0,1187899,17122,11,59,18,19.0,5.3636,4,8,14.0,...,13880,9377.0,0.6756,1,0.0909,6.0,0.0909,6,7,1
1,1187899,196,11,59,18,19.0,5.3636,4,8,14.0,...,35791,27791.0,0.7765,10,0.9091,1.4,0.9091,1,0,1
2,1187899,26405,11,59,18,19.0,5.3636,4,8,14.0,...,1214,536.0,0.4415,2,0.1818,5.0,0.1818,7,1,1
3,1187899,46149,11,59,18,19.0,5.3636,4,8,14.0,...,8558,6953.0,0.8125,3,0.2727,3.0,0.2727,1,0,1
4,1187899,14084,11,59,18,19.0,5.3636,4,8,14.0,...,15935,12923.0,0.811,1,0.0909,2.0,0.0909,10,0,0


In [12]:
df_train = df_train.fillna(value = 0)

y = pd.DataFrame(labels, columns=["reordered"])
class_weight_0 = (y.shape[0] - np.sum(y.reordered)) / y.shape[0]
class_weight_1 = np.sum(y.reordered) / y.shape[0]
print("Class weight of 0 :", class_weight_0)
print("Class weight of 1 :", class_weight_1)

Class weight of 0 : 0.902199745807
Class weight of 1 : 0.0978002541931


In [13]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from sklearn.linear_model import LogisticRegression

tree = DecisionTreeClassifier(criterion = 'entropy', 
                              max_depth = None, 
                              random_state = 0,
                              class_weight = {1: class_weight_1, 0: class_weight_0})




NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df_train, y, test_size=0.2, random_state=0)

In [None]:
def validation(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val).reshape(y_val.shape[0], 1)
    print('Misclassified samples: %d' % (y_val != y_pred).sum())
    print('Accuracy : %.2f' % ((y_val == y_pred).sum() / y_val.shape[0]))
    print('Accuracy (sklearn): %.2f' % accuracy_score(y_val, y_pred))
    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_val, y_pred, labels = [0,1], sample_weight = None))
    print("=== F1-score ===")
    print(f1_score(y_val, y_pred, labels=[0,1]))


In [None]:
validation(tree)

In [None]:
lg.fit(df_train, y)
print(str(lg), "training down")

df_test, _ = features(test_orders)

print('predict')
preds = lg.predict(df_test)

df_test['pred'] = preds

TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)
