In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb

print('loading aisles...')
aisles = pd.read_csv('aisles.csv', dtype={
        'aisle_id': np.uint16,
        'aisle': 'category'})

print('loading department...')
department = pd.read_csv('departments.csv', dtype={
            'department_id': np.uint8,
            'department': 'category'})

print('loading products...')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.uint32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

print('loading prior orders...')
prior = pd.read_csv('order_products__prior.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16})

print('loading train orders...')
train = pd.read_csv('order_products__train.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint8})

print('loading orders...')
order = pd.read_csv('orders.csv' , dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint16,
        'days_since_prior_order': np.float32})

loading aisles...
loading department...
loading products...
loading prior orders...
loading train orders...
loading orders...


In [None]:
mean_days_since_prior_order = np.array(order['days_since_prior_order'].groupby(order.user_id).mean().astype(np.float32))
j=0
print('size of mean_days_since_prior_order: ', len(mean_days_since_prior_order))
for i, item in enumerate(order['days_since_prior_order']):
    if np.isnan(item):
        order.loc[i, 'days_since_prior_order'] = mean_days_since_prior_order[j]
        j+=1
order.to_csv('orders_with_NaN_estimate.csv', index=False)

In [2]:
train_orders = order[order.eval_set == 'train'].copy()
test_orders = order[order.eval_set == 'test'].copy()
prior_orders = order[order.eval_set == 'prior'].copy()
train_prior_orders = pd.concat([train_orders, prior_orders], ignore_index=True)

train_prior = pd.concat([train, prior],ignore_index=True)
order.set_index('order_id', inplace=True, drop=False)
train_prior = train_prior.join(order, on='order_id', rsuffix='_')
train_prior.drop('order_id_', inplace=True, axis=1)


In [3]:
users = pd.DataFrame()
users['nb_order'] = train_prior_orders.groupby('user_id').size().astype(np.uint16)
users['avg_days_between_order'] = train_prior.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = train_prior.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = train_prior.groupby('user_id').size().astype(np.uint16)
users['all_products'] = train_prior.groupby('user_id')['product_id'].apply(set) # apply 对每个行或者列调用一次函数
users['nb_distinct_items'] = (users['all_products'].map(len)).astype(np.uint16) #map 对每个元素(element-wise)调用一次函数
users['average_basket'] = (users.nb_total_items / users.nb_order).astype(np.float32)
users['min_days_of_week'] = train_prior.groupby('user_id')['order_dow'].apply(min).astype(np.uint8)
users['max_days_of_week'] = train_prior.groupby('user_id')['order_dow'].apply(max).astype(np.uint8)
users.set_index('user_id', inplace=True, drop=False)

KeyError: 'user_id'

In [None]:
# order_id X user_id related features
features = pd.DataFrame()
features['order_id'] = train_prior_orders['order_id']
features['user_id'] = train_prior_orders['user_id']
features['order_number'] = train_prior_orders['order_number']
features['order_dow'] = train_prior_orders['order_dow']
features['order_hour_of_day'] = train_prior_orders['order_hour_of_day']
# features['days_since_prior_order'] = train_prior_orders['days_since_prior_order']
features['user_nb_order'] = features.user_id.map(users.nb_order)
features['user_avg_days_between_order'] = features.user_id.map(users.avg_days_between_order)
features['user_avg_hour_of_day'] = features.user_id.map(users.avg_hour_of_day)
features['user_nb_total_items'] = features.user_id.map(users.nb_total_items)
features['user_nb_distinct_items'] = features.user_id.map(users.nb_distinct_items)
features['user_average_basket'] = features.user_id.map(users.average_basket)
features['user_min_days_of_week'] = features.user_id.map(users.min_days_of_week)
features['user_max_days_of_week'] = features.user_id.map(users.max_days_of_week)
features.set_index(['order_id', 'user_id'], inplace=True, drop=False)
del users


In [None]:
def gen_features(orders, info):
    g_features = pd.DataFrame
    g_features['order_id'] = orders.order_id
    g_features['user_id'] = orders.user_id
    g_features['order_number'] = orders.order_id.map(features.order_number)
    g_features['order_dow'] = orders.order_id.map(features.order_dow)
    g_features['order_hour_of_day'] = orders.order_id.map(features.order_hour_of_day)
    g_features['user_nb_order'] = orders.user_id.map(features.user_nb_order)
    g_features['user_avg_days_between_order'] = orders.user_id.map(features.user_avg_days_between_order)
    g_features['user_avg_hour_of_day'] = orders.user_id.map(features.user_avg_hour_of_day)
    g_features['user_nb_total_items'] = orders.user_id.map(features.user_nb_total_items)
    g_features['user_nb_distinct_items'] = orders.user_id.map(features.user_nb_distinct_items)
    g_features['user_average_basket'] = orders.user_id.map(features.user_average_basket)
    g_features['user_min_days_of_week'] = orders.user_id.map(features.user_min_days_of_week)
    g_features['user_max_days_of_week'] = orders.user_id.map(features.user_max_days_of_week)
    df_order = pd.DataFrame()
    df_order['order_id'] = info.groupby('order_id')['order_id'].apply(lambda x: x.iloc[0])
    df_order['nb_reorder'] = info.groupby('order_id')['reordered'].sum().apply(np.uint16)
    df_order['reorder_label'] = df_order['nb_reorder'].astype(np.int8)
    df_order.set_index('order_id', inplace=True, drop=False)
    g_features['label'] = g_features.order_id.map(df_order.reorder_label)
    return g_features
train_set = gen_features(prior_orders, prior)
valid_set = gen_features(train_orders, train)
        

In [None]:
display(prior_orders)