In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [2]:
# store the data to hdf format for faster I/O
# df = pd.read_excel("Online Retail.xlsx")
# df.to_hdf('online_retail.h5','original')

In [3]:
df_original = pd.read_hdf('online_retail.h5','original')
# df_original.head()

In [4]:
# count missing values
df_original.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
# remove rows without a customer id
df = df_original[df_original.CustomerID.notnull()].copy()

# convert columns to appropriate data types
df.CustomerID = df.CustomerID.astype(np.int16)

# remove canceled items
canceled = df[df.Quantity < 0]
canceled = canceled[['CustomerID','StockCode','Quantity']]
df = df.merge(canceled,how='left',on=['CustomerID','StockCode'],suffixes=('', '_c'))
df.Quantity_c = df.Quantity_c.fillna(0)
df.Quantity = df.Quantity + df.Quantity_c
df = df.drop('Quantity_c',1)

# remove non-commodity rows
df = df[(df.Quantity > 0) & (~df.StockCode.isin(['BANK CHARGES', 'C2', 'DOT', 'PADS', 'POST','M']))]

# remove customers with less then 4 orders
num_invoices = df.groupby('CustomerID')['InvoiceNo'].unique().apply(len)
customers_to_use = num_invoices[num_invoices > 3].index
df = df[df.CustomerID.isin(customers_to_use)]
# df.to_hdf('online_retail.h5','clean')
# df.head()

In [6]:
# extract products and customers to seperate tables
products = df[['StockCode','Description']].sort_values(['StockCode','Description']).drop_duplicates('StockCode')
customers = df.sort_values('CustomerID')[['CustomerID','Country']].drop_duplicates('CustomerID')

In [7]:
# extract orders to a seperate table
orders = df[['CustomerID','InvoiceNo','InvoiceDate']].sort_values(['CustomerID','InvoiceDate']).drop_duplicates('InvoiceNo')

# create new features based on the InvoiceDate feature
orders['order_number'] = orders.groupby('CustomerID').cumcount() + 1
orders['order_dow'] = orders.InvoiceDate.dt.dayofweek
orders['order_hour_of_day'] = orders.InvoiceDate.dt.hour
orders['days_since_prior'] = (orders.InvoiceDate - orders.groupby('CustomerID')['InvoiceDate'].shift(1)).dt.days

# split all orders into 'prior', 'train' and 'test'
last_orders = orders.groupby('CustomerID')['order_number'].max().reset_index()
n_customers = last_orders.shape[0]
split_point = int(np.ceil(n_customers * 0.8))
random_index = np.random.choice(n_customers,n_customers,replace=False)
train_index = random_index[:split_point]
test_index = random_index[split_point:]
last_orders['eval_set'] = 'placeholder'
last_orders['eval_set'].iloc[train_index] = 'train'
last_orders['eval_set'].iloc[test_index] = 'test'
orders = orders.merge(last_orders,how='left',on=['CustomerID','order_number'])
orders.eval_set = orders.eval_set.fillna('prior')
# orders.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
# create a new table to show what products are in each order
# and calculate add to cart order
order_products = df[['InvoiceNo','StockCode','Quantity']].copy()
order_products['add_to_cart_order'] = order_products.groupby('InvoiceNo').cumcount() + 1
order_products = order_products.merge(orders[['InvoiceNo','eval_set']],how='left',on='InvoiceNo')
# order_products.head()

In [9]:
# calculate whether a product has been ordered before
# takes more than 20 minutes
def previous_orders(InvoiceNo):
    row = orders[orders.InvoiceNo == InvoiceNo]
    CustomerID = int(row.CustomerID)
    order_number = int(row.order_number)
    prev_ord_nums = range(1,order_number)
    df = orders[(orders.CustomerID == CustomerID)&(orders.order_number.isin(prev_ord_nums))]
    return df.InvoiceNo.values

def previous_items(InvoiceNo):
    prev_ord = previous_orders(InvoiceNo)
    df = order_products[order_products.InvoiceNo.isin(prev_ord)]
    return df.StockCode.unique()

prev_item_dict = {}
count = 0
for inv_no in orders.InvoiceNo.unique():
    prev_item_dict[inv_no] = previous_items(inv_no)

order_products['reordered'] = -1
for i in range(order_products.shape[0]):
    row = order_products.iloc[i]
    prev_items = prev_item_dict[row.InvoiceNo]
    order_products.iloc[i,-1] = int(row.StockCode in prev_items)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [14]:
# save data to hdf
# orders.to_hdf('online_retail.h5','orders')
# products.to_hdf('online_retail.h5','products')
# customers.to_hdf('online_retail.h5','customers')
# order_products.to_hdf('online_retail.h5','order_products')

Unnamed: 0,InvoiceNo,StockCode,Quantity,add_to_cart_order,eval_set,reordered
0,536365,85123A,6.0,1,prior,0
1,536365,71053,6.0,2,prior,0
2,536365,84406B,8.0,3,prior,0
3,536365,84029G,6.0,4,prior,0
4,536365,84029E,6.0,5,prior,0
5,536365,22752,2.0,6,prior,0
6,536365,21730,5.0,7,prior,0
7,536365,21730,5.0,8,prior,0
8,536366,22633,6.0,1,prior,0
9,536367,84879,32.0,1,prior,0
