In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, date

import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

In [2]:
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

In [3]:
orders = pd.read_csv('../data/monthwise/jun_AP.csv', index_col=0)
print('orders -- dimensions: {0};   size: {1}'.format(orders.shape, size(orders)))
display(orders.head())

orders -- dimensions: (90918, 4);   size: 9.82 MB


Unnamed: 0,TXNDATE,TXNID,SKU,QUANTITY
41446,2017-06-01 05:03:46,170601090330327106,70463,1.0
41447,2017-06-01 05:05:17,170601090449372306,65719,6.0
41448,2017-06-01 05:10:06,170601090934606506,71532,18.0
41449,2017-06-01 05:17:37,170601091713850006,71604,3.0
41450,2017-06-01 05:27:19,170601092159340006,65220,1.0


In [4]:
# Convert from DataFrame to a Series, with order_id as index and item_id as value

orders = orders.set_index('TXNID')['SKU'].rename('item_id')
display(orders.head(10))
type(orders)

TXNID
170601090330327106    70463
170601090449372306    65719
170601090934606506    71532
170601091713850006    71604
170601092159340006    65220
170601092159340006    70427
170601092159340006    64514
170601092159340006    70402
170601092159340006    67198
170601092159340006    71333
Name: item_id, dtype: int64

pandas.core.series.Series

In [5]:
print('dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'
      .format(orders.shape, size(orders), len(orders.index.unique()), len(orders.value_counts())))

dimensions: (90918,);   size: 1.45 MB;   unique_orders: 40790;   unique_items: 8437


## Helper functions to the main association rules function

In [6]:
# Returns frequency counts for items and item pairs
def freq(iterable):
    if type(iterable) == pd.core.series.Series:
        return iterable.value_counts().rename("freq")
    else: 
        return pd.Series(Counter(iterable)).rename("freq")

    
# Returns number of unique orders
def order_count(order_item):
    return len(set(order_item.index))


# Returns generator that yields item pairs, one at a time
def get_item_pairs(order_item):
    order_item = order_item.reset_index().as_matrix()
    for order_id, order_object in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
              
        for item_pair in combinations(item_list, 2):
            yield item_pair
            

# Returns frequency and support associated with item
def merge_item_stats(item_pairs, item_stats):
    return (item_pairs
                .merge(item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'}), left_on='item_A', right_index=True)
                .merge(item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'}), left_on='item_B', right_index=True))


# Returns name associated with item
def merge_item_name(rules, item_name):
    columns = ['itemA','itemB','freqAB','supportAB','freqA','supportA','freqB','supportB', 
               'confidenceAtoB','confidenceBtoA','lift']
    rules = (rules
                .merge(item_name.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
                .merge(item_name.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id'))
    return rules[columns] 

## Association rules function

In [7]:
def association_rules(order_item, min_support):

    print("Starting order_item: {:22d}".format(len(order_item)))


    # Calculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Filter from order_item items below min support 
    qualifying_items       = item_stats[item_stats['support'] >= min_support].index
    order_item             = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Filter from order_item orders with less than 2 items
    order_size             = freq(order_item.index)
    qualifying_orders      = order_size[order_size >= 2].index
    order_item             = order_item[order_item.index.isin(qualifying_orders)]

    print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Recalculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Get item pairs generator
    item_pair_gen          = get_item_pairs(order_item)


    # Calculate item pair frequency and support
    item_pairs              = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("Item pairs: {:31d}".format(len(item_pairs)))


    # Filter from item_pairs those below min support
    item_pairs              = item_pairs[item_pairs['supportAB'] >= min_support]

    print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))


    # Create table of association rules and compute relevant metrics
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)
    
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    item_pairs['lift']           = item_pairs['supportAB'] / (item_pairs['supportA'] * item_pairs['supportB'])
    
    
    # Return association rules sorted by lift in descending order
    return item_pairs.sort_values('lift', ascending=False)

In [8]:
%%time
rules = association_rules(orders, 0.1) 

Starting order_item:                  90918
Items with support >= 0.1:             424
Remaining order_item:                 40374
Remaining orders with 2+ items:        8934
Remaining order_item:                 22165
Item pairs:                           12214
Item pairs with support >= 0.1:        157

CPU times: user 235 ms, sys: 49 ms, total: 284 ms
Wall time: 282 ms


In [9]:
# Replace item ID with item name and display association rules
item_name   = pd.read_csv('../data/itemlist.csv', index_col=0)
item_name   = item_name[(item_name['DEPID'] != 65) & (item_name['DEPID'] != 21)]
item_name   = item_name.rename(columns={'SKU':'item_id', 'KEYWORD':'item_name'})
rules_final = merge_item_name(rules, item_name).sort_values('lift', ascending=False)
display(rules_final)

Unnamed: 0,itemA,itemB,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
0,KEG DEPOSIT,TAP DEPOSIT,65,0.727558,69,0.77233,65,0.727558,0.942029,1.0,1.294783
15,DI AMORE SAMBUCA 750 ML,MARLBORO GOLD 100,9,0.100739,22,0.24625,54,0.604433,0.409091,0.166667,0.676818
17,CK MONDAVI CABERNET 1.5 LT,CK MONDAVI CHARDONNAY 1.5 LT,12,0.134318,44,0.492501,37,0.414148,0.272727,0.324324,0.658526
18,R STRONG SONOMA CHARD 750 ML 2016,SIMI CHARDONNAY 750 ML,12,0.134318,40,0.447728,65,0.727558,0.3,0.184615,0.412338
19,PARLIAMENT WHITE,CONCHA Y TORO FRONTERA CAB 1.5 LT,12,0.134318,103,1.152899,27,0.302216,0.116505,0.444444,0.385502
20,CUPCAKE CHARDONNAY 750 ML,CHAT ST MICHELLE CHARDONNAY 750 ML,13,0.145512,58,0.649205,66,0.738751,0.224138,0.19697,0.303401
23,WOODBRIDGE CAB 1.5 LT,WOODBRIDGE PG 1.5 LT,11,0.123125,55,0.615626,60,0.671592,0.2,0.183333,0.2978
24,J LOHR CHARDONNAY RIVERSTONE 750 ML,HESS SELECT CHARDONNAY 750 ML,9,0.100739,70,0.783524,41,0.458921,0.128571,0.219512,0.28016
16,RUFFINO LUMINA PINOT GRIGI 1.5 LT,MARLBORO GOLD 100,10,0.111932,63,0.705171,54,0.604433,0.15873,0.185185,0.26261
25,CAVIT PINOT GRIGIO 750 ML,BOGLE MERLOT 750 ML,14,0.156705,70,0.783524,75,0.83949,0.2,0.186667,0.23824
