In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, date

import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

from sklearn.cluster import KMeans

In [2]:
AP = 'dec_AP.csv'
ML = 'dec_ML.csv'

In [3]:
orders = pd.read_csv(f'../data/monthwise/{AP}', index_col=0)
item_name   = pd.read_csv('../data/itemlist.csv', index_col=0)
mlDF = pd.read_csv(f"../data/monthwise/{ML}")

In [4]:
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

print('orders -- dimensions: {0};   size: {1}'.format(orders.shape, size(orders)))
display(orders.head())

orders -- dimensions: (63629, 4);   size: 6.87 MB


Unnamed: 0,TXNDATE,TXNID,SKU,QUANTITY
295495,2017-12-01 04:04:14,171201090408778106,71356,2.0
295496,2017-12-01 04:05:41,17120109050837406,65065,1.0
295497,2017-12-01 04:06:15,171201090611267306,64187,1.0
295498,2017-12-01 04:12:01,171201091134149406,70064,2.0
295499,2017-12-01 04:12:59,171201091237342506,71669,1.0


In [5]:
# Convert from DataFrame to a Series, with order_id as index and item_id as value
orders = orders.set_index('TXNID')['SKU'].rename('item_id')
display(orders.head(10))
type(orders)
print('dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'
      .format(orders.shape, size(orders), len(orders.index.unique()), len(orders.value_counts())))

TXNID
171201090408778106    71356
17120109050837406     65065
171201090611267306    64187
171201091134149406    70064
171201091237342506    71669
171201091348409206    71347
171201091540122306    65353
171201091540122306    71097
171201091540122306    71096
171201091540122306    69512
Name: item_id, dtype: int64

dimensions: (63629,);   size: 1.02 MB;   unique_orders: 24903;   unique_items: 7445


## Helper functions to the main association rules function

In [6]:
# Returns frequency counts for items and item pairs
def freq(iterable):
    if type(iterable) == pd.core.series.Series:
        return iterable.value_counts().rename("freq")
    else: 
        return pd.Series(Counter(iterable)).rename("freq")

    
# Returns number of unique orders
def order_count(order_item):
    return len(set(order_item.index))


# Returns generator that yields item pairs, one at a time
def get_item_pairs(order_item):
    order_item = order_item.reset_index().as_matrix()
    for order_id, order_object in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
              
        for item_pair in combinations(item_list, 2):
            yield item_pair
            

# Returns frequency and support associated with item
def merge_item_stats(item_pairs, item_stats):
    return (item_pairs
                .merge(item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'}), left_on='item_A', right_index=True)
                .merge(item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'}), left_on='item_B', right_index=True))


# Returns name associated with item
def merge_item_name(rules, item_name):
    columns = ['itemA','itemB','groupA','groupB','freqAB','freqA','freqB', 
               'confidenceAtoB','confidenceBtoA','lift']
    rules = (rules
                .merge(item_name.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
                .merge(item_name.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id'))
    return rules[columns] 

def merge_item_name2(rules, ML_Final):
    columns = ['item_A','item_B','groupA','groupB','freqAB','freqA','freqB', 
               'confidenceAtoB','confidenceBtoA','lift']
    rules = rules.merge(ML_Final, left_on = 'item_A', right_on = 'SKU')
    rules = rules.rename(columns={'LABEL':'groupA','COST':'costA','MARGIN':'marginA','QUANTITY':'quantityA'})
    rules = rules.rename(columns={'COST_L':'costA_L','MARGIN_L':'marginA_L','QUANTITY_L':'quantityA_L'})
    rules = rules.merge(ML_Final, left_on = 'item_B', right_on = 'SKU')
    rules = rules.rename(columns={'LABEL':'groupB','COST':'costB','MARGIN':'marginB','QUANTITY':'quantityB'})
    rules = rules.rename(columns={'COST_L':'costB_L','MARGIN_L':'marginB_L','QUANTITY_L':'quantityB_L'})
    return rules

## Association rules function

In [7]:
def association_rules(order_item, min_support):

    print("Starting order_item: {:22d}".format(len(order_item)))


    # Calculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Filter from order_item items below min support 
    qualifying_items       = item_stats[item_stats['support'] >= min_support].index
    order_item             = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Filter from order_item orders with less than 2 items
    order_size             = freq(order_item.index)
    qualifying_orders      = order_size[order_size >= 2].index
    order_item             = order_item[order_item.index.isin(qualifying_orders)]

    print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Recalculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Get item pairs generator
    item_pair_gen          = get_item_pairs(order_item)


    # Calculate item pair frequency and support
    item_pairs              = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("Item pairs: {:31d}".format(len(item_pairs)))


    # Filter from item_pairs those below min support
    item_pairs              = item_pairs[item_pairs['supportAB'] >= min_support]

    print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))


    # Create table of association rules and compute relevant metrics
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)
    
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    item_pairs['lift']           = item_pairs['supportAB'] / (item_pairs['supportA'] * item_pairs['supportB'])
    
    
    # Return association rules sorted by lift in descending order
    return item_pairs.sort_values('lift', ascending=False)

In [8]:
%%time
rules = association_rules(orders, 0.1) 

Starting order_item:                  63629
Items with support >= 0.1:             538
Remaining order_item:                 28571
Remaining orders with 2+ items:        6631
Remaining order_item:                 18259
Item pairs:                           15795
Item pairs with support >= 0.1:        139

CPU times: user 245 ms, sys: 46.9 ms, total: 292 ms
Wall time: 284 ms


In [9]:
kmeans = KMeans(n_clusters=9)
kmeans.fit(mlDF[['COST','MARGIN','QUANTITY']])
mlDF['LABEL'] = kmeans.predict(mlDF[['COST','MARGIN','QUANTITY']])

In [10]:
LabelDF = mlDF.groupby(['LABEL']).agg({'COST': 'mean', 'MARGIN': 'mean', 'QUANTITY': 'mean'}).reset_index()
LabelDF=LabelDF.rename(columns={'COST':'COST_L','MARGIN':'MARGIN_L','QUANTITY':'QUANTITY_L'})
LabelDF.head(30)

Unnamed: 0,LABEL,COST_L,MARGIN_L,QUANTITY_L
0,0,10.508979,0.17898,34.043702
1,1,999.59,0.500203,1.0
2,2,9.89189,0.195253,256.416667
3,3,0.725,0.261939,898.5
4,4,90.360096,0.141947,3.867052
5,5,11.944525,0.134661,99.298429
6,6,9.62317,0.222412,5.350439
7,7,249.611667,0.123424,3.851852
8,8,34.118031,0.147846,4.623201


In [11]:
ML_Final = mlDF.merge(LabelDF, on='LABEL')

In [12]:
ML_Final.head()

Unnamed: 0,SKU,COST,MARGIN,QUANTITY,LABEL,COST_L,MARGIN_L,QUANTITY_L
0,50054,6.67,0.151244,2.0,6,9.62317,0.222412,5.350439
1,50234,20.84,0.227862,1.0,6,9.62317,0.222412,5.350439
2,50371,6.03,0.329255,10.0,6,9.62317,0.222412,5.350439
3,50409,9.34,0.150136,2.0,6,9.62317,0.222412,5.350439
4,50474,9.0,0.099099,15.0,6,9.62317,0.222412,5.350439


In [13]:
rules = merge_item_name2(rules, ML_Final)

In [14]:
# Replace item ID with item name and display association rules
item_name   = item_name[(item_name['DEPID'] != 65) & (item_name['DEPID'] != 21)]
item_name   = item_name.rename(columns={'SKU':'item_id', 'KEYWORD':'item_name'})
rules_final = merge_item_name(rules, item_name).sort_values('lift', ascending=False)

In [15]:
rules['MONTH']=12
rules.to_csv('Tableau/dec.csv')

In [16]:
rules_final

Unnamed: 0,itemA,itemB,groupA,groupB,freqAB,freqA,freqB,confidenceAtoB,confidenceBtoA,lift
13,MARLBORO MENTHOL GOLD 100'S,MARLBORO GOLD 100,0,5,13,17,48,0.764706,0.270833,1.056409
14,ANDRE EXTRA DRY 750 ML,STERLING VODKA 1.75 LT,5,0,10,34,23,0.294118,0.434783,0.847954
15,19 CRIMES HARD CHARDONNAY 750 ML,19 CRIMES SHIRAZ 2015 750 ML,0,0,9,24,36,0.375000,0.250000,0.690729
31,CONCHA Y TORO FRONTERA MERLOT 1.5 LT,CONCHA Y TORO FRONTERA CAB 1.5 LT,0,5,11,26,41,0.423077,0.268293,0.684250
33,APOTHIC DARK 2013 750 ML,APOTHIC CRUSH 750 ML,0,5,12,31,38,0.387097,0.315789,0.675484
18,19 CRIMES HARD CHARDONNAY 750 ML,19 CRIMES THE BANISHED 750 ML,0,5,12,24,57,0.500000,0.210526,0.581667
23,19 CRIMES SHIRAZ 2015 750 ML,19 CRIMES THE UPRISING 750 ML,0,0,11,36,37,0.305556,0.297297,0.547605
12,PEANUTS ALL TYPES 2/1.09,PEANUTS ALL TYPES 2/1.09,5,5,18,47,47,0.382979,0.382979,0.540326
21,19 CRIMES THE BANISHED 750 ML,19 CRIMES THE UPRISING 750 ML,5,0,17,57,37,0.298246,0.459459,0.534505
20,19 CRIMES HARD CHARDONNAY 750 ML,19 CRIMES THE UPRISING 750 ML,0,0,7,24,37,0.291667,0.189189,0.522714
