#### Codes from this jupyter notebook were taken or adapted from CSCI 5523 project 2 experiment 2.  

In [1]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

In [2]:
# Function that returns the size of an object in MB
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

### Data Input

#### Load order_products__prior

In [3]:
orders = pd.read_csv('order_products__prior.csv')
print('orders -- dimensions: {0};   size: {1}'.format(orders.shape, size(orders)))
display(orders.head())

orders -- dimensions: (32434489, 4);   size: 1037.90 MB


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


#### Convert order_products__prior into the format compatible with the Association Rules Function shown below.

In [4]:
# Convert from DataFrame to a Series, with order_id as index and item_id as value
orders = orders.set_index('order_id')['product_id'].rename('item_id')
display(orders.head(10))
type(orders)

order_id
2    33120
2    28985
2     9327
2    45918
2    30035
2    17794
2    40141
2     1819
2    43668
3    33754
Name: item_id, dtype: int64

pandas.core.series.Series

#### Summary statistics of order_products__prior

In [5]:
print('dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'
      .format(orders.shape, size(orders), len(orders.index.unique()), len(orders.value_counts())))

dimensions: (32434489,);   size: 518.95 MB;   unique_orders: 3214874;   unique_items: 49677


### Association Rules Function
Credits to the CSCI 5523 instructor and TAs. 

#### Define helper functions for the main association rules function

In [6]:
# Returns frequency counts for items and item pairs
def freq(iterable):
    if type(iterable) == pd.core.series.Series:
        return iterable.value_counts().rename("freq")
    else: 
        return pd.Series(Counter(iterable)).rename("freq")

    
# Returns number of unique orders
def order_count(order_item):
    return len(set(order_item.index))


# Returns generator that yields item pairs, one at a time
def get_item_pairs(order_item):
    order_item = order_item.reset_index().to_numpy()
    for order_id, order_object in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
              
        for item_pair in combinations(item_list, 2):
            yield item_pair
            

# Returns frequency and support associated with item
def merge_item_stats(item_pairs, item_stats):
    return (item_pairs
                .merge(item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'}), left_on='item_A', right_index=True)
                .merge(item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'}), left_on='item_B', right_index=True))


# Returns name associated with item
def merge_item_name(rules, item_name):
    columns = ['itemA','itemB','freqAB','supportAB','freqA','supportA','freqB','supportB', 
               'confidenceAtoB','confidenceBtoA','lift']
    rules = (rules
                .merge(item_name.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
                .merge(item_name.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id'))
    return rules[columns]               

#### Define association rules function

In [7]:
def association_rules(order_item, min_support):

    print("Starting order_item: {:22d}".format(len(order_item)))


    # Calculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Filter from order_item items below min support 
    qualifying_items       = item_stats[item_stats['support'] >= min_support].index
    order_item             = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Filter from order_item orders with less than 2 items
    order_size             = freq(order_item.index)
    qualifying_orders      = order_size[order_size >= 2].index
    order_item             = order_item[order_item.index.isin(qualifying_orders)]

    print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Recalculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Get item pairs generator
    item_pair_gen          = get_item_pairs(order_item)


    # Calculate item pair frequency and support
    item_pairs              = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("Item pairs: {:31d}".format(len(item_pairs)))


    # Filter from item_pairs those below min support
    item_pairs              = item_pairs[item_pairs['supportAB'] >= min_support]

    print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))


    # Create table of association rules and compute relevant metrics
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)
    
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    item_pairs['lift']           = item_pairs['supportAB'] / (item_pairs['supportA'] * item_pairs['supportB'])
    
    
    # Return association rules sorted by lift in descending order
    return item_pairs.sort_values('lift', ascending=False)

### Association Rules Mining

In [8]:
!mkdir -p rules

#### Find rules using min_support = 0.05

In [9]:
%%time
rules_05 = association_rules(orders, 0.05)  
rules_05

Starting order_item:               32434489
Items with support >= 0.05:            3402
Remaining order_item:              24413014
Remaining orders with 2+ items:     2873696
Remaining order_item:              24158959
Item pairs:                         8089434
Item pairs with support >= 0.05:       4666

CPU times: user 2min 23s, sys: 3.75 s, total: 2min 27s
Wall time: 2min 27s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
2263,38312,15984,1640,0.057069,5705,0.198525,4710,0.163900,0.287467,0.348195,1.753913
3613,4962,38544,1533,0.053346,6113,0.212723,6267,0.218082,0.250777,0.244615,1.149923
1033,44156,23296,1608,0.055956,7614,0.264955,6364,0.221457,0.211190,0.252671,0.953639
2458,26131,8490,1437,0.050005,7209,0.250862,6547,0.227825,0.199334,0.219490,0.874944
3780,44156,33548,1528,0.053172,7614,0.264955,7748,0.269618,0.200683,0.197212,0.744323
...,...,...,...,...,...,...,...,...,...,...,...
4194,25890,13176,1573,0.054738,49811,1.733343,374844,13.043968,0.031579,0.004196,0.002421
306,5876,24852,3408,0.118593,87330,3.038944,468088,16.288710,0.039024,0.007281,0.002396
2761,4605,13176,2182,0.075930,72935,2.538021,374844,13.043968,0.029917,0.005821,0.002294
3044,8174,24852,1477,0.051397,42579,1.481681,468088,16.288710,0.034688,0.003155,0.002130


In [10]:
rules_05.to_csv("rules/rules_05.csv",index=False)

#### Find rules using min_support = 0.04

In [11]:
%%time
rules_04 = association_rules(orders, 0.04)  
rules_04

Starting order_item:               32434489
Items with support >= 0.04:            4119
Remaining order_item:              25437917
Remaining orders with 2+ items:     2906194
Remaining order_item:              25200624
Item pairs:                        10668702
Item pairs with support >= 0.04:       6569

CPU times: user 2min 44s, sys: 4.54 s, total: 2min 48s
Wall time: 2min 48s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
2803,38312,15984,1640,0.056431,5708,0.196408,4710,0.162068,0.287316,0.348195,1.772816
2804,48220,15984,1289,0.044354,4703,0.161827,4710,0.162068,0.274080,0.273673,1.691148
2802,38312,48220,1376,0.047347,5708,0.196408,4703,0.161827,0.241065,0.292579,1.489649
4666,4962,38544,1533,0.052749,6117,0.210481,6269,0.215712,0.250613,0.244537,1.161796
2665,12745,17553,1196,0.041153,8014,0.275756,4318,0.148579,0.149239,0.276980,1.004440
...,...,...,...,...,...,...,...,...,...,...,...
1292,16759,24852,1318,0.045351,35700,1.228411,468658,16.126177,0.036919,0.002812,0.002289
1626,8424,13176,1197,0.041188,41932,1.442849,375296,12.913660,0.028546,0.003189,0.002211
3853,8174,24852,1477,0.050822,42602,1.465904,468658,16.126177,0.034670,0.003152,0.002150
1357,28842,13176,1197,0.041188,45365,1.560976,375296,12.913660,0.026386,0.003189,0.002043


In [12]:
rules_04.to_csv("rules/rules_04.csv",index=False)

#### Find rules using min_support = 0.03

In [13]:
%%time
rules_03 = association_rules(orders, 0.03)  
rules_03

Starting order_item:               32434489
Items with support >= 0.03:            5213
Remaining order_item:              26655813
Remaining orders with 2+ items:     2941274
Remaining order_item:              26436500
Item pairs:                        14538268
Item pairs with support >= 0.03:      10049

CPU times: user 2min 56s, sys: 4.74 s, total: 3min
Wall time: 3min


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
3608,38312,15984,1640,0.055758,5711,0.194168,4714,0.160271,0.287165,0.347900,1.791751
3609,48220,15984,1289,0.043825,4703,0.159897,4714,0.160271,0.274080,0.273441,1.710109
5148,5491,30192,930,0.031619,5577,0.189612,3234,0.109952,0.166756,0.287570,1.516623
3607,38312,48220,1376,0.046782,5711,0.194168,4703,0.159897,0.240939,0.292579,1.506839
7811,35633,23291,970,0.032979,4648,0.158027,4644,0.157891,0.208692,0.208872,1.321749
...,...,...,...,...,...,...,...,...,...,...,...
7,13176,23909,1013,0.034441,375692,12.773104,36515,1.241469,0.002696,0.027742,0.002172
1682,28842,13176,1197,0.040697,45389,1.543175,375692,12.773104,0.026372,0.003186,0.002065
3568,29487,13176,1033,0.035121,40017,1.360533,375692,12.773104,0.025814,0.002750,0.002021
5527,12341,24852,1482,0.050386,49077,1.668563,469178,15.951523,0.030197,0.003159,0.001893


In [14]:
rules_03.to_csv("rules/rules_03.csv",index=False)

#### Find rules using min_support = 0.02

In [15]:
%%time
rules_02 = association_rules(orders, 0.02)  
rules_02

Starting order_item:               32434489
Items with support >= 0.02:            6985
Remaining order_item:              28050418
Remaining orders with 2+ items:     2975061
Remaining order_item:              27848926
Item pairs:                        20384055
Item pairs with support >= 0.02:      18153

CPU times: user 3min 20s, sys: 6.19 s, total: 3min 26s
Wall time: 3min 26s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
6254,13269,44786,860,0.028907,2856,0.095998,2271,0.076335,0.301120,0.378688,3.944745
2139,10339,49519,660,0.022184,3108,0.104468,2025,0.068066,0.212355,0.325926,3.119850
11250,32018,38141,606,0.020369,2782,0.093511,2167,0.072839,0.217829,0.279649,2.990560
17767,44786,13269,642,0.021579,2271,0.076335,2856,0.095998,0.282695,0.224790,2.944798
8728,13269,6508,640,0.021512,2856,0.095998,2567,0.086284,0.224090,0.249318,2.597119
...,...,...,...,...,...,...,...,...,...,...,...
14284,44142,13176,1008,0.033882,42881,1.441349,376021,12.639102,0.023507,0.002681,0.001860
12419,24852,196,864,0.029041,469627,15.785458,32674,1.098263,0.001840,0.026443,0.001675
16225,16797,21137,706,0.023731,141660,4.761583,263285,8.849735,0.004984,0.002682,0.000563
16977,21137,16797,640,0.021512,263285,8.849735,141660,4.761583,0.002431,0.004518,0.000511


In [16]:
rules_02.to_csv("rules/rules_02.csv",index=False)

#### Find rules using min_support = 0.01

In [17]:
%%time
rules_01 = association_rules(orders, 0.01)  
rules_01

Starting order_item:               32434489
Items with support >= 0.01:           10906
Remaining order_item:              29843570
Remaining orders with 2+ items:     3013325
Remaining order_item:              29662716
Item pairs:                        30622410
Item pairs with support >= 0.01:      48751

CPU times: user 4min, sys: 9.08 s, total: 4min 9s
Wall time: 4min 9s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
41617,29126,36361,306,0.010155,1163,0.038595,839,0.027843,0.263113,0.364720,9.449868
30308,7076,17766,318,0.010553,1809,0.060033,879,0.029170,0.175788,0.361775,6.026229
37869,12820,11212,349,0.011582,1518,0.050376,1249,0.041449,0.229908,0.279424,5.546732
22767,32201,44781,409,0.013573,1666,0.055288,1391,0.046162,0.245498,0.294033,5.318230
10217,28613,45636,351,0.011648,1731,0.057445,1149,0.038131,0.202773,0.305483,5.317849
...,...,...,...,...,...,...,...,...,...,...,...
38457,21137,16797,640,0.021239,263416,8.741706,141805,4.705931,0.002430,0.004513,0.000516
7134,47209,47766,464,0.015398,212785,7.061469,176241,5.848722,0.002181,0.002633,0.000373
22288,47766,47209,443,0.014701,176241,5.848722,212785,7.061469,0.002514,0.002082,0.000356
21868,24852,13176,654,0.021704,470096,15.600574,376367,12.490090,0.001391,0.001738,0.000111


In [18]:
rules_01.to_csv("rules/rules_01.csv",index=False)

#### Find rules using min_support = 0.005

In [19]:
%%time
rules_005 = association_rules(orders, 0.005)  
rules_005

Starting order_item:               32434489
Items with support >= 0.005:           15990
Remaining order_item:              31016304
Remaining orders with 2+ items:     3034553
Remaining order_item:              30846876
Item pairs:                        39601775
Item pairs with support >= 0.005:     123653

CPU times: user 4min 39s, sys: 12.4 s, total: 4min 51s
Wall time: 4min 51s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
98710,3858,15692,157,0.005174,508,0.016741,260,0.008568,0.309055,0.603846,36.070928
111219,9497,8833,156,0.005141,516,0.017004,589,0.019410,0.302326,0.264856,15.575942
104388,41349,49570,182,0.005998,623,0.020530,581,0.019146,0.292135,0.313253,15.258152
85711,39739,11224,252,0.008304,914,0.030120,619,0.020398,0.275711,0.407108,13.516319
66321,19244,26488,161,0.005306,638,0.021025,712,0.023463,0.252351,0.226124,10.755236
...,...,...,...,...,...,...,...,...,...,...,...
10630,47209,47766,464,0.015291,212828,7.013488,176276,5.808961,0.002180,0.002632,0.000375
36414,47766,47209,443,0.014599,176276,5.808961,212828,7.013488,0.002513,0.002081,0.000358
111298,47626,5876,155,0.005108,152235,5.016719,87516,2.883983,0.001018,0.001771,0.000353
35607,24852,13176,654,0.021552,470321,15.498856,376512,12.407495,0.001391,0.001737,0.000112


In [20]:
rules_005.to_csv("rules/rules_005.csv",index=False)

#### Find rules using min_support = 0.003

In [21]:
%%time
rules_003 = association_rules(orders, 0.003)  
rules_003

Starting order_item:               32434489
Items with support >= 0.003:           20333
Remaining order_item:              31559134
Remaining orders with 2+ items:     3043897
Remaining order_item:              31394787
Item pairs:                        44529445
Item pairs with support >= 0.003:     237133

CPU times: user 5min 6s, sys: 16 s, total: 5min 22s
Wall time: 21min 11s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
155527,3858,15692,157,0.005158,509,0.016722,260,0.008542,0.308448,0.603846,36.110913
159237,10781,42563,94,0.003088,297,0.009757,291,0.009560,0.316498,0.323024,33.106126
196978,26810,15697,147,0.004829,584,0.019186,337,0.011071,0.251712,0.436202,22.735502
196977,35208,15697,95,0.003121,400,0.013141,337,0.011071,0.237500,0.281899,21.451796
25713,7303,1998,146,0.004796,594,0.019514,356,0.011696,0.245791,0.410112,21.015821
...,...,...,...,...,...,...,...,...,...,...,...
185935,47626,5876,155,0.005092,152253,5.001910,87526,2.875459,0.001018,0.001771,0.000354
666,5876,47626,141,0.004632,87526,2.875459,152253,5.001910,0.001611,0.000926,0.000322
233214,6184,24852,102,0.003351,29839,0.980289,470390,15.453545,0.003418,0.000217,0.000221
47766,24852,13176,654,0.021486,470390,15.453545,376593,12.372068,0.001390,0.001737,0.000112


In [22]:
rules_003.to_csv("rules/rules_003.csv",index=False)

#### Find rules using min_support = 0.002

In [23]:
%%time
rules_002 = association_rules(orders, 0.002)  
rules_002

Starting order_item:               32434489
Items with support >= 0.002:           24077
Remaining order_item:              31854736
Remaining orders with 2+ items:     3048945
Remaining order_item:              31693162
Item pairs:                        47482696
Item pairs with support >= 0.002:     398380

CPU times: user 5min 16s, sys: 17.5 s, total: 5min 33s
Wall time: 5min 35s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
172483,33209,677,66,0.002165,233,0.007642,144,0.004723,0.283262,0.458333,59.975671
292085,27740,47512,67,0.002197,101,0.003313,396,0.012988,0.663366,0.169192,51.074936
210242,9055,17573,89,0.002919,336,0.011020,205,0.006724,0.264881,0.434146,39.395486
213663,3858,15692,157,0.005149,509,0.016694,260,0.008528,0.308448,0.603846,36.170800
380215,25587,1272,67,0.002197,302,0.009905,192,0.006297,0.221854,0.348958,35.230290
...,...,...,...,...,...,...,...,...,...,...,...
88558,47766,5450,89,0.002919,176292,5.782066,48906,1.604030,0.000505,0.001820,0.000315
391247,37067,13176,62,0.002033,18551,0.608440,376616,12.352338,0.003342,0.000165,0.000271
376982,6184,24852,102,0.003345,29849,0.978994,470431,15.429304,0.003417,0.000217,0.000221
59276,24852,13176,654,0.021450,470431,15.429304,376616,12.352338,0.001390,0.001737,0.000113


In [24]:
rules_002.to_csv("rules/rules_002.csv",index=False)

#### Find rules using min_support = 0.001

In [25]:
%%time
rules_001 = association_rules(orders, 0.001)  
rules_001

Starting order_item:               32434489
Items with support >= 0.001:           30863
Remaining order_item:              32171119
Remaining orders with 2+ items:     3054047
Remaining order_item:              32012264
Item pairs:                        50899489
Item pairs with support >= 0.001:     914045

CPU times: user 5min 34s, sys: 20.9 s, total: 5min 54s
Wall time: 5min 58s


Unnamed: 0,item_A,item_B,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
100827,20598,43400,51,0.001670,112,0.003667,85,0.002783,0.455357,0.600000,163.609661
703969,35604,32841,31,0.001015,98,0.003209,66,0.002161,0.316327,0.469697,146.375165
86413,5019,23425,35,0.001146,100,0.003274,91,0.002980,0.350000,0.384615,117.463346
754447,46735,23665,37,0.001212,95,0.003111,105,0.003438,0.389474,0.352381,113.282946
892200,2364,27377,37,0.001212,109,0.003569,100,0.003274,0.339450,0.370000,103.669485
...,...,...,...,...,...,...,...,...,...,...,...
308017,47209,196,34,0.001113,212876,6.970292,33181,1.086460,0.000160,0.001025,0.000147
810146,24852,37067,39,0.001277,470489,15.405428,18551,0.607424,0.000083,0.002102,0.000136
807183,12341,47766,37,0.001212,49301,1.614284,176299,5.772635,0.000750,0.000210,0.000130
80642,24852,13176,654,0.021414,470489,15.405428,376649,12.332783,0.001390,0.001736,0.000113


In [26]:
rules_001.to_csv("rules/rules_001.csv",index=False)