In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Month prefix shared by both monthly extracts; change this single value to
# point the notebook at a different month.
MONTH = 'may'

# File names of the two monthly extracts (both are read from
# ../data/monthwise/ in the next cell).
AP = f'{MONTH}_AP.csv'  # loaded as aprioriDF
ML = f'{MONTH}_ML.csv'  # loaded as mlDF

In [3]:
# Locations of the three input CSV files.
apriori_path = f'../data/monthwise/{AP}'
items_path = '../data/itemlist.csv'
ml_path = f"../data/monthwise/{ML}"

# Rows used to build the apriori basket matrix; first CSV column is the index.
aprioriDF = pd.read_csv(apriori_path, index_col=0)
# Item list (provides DEPID, SKU and KEYWORD); first CSV column is the index.
item_name = pd.read_csv(items_path, index_col=0)
# Table holding the clustering features; read with the default integer index.
mlDF = pd.read_csv(ml_path)

In [4]:
# Remove items whose DEPID is 65 or 21, then keep only the SKU and KEYWORD
# columns.
# NOTE: item_name is overwritten here, so re-running this cell without
# reloading the data fails (DEPID no longer exists after the column selection).
in_excluded_department = (item_name['DEPID'] == 65) | (item_name['DEPID'] == 21)
item_name = item_name[~in_excluded_department]
item_name = item_name.loc[:, ['SKU', 'KEYWORD']]

# Inner join on SKU: only rows whose SKU survived the filter above remain.
df = pd.merge(aprioriDF, item_name, on='SKU')

# One row per TXNID, one column per SKU, each cell the summed QUANTITY
# (0 where the SKU does not occur in the transaction).
quantity_per_txn_sku = df.groupby(['TXNID', 'SKU'])['QUANTITY'].sum()
basket = (
    quantity_per_txn_sku
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('TXNID')
)
def encode_units(x):
    """Convert a quantity into a 0/1 flag.

    Parameters
    ----------
    x : int or float
        A single numeric quantity.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    The result is always 0 or 1. Values strictly between 0 and 1, and NaN
    (for which every comparison is False), map to 0 instead of falling
    through and returning ``None``.
    """
    return 1 if x >= 1 else 0

# Thresholds for the rule mining in this cell.
MIN_SUPPORT = 0.0003    # minimum itemset support (fraction of transactions)
MIN_LIFT = 1            # minimum lift for a rule to be generated
MIN_CONFIDENCE = 0.5    # minimum confidence for a rule to be kept

# Boolean presence matrix: True where the summed quantity of a SKU in a
# transaction is at least 1. For whole-number quantities this is the same
# outcome as mapping encode_units over every cell, but it is vectorized and
# avoids DataFrame.applymap, which recent pandas versions deprecate. mlxtend
# recommends boolean input for apriori.
basket_sets = basket >= 1

frequent_itemsets = apriori(basket_sets, min_support=MIN_SUPPORT, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)
rules = rules[rules['confidence'] >= MIN_CONFIDENCE].sort_values('lift', ascending=False)

# Turn the support fractions into transaction counts. Support is a fraction
# of the rows of basket_sets (one row per TXNID), so the multiplier must be
# the number of transactions, not the number of line-item rows in df.
# Rounding removes floating-point noise so equal counts compare equal.
n_transactions = len(basket_sets)
for support_column in ('antecedent support', 'consequent support', 'support'):
    rules[support_column] = (rules[support_column] * n_transactions).round()

# Collect, per distinct value of the rules' 'support' column, the union of
# all antecedent and consequent items of the rules carrying that value.
# NOTE(review): rules are grouped purely by equal support, so rules from
# different itemsets that happen to share a support value end up in the same
# group -- confirm this is the intended basket definition.
uniquedict = {}
for _, rule in rules.iterrows():
    support_key = rule['support']
    if support_key in uniquedict:
        merged_items = uniquedict[support_key].union(rule['antecedents']).union(rule['consequents'])
    else:
        merged_items = rule['antecedents'].union(rule['consequents'])
    uniquedict[support_key] = merged_items

# Distinct support values in order of first appearance (rules are iterated
# in their current row order).
support = list(uniquedict)
        

In [5]:
# Flatten uniquedict into two parallel lists: the 1-based number of the group
# (in dict order) repeated once per member, and the members themselves.
count = 0
countlist = []
item = []
for count, group_key in enumerate(uniquedict, start=1):
    members = list(uniquedict[group_key])
    countlist.extend([count] * len(members))
    item.extend(members)

# Long-format table with one row per (Basket, SKU) pair.
basketlist = pd.DataFrame({"Basket": countlist, "SKU": item}, columns=['Basket', 'SKU'])
basketlist

Unnamed: 0,Basket,SKU
0,1,71171
1,1,69156
2,1,68875
3,1,69102
4,1,70800
5,1,70742
6,1,71645
7,1,71326
8,2,71032
9,2,71031


In [6]:
# Columns of mlDF used as clustering features.
CLUSTER_FEATURES = ['COST', 'MARGIN', 'QUANTITY']

# random_state fixes the centroid initialisation so the labels (and every
# table derived from them) are the same on each Restart & Run All.
# n_init is set explicitly because its default differs between scikit-learn
# releases.
# NOTE(review): the features are clustered on their raw scales; consider
# standardising them if no single column should dominate the distances --
# confirm intent.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=42)

# fit_predict fits the model and returns the cluster index of each row.
mlDF['LABEL'] = kmeans.fit_predict(mlDF[CLUSTER_FEATURES])

In [7]:
# Mean COST, MARGIN and QUANTITY per cluster label, with LABEL restored as an
# ordinary column.
label_means = mlDF.groupby('LABEL')[['COST', 'MARGIN', 'QUANTITY']].mean()
LabelDF = label_means.reset_index()
LabelDF.head(30)

Unnamed: 0,LABEL,COST,MARGIN,QUANTITY
0,0,37.436735,0.141421,3.996619
1,1,10.079678,0.174368,40.044192
2,2,8.131426,0.238715,520.066667
3,3,0.969542,0.382283,1376.0
4,4,10.976797,0.156344,105.290476
5,5,9.828227,0.218797,5.440774
6,6,9.106692,0.155451,246.960784
7,7,0.69,0.300817,2498.0
8,8,134.307951,0.111015,2.196721


In [8]:
# Attach the mlDF columns to every (Basket, SKU) row, then the item_name
# columns; both are inner joins on SKU.
basket_with_features = pd.merge(basketlist, mlDF, on='SKU')
FinalDF = pd.merge(basket_with_features, item_name, on='SKU')

In [9]:
# Keep only the columns shown in the final table and order the rows by
# basket number.
report_columns = ['Basket', 'KEYWORD', 'COST', 'MARGIN', 'QUANTITY']
FinalDF = FinalDF.loc[:, report_columns].sort_values(by='Basket')
FinalDF

Unnamed: 0,Basket,KEYWORD,COST,MARGIN,QUANTITY
0,1,CANADIAN CLUB 1.75 LT,19.09,0.047402,76.0
1,1,GALLO SWEET VERMOUTH 750 ML,4.85,0.102717,13.0
2,1,CARTLIDGE & BROWNE CHARDONNAY 750 ML,9.09,0.172884,23.0
3,1,CHAT ST JEAN CHARD N COAST 750 ML,9.33,0.066066,76.0
4,1,TOASTED HEAD CHARDONNAY 750 ML,7.33,0.266266,37.0
5,1,CUPCAKE CHARDONNAY 750 ML,8.09,0.100111,128.0
6,1,CHAT ST MICHELLE CHARDONNAY 750 ML,9.33,0.066066,138.0
7,1,KENDALL JACKSON AVANT CHARD 14 750 ML,11.33,0.055046,85.0
9,2,TAP DEPOSIT,60.0,0.0,23.0
8,2,KEG DEPOSIT,30.0,0.0,35.0
