In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [7]:
# Read the raw consumer-behaviour export and keep it untouched; all later work
# happens on the deep copy `df`.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
main_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\vc185080\\Downloads\\ECommerce_consumer_behaviour\\ECommerce_consumer_behaviour.csv"
)
df = main_data.copy(deep=True)
df.shape

(2019501, 12)

In [90]:
# Distinct product names, in order of first appearance.
pd.unique(df['product_name'])

array(['baking ingredients', 'soy lactosefree', 'butter',
       'fresh vegetables', 'yogurt', 'canned meals beans',
       'poultry counter', 'ice cream ice', 'fresh fruits', 'milk',
       'packaged cheese', 'bread', 'tea', 'bakery desserts',
       'frozen breakfast', 'cereal', 'eggs', 'buns rolls', 'cream',
       'water seltzer sparkling water', 'pickled goods olives',
       'packaged poultry', 'other creams cheeses',
       'honeys syrups nectars', 'coffee', 'refrigerated',
       'energy granola bars', 'soft drinks', 'latino foods',
       'plates bowls cups flatware', 'paper goods', 'oral hygiene',
       'diapers wipes', 'food storage', 'nuts seeds dried fruit', 'soap',
       'packaged vegetables fruits', 'hot dogs bacon sausage',
       'lunch meat', 'chips pretzels', 'meat counter',
       'fresh dips tapenades', 'prepared soups salads', 'condiments',
       'juice nectars', 'canned fruit applesauce',
       'preserved dips spreads', 'packaged produce',
       'canned jarr

In [9]:
# TAKING CARE OF NULL VALUES
# Replace missing values in every float64 column with the sentinel -1.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on df[col] operates on a selection of the frame, which
# pandas warns about and which does not update df under Copy-on-Write.
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].fillna(-1)

In [14]:

# One basket (list of product names) per (order_id, user_id) group.
# Iterating the grouped column directly yields one group at a time, instead of
# first materialising every group's full sub-frame with list(...).
# NOTE(review): a later cell rebuilds all_transactions from the de-duplicated df1.
all_transactions = [
    products.tolist()
    for _, products in df.groupby(['order_id', 'user_id'])['product_name']
]

In [34]:
# Keep only the basket identifiers and the item name, one row per distinct triple.
df1 = df.loc[:, ['order_id', 'user_id', 'product_name']].drop_duplicates()

In [35]:
# Preview the first 20 de-duplicated rows.
df1.iloc[:20]

Unnamed: 0,order_id,user_id,product_name
0,2425083,49125,baking ingredients
1,2425083,49125,soy lactosefree
2,2425083,49125,butter
3,2425083,49125,fresh vegetables
6,2425083,49125,yogurt
7,2425083,49125,canned meals beans
8,2425083,49125,poultry counter
9,1944304,162867,ice cream ice
10,1944304,162867,fresh fruits
11,1944304,162867,fresh vegetables


In [37]:
# Rename the three columns of df1 in place to basket-analysis names.
newcol = dict(
    order_id='Invoice',
    user_id='Member_number',
    product_name='itemDescription',
)
df1.rename(mapper=newcol, axis='columns', inplace=True)
df1.head()

Unnamed: 0,Invoice,Member_number,itemDescription
0,2425083,49125,baking ingredients
1,2425083,49125,soy lactosefree
2,2425083,49125,butter
3,2425083,49125,fresh vegetables
6,2425083,49125,yogurt


In [41]:
# One basket (list of item descriptions) per (Member_number, Invoice) group.
# Iterating the grouped column directly yields one group at a time, instead of
# first materialising every group's full sub-frame with list(...).
all_transactions = [
    items.tolist()
    for _, items in df1.groupby(['Member_number', 'Invoice'])['itemDescription']
]
all_transactions[0:5]

[['fresh fruits',
  'packaged cheese',
  'crackers',
  'chips pretzels',
  'fresh dips tapenades'],
 ['fresh fruits',
  'popcorn jerky',
  'lunch meat',
  'packaged cheese',
  'prepared soups salads'],
 ['other creams cheeses',
  'packaged vegetables fruits',
  'nuts seeds dried fruit',
  'packaged cheese',
  'paper goods',
  'soy lactosefree',
  'fresh fruits',
  'water seltzer sparkling water',
  'crackers'],
 ['soy lactosefree', 'fresh fruits', 'packaged vegetables fruits'],
 ['spreads',
  'nuts seeds dried fruit',
  'fresh fruits',
  'ice cream ice',
  'frozen breakfast']]

In [42]:
# Encode the baskets as a transaction-by-item table: fit the encoder on the
# baskets, transform them, and label the columns with the item names it learned.
trans_encoder = TransactionEncoder()
trans_encoder_matrix = pd.DataFrame(
    trans_encoder.fit_transform(all_transactions),
    columns=trans_encoder.columns_,
)

In [43]:
# Preview the first five encoded transactions.
trans_encoder_matrix.head(n=5)

Unnamed: 0,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [238]:
def perform_rule_calculation(transact_items_matrix, rule_type="fpgrowth", min_support=0.01):
    """
    desc: mine frequent itemsets from the transaction matrix and time the run
    @params:
        - transact_items_matrix: the transaction X Items matrix
        - rule_type: "fpgrowth" (default) runs FP-Growth; any other value
                     runs Apriori
        - min_support: minimum support threshold value (default = 0.01)

    @returns:
        - the matrix containing 3 columns:
            - support: support values for each combination of items
            - itemsets: the combination of items
            - number_of_items: the number of items in each combination of items

        - the execution time, in seconds, of the selected algorithm
    """
    # Pick the miner once so the timing code is not duplicated per branch.
    if rule_type == "fpgrowth":
        miner, label = fpgrowth, "Fp Growth"
    else:
        miner, label = apriori, "Apriori"

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    rule_items = miner(transact_items_matrix,
                       min_support=min_support,
                       use_colnames=True)
    total_execution = time.perf_counter() - start_time
    print("Computed {}!".format(label))

    rule_items['number_of_items'] = rule_items['itemsets'].apply(len)

    return rule_items, total_execution

In [239]:
def compute_association_rule(rule_matrix, metric="lift", min_thresh=1):
    """
    @desc: Derive association rules from a frequent-itemset matrix
    @params:
        - rule_matrix: the matrix produced by one of the itemset algorithms
        - metric: the metric the rules are filtered on (default is lift)
        - min_thresh: the minimum value of that metric (default is 1)

    @returns:
        - rules: whatever association_rules returns for the given metric & threshold
    """
    filter_settings = {"metric": metric, "min_threshold": min_thresh}
    return association_rules(rule_matrix, **filter_settings)

In [240]:
def plot_metrics_relationship(rule_matrix, col1, col2):
    """
    desc: plot col2 against col1 as yellow markers with a fitted straight line
    @params:
        - rule_matrix: the matrix containing the result of a rule (apriori or Fp Growth)
        - col1: column used for the x axis
        - col2: column used for the y axis
    """
    x_values = rule_matrix[col1]
    y_values = rule_matrix[col2]

    # Degree-1 least-squares fit, wrapped in poly1d so it can be evaluated at x_values.
    trend_line = np.poly1d(np.polyfit(x_values, y_values, 1))

    plt.plot(x_values, y_values, 'yo', x_values, trend_line(x_values))
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title('{} vs {}'.format(col1, col2))
def compare_time_exec(algo1, alg2):
    """
    @desc: bar chart comparing the execution time of two algorithms
    @params:
        - algo1: indexable pair for the first algorithm, where
            algo1[0] is the label shown on the x axis and
            algo1[1] is its execution time (bar height)
        - alg2: indexable pair for the second algorithm, same layout as algo1
    """
    # The body previously read an undefined name `algo2` instead of the `alg2`
    # parameter, so the second argument was never used.
    algo_names = (algo1[0], alg2[0])
    execution_times = [algo1[1], alg2[1]]
    y = np.arange(len(algo_names))

    plt.bar(y, execution_times, color=['orange', 'blue'])
    plt.xticks(y, algo_names)
    plt.xlabel('Algorithms')
    plt.ylabel('Time')
    plt.title("Execution Time (seconds) Comparison")
    plt.show()

In [241]:
# Scratch check: the first (key, value) pair of a dict, as a tuple.
val = dict(name=12)
value = next(iter(val.items()))
value

('name', 12)

In [242]:
# Run FP-Growth on the encoded transactions (function defaults spelled out)
# and report how long the mining step took.
fpgrowth_matrix, fp_growth_exec_time = perform_rule_calculation(
    transact_items_matrix=trans_encoder_matrix,
    rule_type="fpgrowth",
    min_support=0.01,
)
print("Fp Growth execution took: {} seconds".format(fp_growth_exec_time))

Computed Fp Growth!
Fp Growth execution took: 19.035372018814087 seconds


In [243]:
# First five frequent itemsets.
fpgrowth_matrix.head(n=5)

Unnamed: 0,support,itemsets,number_of_items
0,0.555995,(fresh fruits),1
1,0.230995,(packaged cheese),1
2,0.169435,(chips pretzels),1
3,0.11497,(crackers),1
4,0.098205,(fresh dips tapenades),1


In [244]:
# Last five frequent itemsets.
fpgrowth_matrix.tail(n=5)

Unnamed: 0,support,itemsets,number_of_items
2924,0.010215,"(specialty cheeses, packaged cheese)",2
2925,0.0121,"(specialty cheeses, fresh vegetables, fresh fr...",3
2926,0.013325,"(fresh fruits, cleaning products)",2
2927,0.01042,"(fresh vegetables, cleaning products)",2
2928,0.01118,"(fresh fruits, soap)",2


In [245]:
# Rules filtered on lift >= 1 (the function defaults, spelled out).
fp_growth_rule_lift = compute_association_rule(
    rule_matrix=fpgrowth_matrix,
    metric="lift",
    min_thresh=1,
)
fp_growth_rule_lift.iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(packaged cheese),(fresh fruits),0.230995,0.555995,0.1559,0.674906,1.213871,0.027468,1.365776,0.229114
1,(fresh fruits),(packaged cheese),0.555995,0.230995,0.1559,0.280398,1.213871,0.027468,1.068654,0.396819
2,(packaged cheese),(packaged vegetables fruits),0.230995,0.365415,0.11461,0.496158,1.357793,0.030201,1.259492,0.342664
3,(packaged vegetables fruits),(packaged cheese),0.365415,0.230995,0.11461,0.313643,1.357793,0.030201,1.120416,0.415249
4,(fresh vegetables),(packaged cheese),0.44436,0.230995,0.13585,0.305721,1.323494,0.033205,1.10763,0.439897
5,(packaged cheese),(fresh vegetables),0.230995,0.44436,0.13585,0.588108,1.323494,0.033205,1.348994,0.317845
6,(packaged cheese),(yogurt),0.230995,0.263675,0.08852,0.383212,1.453349,0.027612,1.193805,0.405633
7,(yogurt),(packaged cheese),0.263675,0.230995,0.08852,0.335716,1.453349,0.027612,1.157645,0.423636
8,(packaged cheese),(milk),0.230995,0.243325,0.07817,0.338406,1.390756,0.021963,1.143714,0.365364
9,(milk),(packaged cheese),0.243325,0.230995,0.07817,0.321258,1.390756,0.021963,1.132985,0.371317


In [246]:
# Rules filtered on confidence >= 0.2.
fp_growth_rule = compute_association_rule(
    rule_matrix=fpgrowth_matrix,
    metric="confidence",
    min_thresh=0.2,
)
fp_growth_rule.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(packaged cheese),(fresh fruits),0.230995,0.555995,0.1559,0.674906,1.213871,0.027468,1.365776,0.229114
1,(fresh fruits),(packaged cheese),0.555995,0.230995,0.1559,0.280398,1.213871,0.027468,1.068654,0.396819
2,(packaged cheese),(packaged vegetables fruits),0.230995,0.365415,0.11461,0.496158,1.357793,0.030201,1.259492,0.342664
3,(packaged vegetables fruits),(packaged cheese),0.365415,0.230995,0.11461,0.313643,1.357793,0.030201,1.120416,0.415249
4,(fresh vegetables),(packaged cheese),0.44436,0.230995,0.13585,0.305721,1.323494,0.033205,1.10763,0.439897


In [247]:
# Narrow preview: just the rule sides and their support.
fp_growth_rule.loc[:, ['antecedents', 'consequents', 'support']].head()

Unnamed: 0,antecedents,consequents,support
0,(packaged cheese),(fresh fruits),0.1559
1,(fresh fruits),(packaged cheese),0.1559
2,(packaged cheese),(packaged vegetables fruits),0.11461
3,(packaged vegetables fruits),(packaged cheese),0.11461
4,(fresh vegetables),(packaged cheese),0.13585


In [248]:
# Convert every antecedents entry into a plain list.
# A leftover debugging comparison against row 48 was removed: its result was
# discarded, and it raised IndexError whenever fewer than 49 rules existed.
# NOTE(review): a commented-out `pip install fuzzywuzzy` also lived here; if a
# later cell needs that package, install it from a dedicated setup cell.
fp_growth_rule['antecedents'] = fp_growth_rule['antecedents'].apply(list)

In [249]:
# Preview the first 25 rules after the antecedents conversion.
fp_growth_rule.iloc[:25]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,[packaged cheese],(fresh fruits),0.230995,0.555995,0.1559,0.674906,1.213871,0.027468,1.365776,0.229114
1,[fresh fruits],(packaged cheese),0.555995,0.230995,0.1559,0.280398,1.213871,0.027468,1.068654,0.396819
2,[packaged cheese],(packaged vegetables fruits),0.230995,0.365415,0.11461,0.496158,1.357793,0.030201,1.259492,0.342664
3,[packaged vegetables fruits],(packaged cheese),0.365415,0.230995,0.11461,0.313643,1.357793,0.030201,1.120416,0.415249
4,[fresh vegetables],(packaged cheese),0.44436,0.230995,0.13585,0.305721,1.323494,0.033205,1.10763,0.439897
5,[packaged cheese],(fresh vegetables),0.230995,0.44436,0.13585,0.588108,1.323494,0.033205,1.348994,0.317845
6,[packaged cheese],(yogurt),0.230995,0.263675,0.08852,0.383212,1.453349,0.027612,1.193805,0.405633
7,[yogurt],(packaged cheese),0.263675,0.230995,0.08852,0.335716,1.453349,0.027612,1.157645,0.423636
8,[packaged cheese],(milk),0.230995,0.243325,0.07817,0.338406,1.390756,0.021963,1.143714,0.365364
9,[milk],(packaged cheese),0.243325,0.230995,0.07817,0.321258,1.390756,0.021963,1.132985,0.371317


In [250]:
# Replace both rule sides with sorted lists of their items; passing the
# built-in directly avoids a lambda wrapper.
fp_growth_rule['antecedents'] = fp_growth_rule['antecedents'].apply(sorted)
fp_growth_rule['consequents'] = fp_growth_rule['consequents'].apply(sorted)
fp_growth_rule.head(n=25)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,[packaged cheese],[fresh fruits],0.230995,0.555995,0.1559,0.674906,1.213871,0.027468,1.365776,0.229114
1,[fresh fruits],[packaged cheese],0.555995,0.230995,0.1559,0.280398,1.213871,0.027468,1.068654,0.396819
2,[packaged cheese],[packaged vegetables fruits],0.230995,0.365415,0.11461,0.496158,1.357793,0.030201,1.259492,0.342664
3,[packaged vegetables fruits],[packaged cheese],0.365415,0.230995,0.11461,0.313643,1.357793,0.030201,1.120416,0.415249
4,[fresh vegetables],[packaged cheese],0.44436,0.230995,0.13585,0.305721,1.323494,0.033205,1.10763,0.439897
5,[packaged cheese],[fresh vegetables],0.230995,0.44436,0.13585,0.588108,1.323494,0.033205,1.348994,0.317845
6,[packaged cheese],[yogurt],0.230995,0.263675,0.08852,0.383212,1.453349,0.027612,1.193805,0.405633
7,[yogurt],[packaged cheese],0.263675,0.230995,0.08852,0.335716,1.453349,0.027612,1.157645,0.423636
8,[packaged cheese],[milk],0.230995,0.243325,0.07817,0.338406,1.390756,0.021963,1.143714,0.365364
9,[milk],[packaged cheese],0.243325,0.230995,0.07817,0.321258,1.390756,0.021963,1.132985,0.371317


In [341]:
# Turn every antecedents entry into a set, then write the rules table to CSV
# without the index column.
fp_growth_rule['antecedents'] = fp_growth_rule['antecedents'].apply(set)
# Not the cell's last expression, so under Jupyter's default display settings
# this preview is not rendered.
fp_growth_rule.head(25)
# NOTE(review): absolute, user-specific output path.
fp_growth_rule.to_csv(
    'C:\\Users\\vc185080\\Downloads\\Retail_Proj\\Output.csv',
    index=False,
)

5
first loop
basket =  {'cat food care', 'packaged cheese', 'dish detergents', 'fresh fruits', 'packaged vegetables fruits'} recommendation = []
second loop
third loop
max length =  3
[['fresh dips tapenades'], ['fresh vegetables'], ['fresh vegetables', 'soy lactosefree'], ['soy lactosefree'], ['water seltzer sparkling water']]  ---> {'fresh dips tapenades', 'soy lactosefree', 'water seltzer sparkling water', 'fresh vegetables'}
basket =  {'cat food care', 'packaged cheese', 'dish detergents', 'fresh fruits', 'packaged vegetables fruits'} recommendation = ['fresh vegetables', 'yogurt']
