### Steps:
 1. Preprocess Dataset
 2. Write funcion which will create all combinations of given itemsets.
 3. For each combination generate $2^n - 2$ association rules
 4. Compute confidence and support for it & based on threshold include it in ruleset.
 5. Save all the rules to file.

In [17]:
import numpy as np
import os
import pandas as pd
from collections import Counter

In [90]:
def preprocess(file_path):
    data = []
    tmp_data = []
    with open(file_path, 'r') as file_input:
        for line in file_input:
#             line = line.strip('\n')
            tmp_line = []
            line = line.split(' ')
            for i in range(len(line)-1):
                no = int(line[i])
                tmp_line.append(no)
                tmp_data.append(no)
            data.append(tmp_line)
    tmp_cntr = Counter(tmp_data)
    elements = []
    counter = 0
    for item in tmp_cntr:
        if tmp_cntr[item]>5000:
            elements.append(item)
    return data, elements, len(tmp_cntr)
data, elements, disticnt_values = preprocess("retail.dat")
print(elements, "\nNo of sample in dataset: ", len(data))


    

[32, 38, 39, 41, 48] 
No of sample in dataset:  88162


In [91]:
def get_support(data, set_1, set_2):
    count_set1 = 0
    count_set2 = 0
    count_tot = 0
    for row in data:
        tmp_cnt1 = 0
        tmp_cnt2 = 0
        flg = 0
        for i in set_1:
            if i in row:
                tmp_cnt1+=1;
        if(tmp_cnt1 == len(set_1)):
            count_set1+=1
            flg +=1
        for i in set_2:
            if i in row:
                tmp_cnt2+=1
        if(tmp_cnt1 == len(set_2)):
            count_set2+=1
            flg+=1
        if flg ==2:
            count_tot += 1
    tmp_str = "     "
#     print(tmp_str, count_set1, count_set2, count_tot)
    support = count_tot/len(data)
    conf = count_tot/count_set1
    return support, conf

In [92]:
def generate_rules(data, elements, thr_support, thr_confidence):
    numbers = len(elements)
    length = pow(2, numbers)
    rules = []
    for i in range(length):
        if (i == 0 or i == length-1):
            continue
        set_1 = []
        set_2 = []
        binary = bin(i)[2:]
        binary = binary.zfill(len(elements))
        for i in range(len(elements)):
            if(binary[i] == '0'):
                set_1.append(elements[i])
            else:
                set_2.append(elements[i])
        support, confidence = get_support(data, set_1, set_2)
#         print('  ', set_1)
#         print('  ',set_2)
#         print('  ',support, confidence)
        if thr_confidence < confidence and thr_support < support:
            tmp_set = []
            tmp_set.append(set_1)
            tmp_set.append(set_2)
            tmp_set.append(support)
            tmp_set.append(confidence)
            rules.append(tmp_set)
    return rules

In [93]:
def generate_combinations(data, elements, threshold_support, threshold_confidence):
    no_elements = len(elements)
    no_of_samples = len(data)
    print('we are processing on the ', no_of_samples,' samples')
    print('out of that', no_elements, 'are frequently occuring')
    rules = []
    for i in range(1, pow(2, no_elements)):
        binary = bin(i)[2:]
        binary = binary.zfill(no_elements)
        ele_list = []
        for j in range(no_elements):
            if binary[j] == '1':
                ele_list.append(elements[j])
        print(ele_list)
        rules_tmp = generate_rules(data, ele_list, threshold_support, threshold_confidence)
        
        for rule in rules_tmp:
            rules.append(rule)
    return rules
    

In [96]:
def main():
    file_name = "retail.dat"
    data, elements, distinct_values = preprocess(file_name)
    print(len(data), elements, distinct_values)
    rules = generate_combinations(data, elements, 0.05, 0.7)
    print('vaibhav')
    for rule in rules:
        print(rule)
main()

88162 [32, 38, 39, 41, 48] 16470
we are processing on the  88162  samples
out of that 5 are frequently occuring
[48]
[41]
[41, 48]
[39]
[39, 48]
[39, 41]
[39, 41, 48]
[38]
[38, 48]
[38, 41]
[38, 41, 48]
[38, 39]
[38, 39, 48]
[38, 39, 41]
[38, 39, 41, 48]
[32]
[32, 48]
[32, 41]
[32, 41, 48]
[32, 39]
[32, 39, 48]
[32, 39, 41]
[32, 39, 41, 48]
[32, 38]
[32, 38, 48]
[32, 38, 41]
[32, 38, 41, 48]
[32, 38, 39]
[32, 38, 39, 48]
[32, 38, 39, 41]
[32, 38, 39, 41, 48]
vaibhav
[[41], [48], 0.16951747918604387, 1.0]
[[48], [41], 0.47792699802636057, 1.0]
[[39], [48], 0.5747941289898142, 1.0]
[[48], [39], 0.47792699802636057, 1.0]
[[39], [41], 0.5747941289898142, 1.0]
[[41], [39], 0.16951747918604387, 1.0]
[[38], [48], 0.17690161293981535, 1.0]
[[48], [38], 0.47792699802636057, 1.0]
[[38], [41], 0.17690161293981535, 1.0]
[[41], [38], 0.16951747918604387, 1.0]
[[38], [39], 0.17690161293981535, 1.0]
[[39], [38], 0.5747941289898142, 1.0]
[[38, 39], [41, 48], 0.1173408044282117, 1.0]
[[38, 48], [39, 41