encoding: utf-8
    
dataset: https://kth.instructure.com/courses/20000/files/3225774

In [1]:
import numpy as np
import pandas as pd
import itertools

In [2]:
def importing_dataset(number_of_baskets, filename='T10I4D100K.dat'):
    """
    I/O function: read market baskets from a whitespace-separated text file.

    Each non-blank line of the file is one basket; its items are kept as strings.

    input: number_of_baskets, maximum number of baskets to read;
           None or a negative value (e.g. -1) reads the whole file.
           filename, path of the data file (defaults to 'T10I4D100K.dat').
    output: list of baskets, each basket being a list of item strings.
    """
    read_all = number_of_baskets is None or number_of_baskets < 0
    results = []
    with open(filename) as inputfile:
        for line in inputfile:
            # Stop as soon as the requested number of baskets has been collected
            if not read_all and len(results) >= number_of_baskets:
                break
            items = line.split()
            if items:  # a blank line is not a basket
                results.append(items)
    return results

In [3]:
# Read the complete list of baskets from the data file
dataset = importing_dataset(number_of_baskets=-1)
print(f'number of baskets is {len(dataset)}')

number of baskets is 100000


1. Implement the Apriori algorithm for finding frequent itemsets with support at least s in a dataset of sales transactions.

In [4]:
# Support threshold s as an absolute basket count: 1% of all baskets (a typical choice)
s = 0.01 * len(dataset)
print(f'support threshold is {s}')

support threshold is 1000.0


In [5]:
def support(basket, transactions=None):
    """
    Compute the support value of a given basket of items.

    The support is the number of transactions that contain every item of the basket.

    input: basket, an iterable of items; a single string is treated as one
           item (it is not split into characters).
           transactions, iterable of transactions to scan; defaults to the
           global `dataset`.
    output: the support count (int).
    """
    if transactions is None:
        transactions = dataset
    # set('10') would yield {'1', '0'}; wrap a bare string so it stays one item
    if isinstance(basket, str):
        wanted = {basket}
    else:
        wanted = set(basket)
    # The basket set is built once, outside the scan; issubset accepts any
    # iterable, so each transaction can be passed as-is.
    return sum(1 for transaction in transactions if wanted.issubset(transaction))

In [6]:
def filter_function(element):
    """
    Filter the C_k candidate set to L_k frequent set.

    input: element, each candidate k-tuples.
    output: element itself when its support is at least s, otherwise None.
    """
    # An itemset is frequent when its support is at least s, so the
    # comparison must include the case support == s.
    if support(element) >= s:
        return element
    return None

In [7]:
# C1 step: find all candidate 1-tuple

# Baskets have different lengths, so flatten them with itertools.chain;
# np.array(dataset) on ragged nested lists raises ValueError in NumPy >= 1.24.
all_items = np.array(list(itertools.chain.from_iterable(dataset)))
print(f'number of all items in dataset is {len(all_items)}')

all_unique_items = np.unique(all_items) # C1 list, sorted
print(f'number of unique items in dataset is {len(all_unique_items)}')

# Single pass over the baskets: for each item, count the number of baskets
# that contain it (set(basket) ensures a basket is counted at most once per item).
basket_counts = {}
for basket in dataset:
    for item in set(basket):
        basket_counts[item] = basket_counts.get(item, 0) + 1

# C1: item -> number of baskets containing it, keyed in sorted item order
item_dict = {str(item): basket_counts[str(item)] for item in all_unique_items}

number of all items in dataset is 1010228
number of unique items in dataset is 870


In [8]:
# L1 step: 1st filter to get L1
# An item is frequent when its basket count is at least s (inclusive comparison).
singletons = [item for item in item_dict if item_dict[item] >= s]
print(f'frequent items L1 = {singletons}')

frequent items L1 = ['1', '10', '100', '104', '105', '110', '111', '112', '115', '116', '12', '120', '122', '125', '126', '129', '130', '132', '140', '143', '145', '147', '151', '154', '157', '161', '162', '163', '168', '17', '170', '171', '173', '175', '177', '181', '183', '185', '192', '196', '197', '198', '201', '204', '205', '207', '208', '21', '210', '214', '217', '227', '229', '234', '236', '239', '240', '242', '25', '258', '259', '265', '266', '27', '274', '275', '276', '279', '28', '280', '283', '285', '290', '294', '296', '308', '309', '31', '310', '319', '32', '322', '325', '326', '33', '332', '334', '335', '336', '343', '346', '348', '349', '35', '350', '351', '354', '357', '361', '362', '366', '368', '37', '373', '377', '378', '38', '381', '385', '387', '39', '390', '392', '394', '4', '401', '403', '405', '41', '411', '413', '414', '419', '422', '423', '424', '427', '428', '429', '43', '438', '440', '448', '449', '45', '450', '458', '460', '461', '468', '469', '470', '471',

In [12]:
# C2 step: every unordered pair of frequent items is a candidate pair
candidate_pair = [*itertools.combinations(singletons, 2)]
print(f'number of elements in C2 is {len(candidate_pair)}')
print(f'candidate pairs C2 = {candidate_pair}')

number of elements in C2 is 70125
candidate pairs C2 = [('1', '10'), ('1', '100'), ('1', '104'), ('1', '105'), ('1', '110'), ('1', '111'), ('1', '112'), ('1', '115'), ('1', '116'), ('1', '12'), ('1', '120'), ('1', '122'), ('1', '125'), ('1', '126'), ('1', '129'), ('1', '130'), ('1', '132'), ('1', '140'), ('1', '143'), ('1', '145'), ('1', '147'), ('1', '151'), ('1', '154'), ('1', '157'), ('1', '161'), ('1', '162'), ('1', '163'), ('1', '168'), ('1', '17'), ('1', '170'), ('1', '171'), ('1', '173'), ('1', '175'), ('1', '177'), ('1', '181'), ('1', '183'), ('1', '185'), ('1', '192'), ('1', '196'), ('1', '197'), ('1', '198'), ('1', '201'), ('1', '204'), ('1', '205'), ('1', '207'), ('1', '208'), ('1', '21'), ('1', '210'), ('1', '214'), ('1', '217'), ('1', '227'), ('1', '229'), ('1', '234'), ('1', '236'), ('1', '239'), ('1', '240'), ('1', '242'), ('1', '25'), ('1', '258'), ('1', '259'), ('1', '265'), ('1', '266'), ('1', '27'), ('1', '274'), ('1', '275'), ('1', '276'), ('1', '279'), ('1', '28'),

In [10]:
# L2 step: keep only the candidate pairs that pass the support filter
doubletons = []
for item in candidate_pair:
    element = filter_function(item)
    if element is None:
        continue  # candidate rejected by the filter
    print(element)
    doubletons.append(element)

('217', '346')
('227', '390')
('368', '682')
('368', '829')
('39', '704')
('39', '825')
('390', '722')
('704', '825')
('789', '829')


In [48]:
def construct_candidate(L, length, singletons=None):
    """
    Construct C_k candidate k-tuples of frequent itemsets.

    A k-tuple built from the items appearing in L is kept only if every one
    of its (k-1)-item subsets is itself in L.

    input: L, L_k-1 tuples from last step.
            length, k.
            singletons, unused; kept so that existing calls remain valid.
    output: list of candidate k-tuples (items in sorted order); empty when L
            is empty.
    """
    # Compare itemsets as frozensets: membership tests are hash lookups and
    # do not depend on the order of items inside each tuple of L.
    frequent = {frozenset(itemset) for itemset in L}
    if not frequent:
        return []  # nothing frequent at level k-1, hence no candidates at level k

    all_unique_elements = sorted(set().union(*frequent))

    possible_candidate = []
    for itemset in itertools.combinations(all_unique_elements, length):
        sub_itemsets = itertools.combinations(itemset, length - 1)
        if all(frozenset(sub_item) in frequent for sub_item in sub_itemsets):
            possible_candidate.append(itemset)
    return possible_candidate

In [49]:
# C3 step: build the candidate triples from the frequent pairs
candidate_triple = construct_candidate(L=doubletons, length=3)
print(f'number of elements in C3 is {len(candidate_triple)}')
print(f'candidate triple C3 = {candidate_triple}')

number of elements in C3 is 1
candidate triple C3 = [('39', '704', '825')]


In [50]:
# L3 step: keep only the candidate triples that pass the support filter
tripletons = []
for item in candidate_triple:
    element = filter_function(item)
    if element is None:
        continue  # candidate rejected by the filter
    print(element)
    tripletons.append(element)

('39', '704', '825')


In [51]:
# C4 step: build the candidate 4-tuples from the frequent triples
candidate_4 = construct_candidate(L=tripletons, length=4)
print(f'number of elements in C4 is {len(candidate_4)}')
print(f'candidate C4 = {candidate_4}')

number of elements in C4 is 0
candidate C4 = []


2. Develop and implement an algorithm for generating association rules between frequent itemsets discovered by using the Apriori algorithm in a dataset of sales transactions. The rules must have support at least s and confidence at least c, where s and c are given as input parameters.