<a href="https://colab.research.google.com/github/turatig/frequent_itemsets/blob/master/frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MARKET BASKET ANALYSIS NOTEBOOK**

Dowload and preprocess dataset

In [29]:
!pip install kaggle
!pip install pandas

import os,sys,time,zipfile,json,re
import functools as ft
import itertools as it
from datetime import datetime as dt

os.environ['KAGGLE_USERNAME']='giacomoturati1'
os.environ['KAGGLE_KEY']='7d34a1aefc3558065164b70c24ce27ed'

from kaggle.api.kaggle_api_extended import KaggleApi

def get_dataset():

  #execute only if the dataset was not already downloaded
  if 'old-newspaper.tsv' not in os.listdir():
    api=KaggleApi()
    api.authenticate()

    api.dataset_download_file('alvations/old-newspapers','old-newspaper.tsv')

    with zipfile.ZipFile('old-newspaper.tsv.zip','r') as _zip:
      _zip.extractall()

#languages: subset of languages to be considered during the market-basket analysis
def read_dataset_from_disk(languages=None,max_basket=None):
  count=0
  with open('old-newspaper.tsv','r') as f:
    #skip header line
    next(f)
    for line in f:
      l=line.split('\t')
      if languages is not None and l[0] not in languages: continue

      #get a list of words as basket skipping any sequence of non-alphabetical characters
      basket=re.split(r'[^a-zA-Z]+',l[3])
      #remove any empty string
      basket=[word.lower() for word in basket if word!='']
      count+=1
      yield basket

      if max_basket is not None and count>=max_basket: break
      
    f.close()

#create json wich contains an array of baskets (lists of words)
def create_test_json_dataset(languages=None,max_basket=None):
  baskets=[]

  #execute only if the dataset was not already created
  for line in read_dataset_from_disk(languages,max_basket):
    baskets.append(line)

  filename=ft.reduce(lambda i,j:i+'_'+j,languages).lower() if languages is not None else 'all_languages' 
  filename+=str(len(baskets))+'.json'

  with open(filename,'w') as f:
    f.write(json.dumps(baskets,indent='\t'))
    f.close()
  
  

#yield lists of words (baskets) from json file created with create_test_json_dataset 
def iter_baskets_from_json(filename):
  theres_next=True
  basket=[]

  with open(filename,'r') as f:
    #skip first square braket
    next(f)
    for line in f:

      m=re.search(r'[a-zA-Z]+|\[|\]',line)
      line=line[m.start():m.end()]

      if line=='[': 
        basket=[]
        theres_next=True

      elif line==']':
        if theres_next:
          theres_next=False
          yield basket

      else: basket.append(line)
    
    f.close()
    


get_dataset()
create_test_json_dataset(['Italian'],300)






A-priori algorithm implementation

In [48]:
#decorator to log time execution of function/method
def timeit(f):
  def _wrap(*args,**kwargs):
    start=time.time()
    res=f(*args,**kwargs)
    stop=time.time()
    print('Function {0} executed in {1} seconds'.format(f.__name__,stop-start))
    return res
  return _wrap

def sizeof_GB(obj): return "%f"%(sys.getsizeof(obj)/1000000000)

class Apriori:

  #it: iterable of baskets
  def __init__(self,it,*it_args,**it_kwargs):

    #basket file is supposed to be one of the previously defined iterables
    self._basket_file_it=it
    self._basket_file_args=it_args
    self._basket_file_kwargs=it_kwargs
    #support threshold: this value is set the first time the basket file is passed
    self._s=0
    #the k-1-th element is the set of frequent itemsets made of k elements
    self._frequent_itemsets=[]
    self._log=True
  
  #reset support threshold and frequent itemsets list and compute frequent singletons
  @timeit
  def _algo_init(self,s_thresh=None):

    self._frequent_itemsets=[]
    singletons=dict()
    baskets_count=0

    for basket in self._basket_file_it(*self._basket_file_args,**self._basket_file_kwargs):
      baskets_count+=1
      for word in basket:
        if (word,) not in singletons.keys(): singletons[(word,)]=1
        else: singletons[(word,)]+=1

    if s_thresh is None:
      #set threshold to 1% of the number of baskets analyzed
      self._s=baskets_count//100
    else: self._s=s_thresh
    
    self._frequent_itemsets.append(self._filter_ck(singletons))

    if self._log:
      print('Total number of singletons: {0}\nSize in GB: {1}'.\
            format(len(singletons),sizeof_GB(singletons)))

  #build candidate sets and count their occurence in baskets
  @timeit  
  def _get_ck(self,k):

    candidates=dict()
    for basket in self._basket_file_it(*self._basket_file_args,**self._basket_file_kwargs):
      #filter unfrequent singletons
      basket=[word for word in basket if (word,) in self._frequent_itemsets[0]]
      for kuple in it.combinations(basket,r=k):
        kuple=tuple(sorted(kuple))
        #exploit the monotonicity property: kuple is a possible itemset -> all immediate subsets are frequent itemsets.
        if all([tuple(sorted(el)) in self._frequent_itemsets[k-2] for el in it.combinations(kuple,r=k-1)]):
          if kuple not in candidates.keys(): candidates[kuple]=1
          else: candidates[kuple]+=1
    
    if self._log:
      print('Total number of candidate {0}-itemsets: {1}\nSize in GB: {2}'.\
            format(k,len(candidates),sizeof_GB(candidates)))

    return candidates

  @timeit
  def _filter_ck(self,candidates):
    return {k:v for k,v in candidates.items() if v>=self._s}



  def compute(self,max_k=None,s_thresh=None):
    self._algo_init(s_thresh)
    k=2

    if self._log:
      print('Number of frequent singletons: {0}\nSize in GB: {1}'.\
            format(len(self._frequent_itemsets[-1]),sizeof_GB(self._frequent_itemsets[-1])))
      print('-'*30)
      print('\n')

    #stop when no more frequent itemsets is found or k>max_k
    while self._frequent_itemsets[-1] and k<=max_k:
      self._frequent_itemsets.append(self._filter_ck(self._get_ck(k)))
      k+=1

      if self._log:
        print('Number of frequent {0}-itemsets: {1}\nSize in GB: {2}'.\
              format(k-1,len(self._frequent_itemsets[-1]),sizeof_GB(self._frequent_itemsets[-1])))
        print('-'*30)
        print('\n')
    
    if not self._frequent_itemsets[:-1]:
      #remove the last element if empty
      self._frequent_itemsets=self._frequent_itemsets[:-1]

  def dump_result(self):
    def remap(dic):
      return {str(k):v for k,v in dic.items()}

    header_info={'iterable':self._basket_file_it.__name__,'iterable_args':self._basket_file_args,\
                 'iterable_kwargs':self._basket_file_kwargs,'support_threshold':self._s}

    filename='apriori_market_basket_analysis_'+str(dt.today())[:10]+'_'+str(dt.today())[11:]+'.json'
    with open(filename,'w') as f:
      f.write(json.dumps([header_info]+[remap(dic) for dic in self._frequent_itemsets],indent='\t'))
      f.close()




In [49]:
algo=Apriori(iter_baskets_from_json,'italian300.json')
algo.compute(3)
algo.dump_result()


Function _filter_ck executed in 0.0007615089416503906 seconds
Total number of singletons: 6439
Size in GB: 0.000295
Function _algo_init executed in 0.04676508903503418 seconds
Number of frequent singletons: 1140
Size in GB: 0.000037
------------------------------


Total number of candidate 2-itemsets: 118747
Size in GB: 0.005243
Function _get_ck executed in 1.5234949588775635 seconds
Function _filter_ck executed in 0.027146100997924805 seconds
Number of frequent 2-itemsets: 37906
Size in GB: 0.001311
------------------------------


Total number of candidate 3-itemsets: 713202
Size in GB: 0.041943
Function _get_ck executed in 57.88164567947388 seconds
Function _filter_ck executed in 0.23906159400939941 seconds
Number of frequent 3-itemsets: 621306
Size in GB: 0.020972
------------------------------


