<a href="https://colab.research.google.com/github/turatig/frequent_itemsets/blob/master/frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MARKET BASKET ANALYSIS NOTEBOOK**

Dowload and preprocess dataset

In [24]:
!pip install kaggle


import os,sys,time,zipfile,json,re
import functools as ft
import itertools as it
from datetime import datetime as dt

os.environ['KAGGLE_USERNAME']='giacomoturati1'
os.environ['KAGGLE_KEY']='7d34a1aefc3558065164b70c24ce27ed'

from kaggle.api.kaggle_api_extended import KaggleApi

def get_dataset():

  #execute only if the dataset was not already downloaded
  if 'old-newspaper.tsv' not in os.listdir():
    api=KaggleApi()
    api.authenticate()

    api.dataset_download_file('alvations/old-newspapers','old-newspaper.tsv')

    with zipfile.ZipFile('old-newspaper.tsv.zip','r') as _zip:
      _zip.extractall()

#languages: subset of languages to be considered during the market-basket analysis
def read_dataset_from_disk(languages=None,max_basket=None):
  count=0
  with open('old-newspaper.tsv','r') as f:
    #skip header line
    next(f)
    for line in f:
      l=line.split('\t')
      if languages is not None and l[0] not in languages: continue

      #get a list of words as basket skipping any sequence of non-alphabetical characters
      basket=re.split(r'[^a-zA-Z]+',l[3])
      #remove any empty string
      basket=[word.lower() for word in basket if word!='']
      count+=1
      yield basket

      if max_basket is not None and count>=max_basket: break
      
    f.close()

#create json wich contains an array of baskets (lists of words)
def create_test_json_dataset(languages=None,max_basket=None):
  baskets=[]

  #execute only if the dataset was not already created
  for line in read_dataset_from_disk(languages,max_basket):
    baskets.append(line)

  filename=ft.reduce(lambda i,j:i+'_'+j,languages).lower() if languages is not None else 'all_languages' 
  filename+=str(len(baskets))+'.json'

  with open(filename,'w') as f:
    f.write(json.dumps(baskets,indent='\t'))
    f.close()
  
  

#yield lists of words (baskets) from json file created with create_test_json_dataset 
def iter_baskets_from_json(filename):
  theres_next=True
  basket=[]

  with open(filename,'r') as f:
    #skip first square braket
    next(f)
    for line in f:

      m=re.search(r'[a-zA-Z]+|\[|\]',line)
      line=line[m.start():m.end()]

      if line=='[': 
        basket=[]
        theres_next=True

      elif line==']':
        if theres_next:
          theres_next=False
          yield basket

      else: basket.append(line)
    
    f.close()
    


get_dataset()
create_test_json_dataset(['Italian'],300)






Utilities to log the execution

In [25]:
def sizeof_GB(obj): return "%f"%(sys.getsizeof(obj)/1000000000)

#decorator to log time execution of function/method
def time_it(f):
  def _wrap(*args,**kwargs):
    start=time.time()
    res=f(*args,**kwargs)
    stop=time.time()
    
    print('\n'+'-'*30)
    print('Function {0} executed in {1} seconds'.format(f.__name__,stop-start))
    print('-'*30+'\n')
    return res
  return _wrap

#decorator to log the memory space used before and after a candidate itemset filtering operation
def log_filter(f):
  def _wrap(*args,**kwargs):
    if len(args)>1:
      print('Total number of candidate {0}-itemsets: {1}\nSize in GB: {2}'.\
            format(len(list(args[1].keys())[0]),len(args[1]),sizeof_GB(args[1])))
      
    #argument was given by key=value
    else:
      print('Total number of candidate {0}-itemsets: {1}\nSize in GB: {2}'.\
            format(len(list(kwargs['candidates'].keys())[0]),len(kwargs['candidates']),sizeof_GB(kwargs['candidates'])))
    
    res=f(*args,**kwargs)

    print('Number of frequent {0}-itemsets: {1}\nSize in GB: {2}'.\
              format(len(list(res.keys())[0]),len(res),sizeof_GB(res)))
    print('-'*30+'\n')
    return res

  return _wrap

A-priori algorithm implementation

In [26]:
class Apriori:

  #basket_file_it: iterable of baskets
  def __init__(self,basket_file_it):
    #basket file is supposed to be one of the previously defined iterables
    self._basket_file_it=basket_file_it
    #support threshold: this value is set the first time the basket file is passed
    self._s=0
    #the k-1-th element is the set of frequent itemsets made of k elements
    self._frequent_itemsets=[]
  
  #reset support threshold and frequent itemsets list and compute frequent singletons
  @time_it
  def _algo_init(self,s_thresh=None):

    self._frequent_itemsets=[]
    singletons=dict()
    baskets_count=0

    for basket in self._basket_file_it():
      baskets_count+=1
      for word in basket:
        if (word,) not in singletons.keys(): singletons[(word,)]=1
        else: singletons[(word,)]+=1

    if s_thresh is None:
      #set threshold to 1% of the number of baskets analyzed
      self._s=baskets_count//100
    else: self._s=s_thresh
    
    self._frequent_itemsets.append(self._filter_ck(singletons))

  #build candidate sets and count their occurence in baskets
  @time_it  
  def _get_ck(self,k):

    candidates=dict()
    for basket in self._basket_file_it():
      #filter unfrequent singletons
      basket=[word for word in basket if (word,) in self._frequent_itemsets[0]]
      for kuple in it.combinations(basket,r=k):
        #sort tuple in order to avoid duplication of the same itemset considered in different order
        kuple=tuple(sorted(kuple))
        #exploit the monotonicity property: kuple is a possible itemset -> all immediate subsets are frequent itemsets.
        if all([tuple(sorted(el)) in self._frequent_itemsets[k-2] for el in it.combinations(kuple,r=k-1)]):
          if kuple not in candidates.keys(): candidates[kuple]=1
          else: candidates[kuple]+=1

    return candidates

  @log_filter
  @time_it
  def _filter_ck(self,candidates):
    return {k:v for k,v in candidates.items() if v>=self._s}


  def compute(self,max_k=None,s_thresh=None):
    self._algo_init(s_thresh)
    k=2

    #stop when no more frequent itemsets is found or k>max_k
    while self._frequent_itemsets[-1] and k<=max_k:
      self._frequent_itemsets.append(self._filter_ck(self._get_ck(k)))
      k+=1
    
    if not self._frequent_itemsets[:-1]:
      #remove the last element if empty
      self._frequent_itemsets=self._frequent_itemsets[:-1]

  def dump_result(self):
    def remap(dic):
      return {str(k):v for k,v in dic.items()}

    header_info={'iterable':str(self._basket_file_it),'support_threshold':self._s}

    filename='{0}_market_basket_analysis_'.format(self.__class__.__name__)\
              +str(dt.today())[:10]+'_'+str(dt.today())[11:]+'.json'
              
    with open(filename,'w') as f:
      f.write(json.dumps([header_info]+[remap(dic) for dic in self._frequent_itemsets],indent='\t'))
      f.close()



In [27]:
from functools import partial

basket_file=partial(iter_baskets_from_json,'italian300.json')
algo=Apriori(basket_file)
algo.compute(3)

Total number of candidate 1-itemsets: 6439
Size in GB: 0.000295

------------------------------
Function _filter_ck executed in 0.0009257793426513672 seconds
------------------------------

Number of frequent 1-itemsets: 1140
Size in GB: 0.000037
------------------------------


------------------------------
Function _algo_init executed in 0.0482485294342041 seconds
------------------------------


------------------------------
Function _get_ck executed in 1.6167480945587158 seconds
------------------------------

Total number of candidate 2-itemsets: 118747
Size in GB: 0.005243

------------------------------
Function _filter_ck executed in 0.02085566520690918 seconds
------------------------------

Number of frequent 2-itemsets: 37906
Size in GB: 0.001311
------------------------------


------------------------------
Function _get_ck executed in 63.704822063446045 seconds
------------------------------

Total number of candidate 3-itemsets: 713202
Size in GB: 0.041943

-----------

PCY implementation

In [28]:
!pip install bitmap
from bitmap import BitMap

#map a tuple to a bucket of a table of size=s
def hash_tuple(t,s): return hash(t)%s




In [29]:
class PCY(Apriori):
  def __init__(self,basket_file_it,bm_size):
    super().__init__(basket_file_it)
    self._bm_size=bm_size
    self._bm=BitMap(self._bm_size)
  
  @time_it
  def _init_algo(self,s_thresh):
    self._frequent_itemsets=[]
    self._bm=BitMap(self._bm_size)
    singletons=dict()
    baskets_count=0
    frequent_buckets=[0 for i in range(self._bm_size)]

    for basket in self._basket_file_it():
      baskets_count+=1
      for word in basket:
        if (word,) not in singletons.keys(): singletons[(word,)]=1
        else: singletons[(word,)]+=1
      #PCY variant: during the first pass hash couples to bucket
      for couple in it.combinations(basket,r=2):
        frequent_buckets[hash_tuple(sorted(couple),self._bm_size)]+=1

    if s_thresh is None:
      #set threshold to 1% of the number of baskets analyzed
      self._s=baskets_count//100
    else: self._s=s_thresh

    #set bit of frequent buckets in the bitmap
    for i in range(len(frequent_buckets)):
      if frequent_buckets[i]>=self._s: self._bm.set(i)
    
    self._frequent_itemsets.append(self._filter_ck(singletons))
  
  @time_it
  def _get_ck(self,k):

    candidates=dict()
    for basket in self._basket_file_it():
      #filter unfrequent singletons
      basket=[word for word in basket if (word,) in self._frequent_itemsets[0]]
      for kuple in it.combinations(basket,r=k):
        #sort tuple in order to avoid duplication of the same itemset considered in different order
        kuple=tuple(sorted(kuple))

        #PCY variant: added constraint for couple -> must hash to a frequent bucket
        if k==2 and self._bm[hash_tuple(kuple,self._bm_size)]:
          #exploit the monotonicity property: kuple is a possible itemset -> all immediate subsets are frequent itemsets.
          if all([tuple(sorted(el)) in self._frequent_itemsets[k-2] for el in it.combinations(kuple,r=k-1)]):
            if kuple not in candidates.keys(): candidates[kuple]=1
            else: candidates[kuple]+=1

    return candidates


Spark installation

In [30]:
!sudo apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

SON algorithm implementation

In [31]:
import findspark

os.environ['JAVA_HOME']='/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['SPARK_HOME']='spark-2.4.8-bin-hadoop2.7'

findspark.init()

from pyspark.sql import SparkSession
import pyspark



class SON():
  #read the basket file from json and convert it to rdd
  def __init__(self,spark_context,it,*it_args,**it_kwargs):
    self._sc=spark_context
    self._basket_file=it
    self._basket_file_args=it_args
    self._basket_file_kwargs=it_kwargs
    self._basket_rdd=self._sc.parallelize(self._basket_file(\
                        *self._basket_file_args,**self._basket_file_kwargs))


In [32]:
spark=SparkSession.builder.master("local[*]").getOrCreate()
algo=SON(spark.sparkContext,iter_baskets_from_json,'italian300.json')
algo._basket_rdd.collect()


[['con',
  'l',
  'editore',
  'crocetti',
  'il',
  'poeta',
  'e',
  'drammaturgo',
  'wole',
  'soyinka',
  'premio',
  'nobel',
  'per',
  'la',
  'letteratura',
  'aprir',
  'la',
  'milanesiana',
  'domenica',
  'prossima',
  'alle',
  'presso',
  'la',
  'sala',
  'buzzati',
  'di',
  'via',
  'balzan',
  'milano',
  'con',
  'lui',
  'nacer',
  'khemir',
  'tahar',
  'ben',
  'jelloun',
  'biyi',
  'bandele',
  'jean',
  'hatzfeld',
  'ben',
  'okri',
  'leggeranno',
  'testi',
  'sul',
  'tema',
  'bugie',
  'e',
  'verit',
  'dello',
  'stesso',
  'argomento',
  'parler',
  'guido',
  'barbujani',
  'il',
  'giugno',
  'alle',
  'con',
  'un',
  'intervento',
  'che',
  'pubblichiamo',
  'a',
  'pagina'],
 ['angoli', 'a', 'per', 'il', 'lecce'],
 ['cantu',
  'dicembre',
  'quattro',
  'giorni',
  'dopo',
  'il',
  'trionfo',
  'di',
  'belgrado',
  'con',
  'la',
  'conquista',
  'di',
  'un',
  'posto',
  'nelle',
  'top',
  'di',
  'eurolega',
  'l',
  'emporio',
  'armani',