<a href="https://colab.research.google.com/github/turatig/frequent_itemsets/blob/master/frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MARKET BASKET ANALYSIS NOTEBOOK**

Dowload and preprocess dataset

In [11]:
!pip install kaggle


import os,sys,time,zipfile,json,re
import functools as ft
import itertools as it
from datetime import datetime as dt

os.environ['KAGGLE_USERNAME']='giacomoturati1'
os.environ['KAGGLE_KEY']='7d34a1aefc3558065164b70c24ce27ed'

from kaggle.api.kaggle_api_extended import KaggleApi

def get_dataset():

  #execute only if the dataset was not already downloaded
  if 'old-newspaper.tsv' not in os.listdir():
    api=KaggleApi()
    api.authenticate()

    api.dataset_download_file('alvations/old-newspapers','old-newspaper.tsv')

    with zipfile.ZipFile('old-newspaper.tsv.zip','r') as _zip:
      _zip.extractall()

#Yield baskets (list of lists) reading from tsv
#languages: subset of languages to be considered during the market-basket analysis
def read_dataset_from_tsv(languages=None,max_basket=-1):
  count=0
  with open('old-newspaper.tsv','r') as f:
    #skip header line
    next(f)
    for line in f:
      l=line.split('\t')
      if languages is not None and l[0] not in languages: continue

      #get a list of words as basket skipping any sequence of non-alphabetical characters
      basket=re.split(r'[^a-zA-Z]+',l[3])
      #remove any empty string
      basket=[word.lower() for word in basket if word!='']
      count+=1
      yield basket

      if max_basket>0 and count>=max_basket: break
      
    f.close()

#Create json wich contains an array of baskets (lists of words)
def create_test_json_dataset(languages=None,max_basket=None):
  baskets=[]

  #execute only if the dataset was not already created
  for line in read_dataset_from_tsv(languages,max_basket):
    baskets.append(line)

  filename=ft.reduce(lambda i,j:i+'_'+j,languages).lower() if languages is not None else 'all_languages' 
  filename+=str(len(baskets))+'.json'

  with open(filename,'w') as f:
    f.write(json.dumps(baskets,indent='\t'))
    f.close()
  
  

#Yield baskets from json structures as array of arrays
def iter_baskets_from_json(filename,max_basket=-1):
  theres_next=True
  basket=[]
  count=0

  with open(filename,'r') as f:
    #skip first square braket
    next(f)
    for line in f:

      m=re.search(r'[a-zA-Z]+|\[|\]',line)
      line=line[m.start():m.end()]

      if line=='[': 
        basket=[]
        theres_next=True

      elif line==']':
        if theres_next:
          theres_next=False
          yield basket
          count+=1
          if max_basket>0 and count>=max_basket:break

      else: basket.append(line)
    
    f.close()

#Just to iterate agnostically over basket file
def iter_baskets(basket_file,max_basket=-1):
  if callable(basket_file):
    for basket in basket_file(max_basket): yield basket
  else:
    count=0
    for basket in basket_file:
      yield basket
      count+=1
      if count>=max_basket: break 

    


get_dataset()
create_test_json_dataset(['Italian'],300)






Utilities to log the execution

In [12]:
def sizeof_GB(obj): return "%f"%(sys.getsizeof(obj)/1000000000)

#decorator to log time execution of function/method
def time_it(f):
  def _wrap(*args,**kwargs):
    start=time.time()
    res=f(*args,**kwargs)
    stop=time.time()
    
    print('\n'+'-'*30)
    print('Function {0} executed in {1} seconds'.format(f.__name__,stop-start))
    print('-'*30+'\n')
    return res
  return _wrap

#decorator to log the memory space used before and after a candidate itemset filtering operation
def log_filter(f):
  def _wrap(*args,**kwargs):
    if len(args)>1:
      print('Total number of candidate {0}-itemsets: {1}\nSize in GB: {2}'.\
            format(len(list(args[1].keys())[0]),len(args[1]),sizeof_GB(args[1])))
      
    #argument was given by key=value
    else:
      print('Total number of candidate {0}-itemsets: {1}\nSize in GB: {2}'.\
            format(len(list(kwargs['candidates'].keys())[0]),len(kwargs['candidates']),sizeof_GB(kwargs['candidates'])))
    
    res=f(*args,**kwargs)

    if res:
      print('Number of frequent {0}-itemsets: {1}\nSize in GB: {2}'.\
              format(len(list(res.keys())[0]),len(res),sizeof_GB(res)))
    else:
      print('Number of frequent {0}-itemsets: {1}\nSize in GB: {2}'.\
              format(0,0,0))
    print('-'*30+'\n')
    return res

  return _wrap

A-priori algorithm implementation

In [36]:
class Apriori:

  """
    basket_file_it: iterable of baskets
    s:support_threshold
    max_basket:max_number of baskets to be analyzed
    basket_count: total number of baskets
    frequent_itemsets:the k-1-th element is the set of frequent itemsets made of k elements
  """
  def __init__(self,basket_file_it,s=0,max_basket=-1):
    self._basket_file_it=basket_file_it
    self._s=s
    self._max_basket=max_basket
    self._basket_count=0
    self._frequent_itemsets=[]
  
  """
  Private:
    Increment singletons set counter
  """
  def _up_singles_count(self,basket):
    for word in basket:
        if (word,) not in self._frequent_itemsets[0].keys(): self._frequent_itemsets[0][(word,)]=1
        else: self._frequent_itemsets[0][(word,)]+=1
  
  """
    First pass: initialize frequent singletons set 
  """
  @time_it
  def _algo_init(self):
    self._frequent_itemsets=[dict()]
    baskets_count=0

    for basket in iter_baskets(self._basket_file_it,self._max_basket):
      baskets_count+=1
      self._up_singles_count(basket)

    self._basket_count=baskets_count
    #set the threshold to 1% of the basket count
    if self._s<=0: self._s=self._basket_count//100
    self._frequent_itemsets[0]=self._filter_ck(self._frequent_itemsets[0])

  """
    Check monotonicity property
    kuple is a possible itemset -> all immediate subsets are frequent itemsets.
  """ 
  def _check_mono_prop(self,kuple,k):
    return all([tuple(sorted(el)) in self._frequent_itemsets[k-2] for el in it.combinations(kuple,r=k-1)])

  """
    Return candidate itemsets during an iteration step
  """
  @time_it  
  def _get_ck(self,k):

    candidates=dict()
    for basket in iter_baskets(self._basket_file_it,self._max_basket):
      #filter unfrequent singletons
      basket=[word for word in basket if (word,) in self._frequent_itemsets[0]]

      for kuple in it.combinations(basket,r=k):
        #sort tuple in order to avoid duplication of the same itemset considered in different order
        kuple=tuple(sorted(kuple))

        if self._check_mono_prop(kuple,k):
          if kuple not in candidates.keys(): candidates[kuple]=1
          else: candidates[kuple]+=1

    return candidates

  """
    Filter candidate set of itemsets according to suppport threshold 
  """
  @log_filter
  @time_it
  def _filter_ck(self,candidates):
    return {k:v for k,v in candidates.items() if v>=self._s}

  """
  Public:
    Compute the algorithm
  """
  def compute(self,max_k=-1):
    self._algo_init()
    k=2

    #stop when no more frequent itemsets are found or k>max_k
    while self._frequent_itemsets[-1] and (max_k<0 or k<=max_k):
      self._frequent_itemsets.append(self._filter_ck(self._get_ck(k)))
      k+=1
    
    if not self._frequent_itemsets[:-1]:
      #remove the last element if empty
      self._frequent_itemsets=self._frequent_itemsets[:-1]

  """
    Dump the current state of the algorithm on json file
  """
  def dump_result(self):
    def remap(dic):
      return {str(k):v for k,v in dic.items()}

    header_info={'iterable':str(self._basket_file_it),'support_threshold':self._s,'total_n_baskets':self._basket_count}

    filename='{0}_market_basket_analysis_'.format(self.__class__.__name__)\
              +str(dt.today())[:10]+'_'+str(dt.today())[11:]+'.json'
              
    with open(filename,'w') as f:
      f.write(json.dumps([header_info]+[remap(dic) for dic in self._frequent_itemsets],indent='\t'))
      f.close()
    
  """
    Functional interface to compute the Apriori algorithm
  """
  @staticmethod
  def mb_analysis(basket_file_it,s=0,max_basket=-1,max_k=-1,log=False):
    algo=Apriori(basket_file_it,s,max_basket)
    algo.compute(max_k)
    if log: algo.dump_result()

    return algo._frequent_itemsets



In [37]:
from functools import partial

basket_file=partial(iter_baskets_from_json,'italian300.json')
Apriori.mb_analysis(basket_file)


Total number of candidate 1-itemsets: 6439
Size in GB: 0.000295

------------------------------
Function _filter_ck executed in 0.0007603168487548828 seconds
------------------------------

Number of frequent 1-itemsets: 1140
Size in GB: 0.000037
------------------------------


------------------------------
Function _algo_init executed in 0.054376840591430664 seconds
------------------------------


------------------------------
Function _get_ck executed in 1.6819777488708496 seconds
------------------------------

Total number of candidate 2-itemsets: 118747
Size in GB: 0.005243

------------------------------
Function _filter_ck executed in 0.0189669132232666 seconds
------------------------------

Number of frequent 2-itemsets: 37906
Size in GB: 0.001311
------------------------------


------------------------------
Function _get_ck executed in 62.7361958026886 seconds
------------------------------

Total number of candidate 3-itemsets: 713202
Size in GB: 0.041943

------------

KeyboardInterrupt: ignored

PCY implementation

In [15]:
!pip install bitmap
from bitmap import BitMap

#map a tuple to a bucket of a table of size=s
def hash_tuple(t,s): return hash(t)%s


Collecting bitmap
  Downloading https://files.pythonhosted.org/packages/3a/18/0cf0116c3faf5023a6e971549055d42907020bca37320b3ada380e07f3ff/bitmap-0.0.7-py3-none-any.whl
Installing collected packages: bitmap
Successfully installed bitmap-0.0.7


In [16]:
class PCY(Apriori):
  """
    bm_size: BitMap of frequent buckets count
    bm: BitMap of frequent buckets
  """
  def __init__(self,basket_file_it,s=0,max_basket=-1,bm_size=256):
    super().__init__(basket_file_it,s,max_basket)
    self._bm_size=bm_size
    self._bm=BitMap(self._bm_size)
  
  """
    Algorithm initialization modified: initialize bucket table
  """
  @time_it
  def _algo_init(self):
    self._frequent_itemsets=[dict()]
    self._bm=BitMap(self._bm_size)

    baskets_count=0
    frequent_buckets=[0 for i in range(self._bm_size)]

    for basket in iter_baskets(self._basket_file_it,self._max_basket):
      baskets_count+=1
      self._up_singles_count(basket)

      #PCY variant: during the first pass hash couples to bucket
      for couple in it.combinations(basket,r=2):
        frequent_buckets[hash_tuple(tuple(sorted(couple)),self._bm_size)]+=1
    

    self._basket_count=baskets_count
    if self._s<=0: self._s=self._basket_count//100

    self._frequent_itemsets[0]=self._filter_ck(self._frequent_itemsets[0])

    #set bit of frequent buckets in the bitmap
    for i in range(len(frequent_buckets)):
      if frequent_buckets[i]>=self._s: self._bm.set(i)
  
  """
    Algorithm iteration modified: consider couple that hash to a frequent bucket
  """
  @time_it
  def _get_ck(self,k):

    candidates=dict()
    for basket in iter_baskets(self._basket_file_it,self._max_basket):
      #filter unfrequent singletons
      basket=[word for word in basket if (word,) in self._frequent_itemsets[0]]
      for kuple in it.combinations(basket,r=k):
        #sort tuple in order to avoid duplication of the same itemset considered in different order
        kuple=tuple(sorted(kuple))

        #PCY variant: added constraint for couple -> must hash to a frequent bucket
        if self._check_mono_prop(kuple,k) and (k!=2 or self._bm[hash_tuple(kuple,self._bm_size)]):
          if kuple not in candidates.keys(): candidates[kuple]=1
          else: candidates[kuple]+=1

    return candidates


In [17]:
algo=PCY(basket_file)
algo.compute(3)

Total number of candidate 1-itemsets: 6439
Size in GB: 0.000295

------------------------------
Function _filter_ck executed in 0.000873565673828125 seconds
------------------------------

Number of frequent 1-itemsets: 1140
Size in GB: 0.000037
------------------------------


------------------------------
Function _algo_init executed in 0.895531415939331 seconds
------------------------------


------------------------------
Function _get_ck executed in 2.1663713455200195 seconds
------------------------------

Total number of candidate 2-itemsets: 118747
Size in GB: 0.005243

------------------------------
Function _filter_ck executed in 0.018021583557128906 seconds
------------------------------

Number of frequent 2-itemsets: 37906
Size in GB: 0.001311
------------------------------


------------------------------
Function _get_ck executed in 63.767906188964844 seconds
------------------------------

Total number of candidate 3-itemsets: 713202
Size in GB: 0.041943

------------

Spark installation

In [38]:
!sudo apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

SON algorithm implementation

In [39]:
import findspark

os.environ['JAVA_HOME']='/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['SPARK_HOME']='spark-2.4.8-bin-hadoop2.7'

findspark.init()

from pyspark.sql import SparkSession
import pyspark


In [40]:
spark=SparkSession.builder.master("local[*]").getOrCreate()
sc=spark.sparkContext
basket_rdd=sc.parallelize(basket_file())
basket_rdd.mapPartitions(lambda chunk: Apriori.mb_analysis(chunk)).collect()


Py4JJavaError: ignored