<a href="https://colab.research.google.com/github/turatig/frequent_itemsets/blob/master/frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MARKET BASKET ANALYSIS NOTEBOOK**

Download and preprocess the dataset

In [59]:
!pip install kaggle
!pip install pandas

import os,zipfile,json,re
import functools as ft

#SECURITY: an API key must never be committed together with the notebook. The key that
#used to be hard-coded here has been published and should be revoked from the Kaggle
#account settings.
#Credentials already present in the environment are kept; missing ones are asked for
#interactively (input is not echoed). As before, this happens ahead of the kaggle import below.
from getpass import getpass

for _credential in ('KAGGLE_USERNAME','KAGGLE_KEY'):
  if not os.environ.get(_credential):
    os.environ[_credential]=getpass(_credential+': ')

from kaggle.api.kaggle_api_extended import KaggleApi

def get_dataset():
  """Fetch 'old-newspaper.tsv' from Kaggle into the current working directory.

  Nothing is done when a directory entry with that name already exists.
  Otherwise the file is downloaded from the 'alvations/old-newspapers' dataset
  and the resulting zip archive is extracted in place.
  """
  already_downloaded='old-newspaper.tsv' in os.listdir()
  if already_downloaded:
    return

  client=KaggleApi()
  client.authenticate()
  client.dataset_download_file('alvations/old-newspapers','old-newspaper.tsv')

  #the download is a zip archive sitting next to the notebook
  archive=zipfile.ZipFile('old-newspaper.tsv.zip','r')
  with archive:
    archive.extractall()

#stream the baskets (one list of words per row) stored in 'old-newspaper.tsv'
def read_dataset_from_disk(languages=None):
  """Yield one basket per data row of 'old-newspaper.tsv' in the working directory.

  languages: optional collection of language names; a row is kept only when its
    first tab-separated column is in it. None keeps every row.

  Yields: a list with the lower-cased words of the row's fourth column, where a
    word is a maximal run of the letters a-z/A-Z (everything else is a separator,
    accented letters included). The list may be empty.

  Rows with fewer than four columns are skipped; an empty file yields nothing.
  """
  #decode explicitly instead of relying on the platform default
  #NOTE(review): assumes the dataset is UTF-8 encoded -- confirm
  with open('old-newspaper.tsv','r',encoding='utf-8') as f:
    #skip header line; the default keeps an empty file from raising inside the generator
    next(f,None)
    for line in f:
      fields=line.split('\t')
      if languages is not None and fields[0] not in languages: continue
      #no text column (e.g. blank or truncated line): nothing to build a basket from
      if len(fields)<4: continue

      #get a list of words as basket skipping any sequence of non-alphabetical characters
      words=re.split(r'[^a-zA-Z]+',fields[3])
      #remove any empty string left at the edges by the split
      yield [word.lower() for word in words if word!='']

#create json which contains an array of baskets (lists of words)
def create_test_json_dataset(languages=None):
  """Dump the baskets of the selected languages to a JSON file in the working directory.

  languages: optional list of language names, forwarded to read_dataset_from_disk.
    The output file is named after them joined by '_' (e.g. 'Italian.json'), or
    'all_languages.json' when None.

  Nothing is done when a directory entry with that name already exists.
  """
  filename='_'.join(languages) if languages is not None else "all_languages"
  filename+='.json'

  #execute only if the dataset was not already created
  if filename in os.listdir(): return

  baskets=list(read_dataset_from_disk(languages))

  #indent="\t" puts every word on a line of its own: iter_baskets_from_json parses the file line by line
  with open(filename,'w') as f:
    f.write(json.dumps(baskets,indent="\t"))

#yield lists of words (baskets) from json file created with create_test_json_dataset
def iter_baskets_from_json(filename):
  """Stream the baskets of a JSON file without loading it whole.

  filename: path of a JSON array of arrays of words, written with one token per
    line (the layout produced by json.dumps(..., indent="\t")). Words are expected
    to contain only the letters a-z/A-Z.

  Yields: every inner array as a list of words, empty arrays included.
  """
  token=re.compile(r'[a-zA-Z]+|\[|\]')
  #None while no inner array is open, so the closing bracket of the outer array yields nothing
  basket=None

  with open(filename,'r') as f:
    #skip first square bracket (nothing to skip in an empty file)
    next(f,None)
    for line in f:
      m=token.search(line)
      #blank line or a line without any token
      if m is None: continue
      found=m.group()

      if found=="[":
        basket=[]
        #an empty basket is written as "[]" on a single line
        if "]" in line[m.end():]:
          yield basket
          basket=None

      elif found=="]":
        if basket is not None:
          yield basket
          basket=None

      elif basket is not None: basket.append(found)
    


#make sure the raw corpus is on disk, then build the basket file restricted to Italian rows
get_dataset()
create_test_json_dataset(languages=['Italian'])






A-priori algorithm implementation

In [68]:
import itertools as it

class Apriori:
  """A-priori search of the frequent itemsets of a re-readable stream of baskets.

  An itemset is stored as a tuple of items in sorted order, so that it has a single
  representation whatever the order of the words inside the baskets. The support of
  an itemset is the number of baskets containing it: a word repeated inside one
  basket counts once.
  """

  #it: iterable of baskets
  def __init__(self,it,it_arg=None):
    """
    it: callable returning a fresh iterable of baskets every time it is called with
      it_arg (the baskets are read once per itemset size). Inside this method the
      name hides the module-level itertools alias.
    it_arg: the single argument handed to it on every call.
    """
    #basket file is supposed to be one of the previously defined iterables
    self._basket_file=it
    self._basket_file_arg=it_arg
    #support threshold: this value is set the first time the basket file is passed
    self._s=0

    #the k-1-th element is the set of frequent itemsets made of k elements
    self._frequent_itemsets=[]

  def _baskets(self):
    """Do one pass over the source, yielding each basket as a set of distinct items."""
    for basket in self._basket_file(self._basket_file_arg):
      yield set(basket)

  #reset support threshold and frequent itemsets list with frequent singletons
  def _singletons_init(self):
    """Count the baskets, set the threshold to 1% of them and store the frequent singletons."""
    self._frequent_itemsets=[]
    singletons=dict()
    baskets_count=0

    for basket in self._baskets():
      baskets_count+=1
      for word in basket:
        singletons[(word,)]=singletons.get((word,),0)+1
    #1% of the number of baskets analyzed
    self._s=baskets_count//100

    self._frequent_itemsets.append(self._filter_ck(singletons))

  #build candidate sets and count their occurence in baskets
  def _get_ck(self,k):
    """Return a dict mapping every candidate itemset of size k (k>=2) to its support.

    Requires the frequent itemsets of size k-1 to be already stored.
    """
    previous=self._frequent_itemsets[k-2]
    #monotonicity: an item outside every frequent (k-1)-itemset cannot be part of a
    #frequent k-itemset, so it is dropped before the combinations are enumerated
    useful_items={item for itemset in previous for item in itemset}

    candidates=dict()
    for basket in self._baskets():
      #sorted input makes it.combinations emit sorted tuples only
      items=sorted(basket & useful_items)
      for kuple in it.combinations(items,r=k):
        #exploit the monotonicity property: all immediate subsets must be frequent itemsets
        if all(subset in previous for subset in it.combinations(kuple,r=k-1)):
          candidates[kuple]=candidates.get(kuple,0)+1

    return candidates

  def _filter_ck(self,candidates):
    """Return the set of candidates whose count reaches the support threshold."""
    return {k for k,v in candidates.items() if v>=self._s}

  def compute(self,max_k=4):
    """Find the frequent itemsets, one size at a time, starting from the singletons.

    max_k: largest itemset size to search for; None removes the limit. The default
      keeps the bound that used to be hard-coded.

    The search ends at the first size without frequent itemsets or at max_k.
    Prints the next size to be examined after each pass.
    """
    self._singletons_init()
    k=2
    while self._frequent_itemsets[-1] and (max_k is None or k<=max_k):
      self._frequent_itemsets.append(self._filter_ck(self._get_ck(k)))
      k+=1
      print("k: ",k)

    #remove the last element only when it is the empty level that ended the search:
    #a non-empty level reached at the size limit is a valid result
    if not self._frequent_itemsets[-1]:
      self._frequent_itemsets.pop()

  def __str__(self): return "Support threshold: {0}\nFrequent itemsets:\n{1}".format(self._s,self._frequent_itemsets)


In [69]:
#mine the frequent itemsets of the Italian basket file and show the outcome
algo=Apriori(iter_baskets_from_json,it_arg='Italian.json')
algo.compute()
print(str(algo))

KeyboardInterrupt: ignored