<a href="https://colab.research.google.com/github/turatig/frequent_itemsets/blob/master/frequent_itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MARKET BASKET ANALYSIS NOTEBOOK**

Download and preprocess the dataset

In [18]:
!pip install kaggle


import os,sys,time,zipfile,json,re
import functools as ft
import itertools as it
from datetime import datetime as dt
from random import uniform,shuffle
from nltk.corpus import stopwords
import nltk 
#fetch the stopword lists read by create_txt_dataset when remove_stopwords=True
nltk.download('stopwords')

#upload the Kaggle credentials json (kaggle.json) into the Colab working directory
from google.colab import files

files.upload()
#make the Kaggle client look for its credentials file in the current working directory
os.environ['KAGGLE_CONFIG_DIR']='.'

from kaggle.api.kaggle_api_extended import KaggleApi

def get_dataset():
  """
    Download 'old-newspaper.tsv' from the Kaggle dataset 'alvations/old-newspapers'
    and unzip it in the working directory. Does nothing when the file is already there.
  """
  #a previous run already fetched the file
  if 'old-newspaper.tsv' in os.listdir():
    return

  client=KaggleApi()
  client.authenticate()
  client.dataset_download_file('alvations/old-newspapers','old-newspaper.tsv')

  #the download is expected to arrive as a zip archive next to the notebook
  with zipfile.ZipFile('old-newspaper.tsv.zip','r') as archive:
    archive.extractall()

#Yield baskets (one list of lowercase words per row) reading from tsv
#languages: subset of languages to be considered during the market-basket analysis
def iter_baskets_from_tsv(languages=None,max_basket=-1,skip=0):
  """
    Generator over the baskets stored in 'old-newspaper.tsv'.

    The first line of the file is treated as a header. For every other row the first
    column is compared against `languages` and the fourth column is split into words.

    languages: container of accepted first-column values, None to accept every row
    max_basket: stop after yielding this many baskets (any value <=0 means no limit)
    skip: number of leading non-empty baskets to discard before yielding

    Rows with fewer than four tab-separated columns and rows without any alphabetical
    word are ignored.
  """
  count=0
  with open('old-newspaper.tsv','r') as f:
    #skip header line; the default avoids a StopIteration escaping the generator on an empty file
    next(f,None)
    for line in f:
      l=line.split('\t')
      #malformed row (e.g. a blank line): there is no text column to read
      if len(l)<4: continue
      if languages is not None and l[0] not in languages: continue

      #every maximal run of ascii letters is a word, anything else is a separator
      basket=[word.lower() for word in re.findall(r'[a-zA-Z]+',l[3])]
      if basket:
        count+=1
        if count>skip: yield basket
        if max_basket>0 and count-skip>=max_basket: break

#Create txt dataset. Structure: every line (basket) is a sequence of words (items) separated by commas
def create_txt_dataset(languages=None,max_basket=-1,skip=0,remove_stopwords=False,max_items_per_basket=-1):
  """
    Read baskets with iter_baskets_from_tsv and write them to a txt file, one basket per line.

    languages, max_basket, skip: forwarded to iter_baskets_from_tsv
    remove_stopwords: drop the nltk stopwords of every language in `languages`
                      (has no effect when `languages` is None or empty)
    max_items_per_basket: keep at most this many randomly chosen words per basket (<=0: keep all)

    The file is named <languages joined by '_', lowercase><number of baskets>.txt, or
    all_languages<number of baskets>.txt when `languages` is None. Baskets left empty by the
    stopword removal are not written, so the file can hold fewer than max_basket baskets.
    Duplicate words are removed after the truncation to max_items_per_basket.
  """
  baskets=[]
  stopw=set()
  #total number of items written, used to report the mean number of items per basket
  n_items=0

  if remove_stopwords and languages:
    for lang in languages: stopw|=set(stopwords.words(lang.lower()))

  #forward the caller's skip (it used to be silently replaced by 0)
  for line in iter_baskets_from_tsv(languages,max_basket,skip=skip):
    if stopw: line=[word for word in line if word not in stopw]
    if line:
      if max_items_per_basket>0:
        #avoid bias on order during the analysis
        shuffle(line)
        line=line[:max_items_per_basket]
      line=set(line)
      baskets.append(','.join(line))
      n_items+=len(line)

  filename='_'.join(languages).lower() if languages is not None else 'all_languages'
  filename+=str(len(baskets))+'.txt'

  with open(filename,'w') as f:
    for basket in baskets:
      f.write(basket+'\n')

  #no basket at all: report 0 instead of dividing by zero
  mean_items=n_items//len(baskets) if baskets else 0
  print("Dataset {0} was created with {1} baskets with {2} items on average".format(filename,len(baskets),mean_items))
  print("-"*30)

#Yield baskets from txt files where every line is a basket and every basket is a sequence item1,item2...
def iter_baskets_from_txt(filename,max_basket=-1,skip=0):
  """
    Generator over the baskets of a txt dataset: every line is split on commas.

    filename: path of the txt file
    max_basket: stop after yielding this many baskets (any value <=0 means no limit)
    skip: number of leading lines to discard before yielding
  """
  count=0

  with open(filename,'r') as f:
    for line in f:
      count+=1
      #strip only the line terminator: the last line of a file may not have one,
      #and slicing off the last character would then truncate the last item
      if count>skip: yield line.rstrip('\n').split(',')
      if max_basket>0 and count-skip>=max_basket: break

#Scan the basket file and extract a basket with fixed probability p
def get_rand_sample(basket_file,p):
  """
    Single pass over `basket_file` (any iterable of baskets).

    Every basket is kept in the sample when a uniform draw in [0,1] is <= p.
    Return (sample, number of baskets scanned, set of all the items seen).
  """
  picked=[]
  seen_items=set()
  total=0

  for total,basket in enumerate(basket_file,start=1):
    #one random draw per basket, in scan order
    if uniform(0,1)<=p:
      picked.append(basket)
    seen_items.update(basket)

  return picked,total,seen_items



#fetch old-newspaper.tsv (nothing is downloaded when it is already in the working directory)
get_dataset()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving kaggle.json to kaggle (1).json


Utilities to log the execution

In [19]:
def sizeof_GB(obj):
  """Return sys.getsizeof(obj) expressed in GB (10^9 bytes) as a "%f"-formatted string."""
  n_bytes=sys.getsizeof(obj)
  return "%f"%(n_bytes/1000000000)

#decorator to log time execution of function/method
def time_it(f):
  """
    Wrap `f` so that every call prints its wall-clock duration (time.time difference).
    The wrapper forwards all arguments and returns the result of `f` unchanged.
  """
  #keep the name and docstring of the decorated function on the wrapper
  @ft.wraps(f)
  def _wrap(*args,**kwargs):
    start=time.time()
    res=f(*args,**kwargs)
    stop=time.time()
    
    print('\n'+'-'*30)
    print('Function {0} executed in {1} seconds'.format(f.__name__,stop-start))
    print('-'*30+'\n')
    return res
  return _wrap

#decorator to log the memory space used before and after a candidate itemset filtering operation
def log_filter(f):
  """
    Wrap a filter function f(candidates,s) -> dict.

    Before the call, print the number of candidates, the length of their itemsets and the
    size of the dict; after the call, print the same for the returned dict. The candidates
    are read from the first positional argument or from the keyword `candidates`.
    The result of `f` is returned unchanged.
  """
  #print one report for a non-empty {itemset:count} dict; the itemset length is read from one key
  def _describe(label,itemsets):
    print('{0} {1}-itemsets: {2}\nSize in GB: {3}'.\
          format(label,len(next(iter(itemsets))),len(itemsets),sizeof_GB(itemsets)))

  #keep the name and docstring of the decorated function on the wrapper
  @ft.wraps(f)
  def _wrap(*args,**kwargs):
    #argument may be given by position or by key=value
    candidates=args[0] if args else kwargs.get('candidates')
    if candidates:
      _describe('Total number of candidate',candidates)
    else:
      print('No more candidates were found')

    res=f(*args,**kwargs)

    if res:
      _describe('Number of frequent',res)
    else:
      print('No frequent itemset were found')
    print('-'*30+'\n')
    return res

  return _wrap

#Dump on json file the result of an algorithm run
def dump_result(algo,s,basket_count,freq_it_sets):
  """
    Write the frequent itemsets of a run to a json file in the working directory.

    algo: name of the algorithm, used as prefix of the file name
    s: support threshold used for the run
    basket_count: total number of baskets analysed
    freq_it_sets: dict {itemset: count}

    The file is named <algo>_market_basket_analysis_<date>_<time>.json and holds a
    'header' object (threshold and basket count) and a 'frequent itemsets' object.
  """
  header_info={'support_threshold':s,'total_n_baskets':basket_count}
  #json object keys must be strings: itemsets are stored through str()
  itemsets={str(k):v for k,v in freq_it_sets.items()}

  #read the clock once so that the date and the time in the name belong to the same instant
  stamp=str(dt.today())
  filename=algo+'_market_basket_analysis_'+stamp[:10]+'_'+stamp[11:]+'.json'

  with open(filename,'w') as f:
    json.dump({'header':header_info,'frequent itemsets':itemsets},f,indent='\t')

def clean_output(freq_it_sets):
  """
    Utility to clean the output of an analysis.
    Take the per-size list [{0-itemsets},{1-itemsets},...] and return a single dict
    {itemset: count} with every itemset of size >=1. The input list is not modified.
  """
  levels=freq_it_sets
  #a trailing empty level only marks the end of the search
  if not levels[-1]: levels=levels[:-1]

  merged=dict()
  #position 0 holds the empty itemset and is left out
  for level in levels[1:]:
    merged.update(level)

  return merged

A-priori algorithm implementation

In [20]:
@log_filter
def filter_ck(candidates,s):
  """
    Filter a {itemset: count} dict of candidates according to the support threshold:
    only the entries whose count is >= s are returned (as a new dict).
  """
  frequent=dict()
  for itemset,count in candidates.items():
    if count>=s: frequent[itemset]=count
  return frequent

def freq_sing(freq_it_sets,basket):
  """
    Discard the infrequent singletons from a basket: return the words of `basket`,
    in order, whose 1-tuple is a key of freq_it_sets[1].
  """
  return list(filter(lambda word:(word,) in freq_it_sets[1],basket))

def check_mono_prop(kuple,k,freq_it_sets):
  """
    Check the monotonicity property: kuple can be a frequent k-itemset only if all of
    its immediate subsets ((k-1)-itemsets) are keys of freq_it_sets[k-1].
    Return True when every immediate subset is found, False otherwise.
  """
  for subset in it.combinations(kuple,r=k-1):
    #subsets are looked up in their sorted form
    if tuple(sorted(subset)) not in freq_it_sets[k-1]:
      return False
  return True


def get_ck(basket_file,k,freq_it_sets):
  """
    Return the candidate k-itemsets found after one pass over basket_file, as a dict
    {itemset: number of baskets containing it}. Only the itemsets made of frequent
    singletons that satisfy check_mono_prop are counted.
  """
  counts=dict()

  for basket in basket_file:
    frequent_items=set(freq_sing(freq_it_sets,basket))

    for combo in it.combinations(frequent_items,r=k):
      #sorted form, so that the same itemset is never counted under two different orderings
      itemset=tuple(sorted(combo))

      if not check_mono_prop(itemset,k,freq_it_sets): continue
      counts[itemset]=counts.get(itemset,0)+1

  return counts

def first_pass(basket_file):
  """
    First pass of apriori. Count in how many baskets every item appears.
    Return ({(item,): count}, number of baskets read).
  """
  counts=dict()
  n_baskets=0

  for n_baskets,basket in enumerate(basket_file,start=1):
    #set(): an item repeated inside a basket is counted once
    for item in set(basket):
      singleton=(item,)
      counts[singleton]=counts.get(singleton,0)+1

  return counts,n_baskets

def second_pass(basket_file,freq_it_sets):
  """
    Second pass of apriori. Count every couple of frequent singletons that appears
    in a basket. Return {(item1,item2): count} with the items of each couple sorted.
  """
  counts=dict()

  for basket in basket_file:
    frequent_items=set(freq_sing(freq_it_sets,basket))

    for pair in it.combinations(frequent_items,r=2):
      pair=tuple(sorted(pair))
      counts[pair]=counts.get(pair,0)+1

  return counts

def main_loop(basket_file,freq_it_sets,s,max_k=-1):
  """
    Loop for counting k-itemsets with k>2.
    Starting from k=3, append to freq_it_sets (in place) the frequent k-itemsets found by
    get_ck + filter_ck, until a level comes out empty or k exceeds max_k (max_k<0: no limit).
  """
  k=3
  while True:
    #the previous level was empty: nothing larger can be frequent
    if not freq_it_sets[-1]: break
    if max_k>=0 and k>max_k: break

    #duplicate the iterator: one copy is consumed by this pass, the other feeds the next one
    current,remaining=it.tee(basket_file,2)
    freq_it_sets.append(filter_ck(get_ck(current,k,freq_it_sets),s))
    basket_file=remaining
    k+=1

@time_it
def apriori(basket_file,s=-1,max_k=-1,log=False):
    """
      Apriori algorithm iteration.

      basket_file: iterable of baskets
      s: support threshold; a negative value sets it to 1% of the number of baskets
      max_k: upper bound on k for the passes with k>2 run by main_loop (negative: no bound)
      log: when True the result is also written to a json file with dump_result

      Return a dict {itemset: count} with the frequent itemsets of every size found.
    """
    #position 0 holds the empty itemset, so that position k holds the frequent k-itemsets
    levels=[{tuple():1}]

    #every pass consumes one copy of the iterator and leaves another one for the later passes
    remaining,current=it.tee(basket_file,2)
    singletons,basket_count=first_pass(current)

    #set threshold to the 1% of the total number of baskets
    if s<0: s=basket_count//100
    levels.append(filter_ck(singletons,s))

    remaining,current=it.tee(remaining,2)
    couples=second_pass(current,levels)
    levels.append(filter_ck(couples,s))

    main_loop(remaining,levels,s,max_k)
    result=clean_output(levels)
    if log:dump_result("apriori",s,basket_count,result)

    return result

PCY implementation

In [21]:
!pip install -q bitmap
from bitmap import BitMap

def pcy_first_pass(basket_file,bm_size):
  """
    PCY first pass. Count the occurrences of every item and build the buckets table:
    every couple of items of a basket is hashed to one of bm_size buckets, whose counter
    is incremented.
    Return ({(item,): count}, number of baskets read, list of bucket counters).
  """
  counts=dict()
  buckets=[0]*bm_size
  n_baskets=0

  for n_baskets,basket in enumerate(basket_file,start=1):
    items=set(basket)

    for item in items:
      counts[(item,)]=counts.get((item,),0)+1

    #PCY variant: during the first pass hash couples to buckets
    for couple in it.combinations(items,r=2):
      slot=hash(tuple(sorted(couple)))%bm_size
      buckets[slot]+=1

  return counts,n_baskets,buckets


def pcy_second_pass(basket_file,freq_it_sets,bm):
  """
    PCY second pass. Count the couples of frequent singletons like the apriori second
    pass does, but only the couples whose hash selects a set bit of the bitmap `bm`.
    Return {(item1,item2): count} with the items of each couple sorted.
  """
  counts=dict()

  for basket in basket_file:
    frequent_items=set(freq_sing(freq_it_sets,basket))

    for pair in it.combinations(frequent_items,r=2):
      pair=tuple(sorted(pair))
      #PCY variant: added constraint for couple -> must hash to a frequent bucket
      if not bm[hash(pair)%bm.size()]: continue
      counts[pair]=counts.get(pair,0)+1

  return counts


@time_it
def pcy(basket_file,s=-1,max_k=-1,bm_size=256,log=False):
    """
      PCY algorithm iteration.

      basket_file: iterable of baskets
      s: support threshold; a negative value sets it to 1% of the number of baskets
      max_k: upper bound on k for the passes with k>2 run by main_loop (negative: no bound)
      bm_size: size requested for the bitmap of frequent buckets
      log: when True the result is also written to a json file with dump_result

      Return a dict {itemset: count} with the frequent itemsets of every size found.
    """
    bm=BitMap(bm_size)

    #position 0 holds the empty itemset, so that position k holds the frequent k-itemsets
    freq_it_sets=[{tuple():1}]
    basket_file,_bf=it.tee(basket_file,2)
    #the number of buckets is bm.size(), the same value the second pass uses as modulus
    ck,basket_count,buckets=pcy_first_pass(_bf,bm.size())
    #set threshold to the 1% of the total number of baskets
    if s<0: s=basket_count//100
    freq_it_sets.append(filter_ck(ck,s))
    
    #PCY variant:set bit of frequent buckets in the bitmap for couples
    for i,bucket_count in enumerate(buckets):
      if bucket_count>=s: bm.set(i)
    
    basket_file,_bf=it.tee(basket_file,2)
    freq_it_sets.append(filter_ck(pcy_second_pass(_bf,freq_it_sets,bm),s))

    main_loop(basket_file,freq_it_sets,s,max_k)
    freq_it_sets=clean_output(freq_it_sets)
    #label the dump with this algorithm's name (it was written as "apriori")
    if log:dump_result("pcy",s,basket_count,freq_it_sets)

    return freq_it_sets

SON algorithm implementation

In [22]:
!sudo apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

In [23]:
import findspark

#point Spark at the JDK and at the Spark distribution unpacked in the working directory
os.environ['JAVA_HOME']='/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['SPARK_HOME']='spark-2.4.8-bin-hadoop2.7'

#called before the pyspark imports below; presumably it makes the pyspark shipped in SPARK_HOME importable (confirm against findspark docs)
findspark.init()

from pyspark.sql import SparkSession
import pyspark

#local session on all the available cores ("local[*]") with spark.driver.memory set to 15g
spark=SparkSession.builder.master("local[*]").config("spark.driver.memory", "15g").getOrCreate()
#spark context handed to SON
sc=spark.sparkContext

In [24]:
def dict_to_list(itemsets_dict):
  """
    Utility for pyspark compliance: convert a dict {k:v,...} into a list [(k,v),...],
    keeping the iteration order of the dict.
  """
  return list(itemsets_dict.items())

def list_to_dict(itemsets_list):
  """
    Utility for pyspark compliance: convert a list [(k,v),...] into a dict {k:v,...}.
    When a key appears more than once, the first value found is kept.
  """
  itemsets_dict=dict()
  for entry in itemsets_list:
    #later duplicates of a key are ignored
    if entry[0] in itemsets_dict: continue
    itemsets_dict[entry[0]]=entry[1]

  return itemsets_dict

def count_occurence(chunk,candidates):
  """
    This is the second map function: count the occurrences of the candidates in a chunk.

    chunk: iterable of baskets
    candidates: list of (itemset, value) pairs; only the itemset of each pair is read

    Return a list [(itemset, number of baskets of the chunk containing it)] with only the
    itemsets found at least once.
  """
  #build the item set of every candidate once, instead of once per basket
  candidate_sets=[(candidate[0],set(candidate[0])) for candidate in candidates]
  ck=dict()

  for basket in chunk:
    basket=set(basket)
    for itemset,items in candidate_sets:
      if items.issubset(basket):
        ck[itemset]=ck.get(itemset,0)+1

  return dict_to_list(ck)

@time_it
def SON(sc,basket_file,s,num_p=2,max_k=-1):
  """
    SON algorithm on a spark context.

    sc: spark context
    basket_file: iterable of baskets, distributed over num_p partitions
    s: support threshold on the whole basket file
    num_p: number of partitions (chunks)
    max_k: forwarded to apriori

    Return a dict {itemset: count} with the itemsets whose total count is >= s.
  """
  baskets_rdd=sc.parallelize(basket_file,num_p)

  #first phase: apriori on every chunk with the threshold scaled to int((1/num_p)*s)
  local_frequent=baskets_rdd.mapPartitions(lambda chunk: dict_to_list(apriori(chunk,int((1/num_p)*s),max_k)))
  #an itemset frequent in at least one chunk becomes a candidate, once
  candidates=local_frequent.map(lambda fis: (fis[0],1)).reduceByKey(lambda el1,el2: 1).collect()

  #second phase: count the candidates in every chunk and sum the counts by itemset
  chunk_counts=baskets_rdd.mapPartitions(lambda chunk:count_occurence(chunk,candidates))
  totals=chunk_counts.reduceByKey(lambda el1,el2: el1+el2)
  frequent=totals.filter(lambda el: el[1]>=s)

  return list_to_dict(frequent.collect())


Toivonen algorithm implementation

In [25]:
from math import ceil,floor

def nb_candidates(freq_in_sample,k):
  """
    Utility to build the negative border efficiently.
    Return, as a set of sorted tuples, all the k-itemsets obtained as union of two
    (k-1)-itemsets of freq_in_sample (only meaningful for k>=2).
  """
  immediate_subs=[itemset for itemset in freq_in_sample if len(itemset)==k-1]
  joined=set()

  #unordered pairs are enough: the union is symmetric and an itemset joined with itself
  #still has k-1 elements, so ordered pairs and self pairs add nothing
  for left,right in it.combinations(immediate_subs,2):
    union=set(left)|set(right)
    if len(union)==k: joined.add(tuple(sorted(union)))

  return joined

def build_neg_border(freq_in_sample,all_items):
  """
    Function that builds the negative border, returned as a set of tuples:
    the singletons of all_items that are not in freq_in_sample, plus, for every k from 2
    to (length of the longest itemset of freq_in_sample)+1, the itemsets returned by
    nb_candidates that are not in freq_in_sample.
  """
  #singletons not found in sample
  border={(item,) for item in all_items if (item,) not in freq_in_sample}

  if freq_in_sample:
    top_k=max(len(itemset) for itemset in freq_in_sample)+1
  else:
    top_k=1

  for k in range(2,top_k+1):
    border.update(itemset for itemset in nb_candidates(freq_in_sample,k)
                  if itemset not in freq_in_sample)

  return border


@time_it
def toivonen(basket_file,s=-1,p=0.25,scaling=0.9,max_k=-1,log=False):
  """
    Toivonen algorithm iteration.

    basket_file: iterable of baskets
    s: support threshold; a negative value sets it to 1% of the number of baskets
    p: probability for a basket to be part of the sample
    scaling: factor applied to p*s to get the (lowered) threshold used on the sample
    max_k: largest itemset size considered; a negative value uses the size of the
           largest candidate itemset
    log: when True a successful result is also written to a json file with dump_result

    Return a dict {itemset: count} with the frequent itemsets, or None when an itemset
    of the negative border is frequent in the whole basket file (the sample cannot be
    trusted and the run should be repeated).
  """
  #counters of the candidates over the whole basket file
  freq_it_sets=dict()
  basket_file,_bf=it.tee(basket_file,2)

  sample,basket_count,all_items=get_rand_sample(_bf,p)
  if s<0: s=basket_count//100
  
  ps=floor(scaling*p*s)

  #keeping only itemsets and not their counters
  freq_in_sample=set(apriori(sample,s=ps,max_k=max_k).keys())

  neg_border=build_neg_border(freq_in_sample,all_items)

  #candidates to count: itemsets frequent in the sample plus the negative border
  ck=freq_in_sample | neg_border

  #size limit taken from all the candidates: the negative border alone can be empty, or
  #shorter than the longest itemset frequent in the sample, which would then never be counted
  if max_k<0: max_k=max((len(itemset) for itemset in ck),default=0)

  #total pass of toivonen algorithm
  for basket in basket_file:
    basket=set(basket)
    for k in range(1,max_k+1):
      for kuple in it.combinations(basket,r=k):
        kuple=tuple(sorted(kuple))

        if kuple in ck:
          freq_it_sets[kuple]=freq_it_sets.get(kuple,0)+1

  #filter according to threshold
  freq_it_sets=filter_ck(freq_it_sets,s)
  #an itemset of the negative border that is frequent in the whole file invalidates the run
  if any(itemset in freq_it_sets for itemset in neg_border):
    return None

  if log:dump_result("toivonen",s,basket_count,freq_it_sets)
  return freq_it_sets


Validate implementation by comparing it with apyori's implementation on a small size dataset

In [26]:
#small Italian dataset (at most 300 baskets) used to validate the implementations
create_txt_dataset(['Italian'],300)

Dataset italian300.txt was created with 300 baskets with 54 items on average
------------------------------


In [27]:
!pip install apyori
import apyori as ap
from functools import partial
from random import randint

#absolute support threshold (number of baskets) shared by all the runs below
s_thresh=70
#every algorithm gets its own fresh iterator over the same file
apriori_res=apriori(iter_baskets_from_txt('italian300.txt'),s=s_thresh,max_k=3)
pcy_res=pcy(iter_baskets_from_txt('italian300.txt'),s=s_thresh,max_k=3,bm_size=10000)
son_res=SON(sc,iter_baskets_from_txt('italian300.txt'),s=s_thresh,max_k=3)
toivonen_res=toivonen(iter_baskets_from_txt('italian300.txt'),s=s_thresh,p=0.5,scaling=0.8,max_k=3)

#apyori is given the whole dataset as a list
bf=[i for i in iter_baskets_from_txt('italian300.txt')]
basket_count=len(bf)

#time the reference implementation; min_support is passed as a fraction of the baskets
start=time.time()
apyori_res=list(ap.apriori(bf,min_support=s_thresh/basket_count,max_length=3))
stop=time.time()

print('\n'+'-'*30)
print('Function apyori.apriori executed in {0} seconds'.format(stop-start))
print('-'*30+'\n')

#TEST
def test(target,tested,basket_count):
  failed=False

  if len(target)!=len(tested): failed=True
  test_itemsets={tuple(sorted(i.items)):i.support for i in target}

  for k,v in tested.items():
    if k not in test_itemsets:
      failed=True
      break
    #apyori lib compute support as itemset_count/basket_count
    elif test_itemsets[k]!=v/basket_count:
      failed=True
      break

  if failed:
    print("-"*30+"TEST FAILED"+"-"*30)
  else:
    print("-"*30+"TEST PASSED"+"-"*30)

#compare the result of every implementation against the apyori one
test(apyori_res,apriori_res,basket_count)
test(apyori_res,pcy_res,basket_count)
test(apyori_res,son_res,basket_count)
#toivonen returns None when an itemset of the negative border is frequent in the whole file
if toivonen_res is not None:
  test(apyori_res,toivonen_res,basket_count)
else:
  print('Toivonen didn\'t find any frequent itemset')


Total number of candidate 1-itemsets: 6439
Size in GB: 0.000295
Number of frequent 1-itemsets: 24
Size in GB: 0.000001
------------------------------

Total number of candidate 2-itemsets: 276
Size in GB: 0.000009
Number of frequent 2-itemsets: 130
Size in GB: 0.000005
------------------------------

Total number of candidate 3-itemsets: 419
Size in GB: 0.000019
Number of frequent 3-itemsets: 249
Size in GB: 0.000009
------------------------------


------------------------------
Function apriori executed in 0.8207533359527588 seconds
------------------------------

Total number of candidate 1-itemsets: 6439
Size in GB: 0.000295
Number of frequent 1-itemsets: 24
Size in GB: 0.000001
------------------------------

Total number of candidate 2-itemsets: 276
Size in GB: 0.000009
Number of frequent 2-itemsets: 130
Size in GB: 0.000005
------------------------------

Total number of candidate 3-itemsets: 419
Size in GB: 0.000019
Number of frequent 3-itemsets: 249
Size in GB: 0.000009
------

Experiment: 

*   build a dataset with english newspapers (baskets)
*   consider 3 different orders of magnitude (10^3,10^5,10^6) for the number of baskets to analyze
*   Evaluate the performance of the four algorithms (A-priori, PCY, SON, Toivonen)

In [28]:
#English datasets at three sizes, stopwords removed and at most 15 items per basket
#(baskets left empty by the stopword removal are dropped, so the files can be slightly smaller than requested)
create_txt_dataset(['English'],1000,remove_stopwords=True,max_items_per_basket=15)
create_txt_dataset(['English'],100000,remove_stopwords=True,max_items_per_basket=15)
create_txt_dataset(['English'],1000000,remove_stopwords=True,max_items_per_basket=15)

Dataset english999.txt was created with 999 baskets with 11 items on average
------------------------------
Dataset english99944.txt was created with 99944 baskets with 11 items on average
------------------------------
Dataset english999318.txt was created with 999318 baskets with 11 items on average
------------------------------


In [29]:
#sc is the spark context, support threshold here is given as percentage of basket file size
def experiment(basket_files,sc,s=1):
  """
    Run apriori, pcy, SON, toivonen and apyori (all with itemsets of size <=2) on every
    basket file and compare each result against the apyori one with test().

    basket_files: list of txt dataset names
    sc: spark context, forwarded to SON
    s: support threshold as percentage of the number of baskets of each file

    Return the apriori result of the last basket file only, or None when basket_files
    is empty.
  """
  #stays None with an empty list, instead of failing on an unbound name at the return
  apriori_res=None

  for basket_file in basket_files:

    bf=[i for i in iter_baskets_from_txt(basket_file)]
    basket_count=len(bf)
    s_thresh=int(s*(basket_count//100))

    print("Support threshold: {0}".format(s_thresh))
    print("-"*30)

    apriori_res=apriori(iter_baskets_from_txt(basket_file),s=s_thresh,max_k=2)
    pcy_res=pcy(iter_baskets_from_txt(basket_file),s=s_thresh,bm_size=40000,max_k=2)
    son_res=SON(sc,iter_baskets_from_txt(basket_file),s=s_thresh,max_k=2)
    toivonen_res=toivonen(iter_baskets_from_txt(basket_file),s=s_thresh,p=0.4,scaling=0.8,max_k=2)

    #time the reference implementation on the same data
    start=time.time()
    apyori_res=list(ap.apriori(bf,min_support=s_thresh/basket_count,max_length=2))
    stop=time.time()
    print('\n'+'-'*30)
    print('Function apyori.apriori executed in {0} seconds'.format(stop-start))
    print('-'*30+'\n')

    print('\n'+'*'*30)
    print('Test on basket file {0}'.format(basket_file))
    print('*'*30+'\n')

    print('Apriori results')
    test(apyori_res,apriori_res,basket_count)
    print('PCY results')
    test(apyori_res,pcy_res,basket_count)
    print('SON results')
    test(apyori_res,son_res,basket_count)
    print('Toivonen results')
    #toivonen returns None when its sample could not be trusted
    if toivonen_res is not None:
      test(apyori_res,toivonen_res,basket_count)
    else:
      print('Toivonen didn\'t find any frequent itemset')
    
  return apriori_res

In [30]:
#given a set of frequent itemsets and the total number of baskets, find rules of type itemset -> item with confidence>=min_conf
def find_rules(freq_it_sets,basket_count,k=2,min_conf=0):
  """
    Build the association rules (I - {item}) -> item from every frequent k-itemset I.

    freq_it_sets: dict {itemset (sorted tuple): count}
    basket_count: total number of baskets; used as count of the empty itemset (k=1)
                  when freq_it_sets has no entry for it
    k: size of the itemsets the rules are built from
    min_conf: minimum confidence for a rule to be returned

    Return a dict {(antecedent tuple, item): confidence}, where the confidence is
    count(I)/count(I - {item}).
    Raise KeyError when a non-empty antecedent is not a key of freq_it_sets.
  """
  rules=dict()

  for key,count in freq_it_sets.items():
    if len(key)!=k: continue

    for item in key:
      i_minus_item=tuple(sorted(set(key)-{item}))
      if i_minus_item or i_minus_item in freq_it_sets:
        antecedent_count=freq_it_sets[i_minus_item]
      else:
        #the empty itemset is contained in every basket
        antecedent_count=basket_count

      confidence=count/antecedent_count
      if confidence>=min_conf:
        rules[(i_minus_item,item)]=confidence
  return rules

In [31]:
#the three English datasets created above (names as printed by create_txt_dataset)
basket_files=['english999.txt','english99944.txt','english999318.txt']

#experiment returns the apriori result of the last file of the list only
freq_it_sets=experiment(basket_files,sc)

Support threshold: 9
------------------------------
Total number of candidate 1-itemsets: 5566
Size in GB: 0.000295
Number of frequent 1-itemsets: 166
Size in GB: 0.000005
------------------------------

Total number of candidate 2-itemsets: 3100
Size in GB: 0.000148
Number of frequent 2-itemsets: 6
Size in GB: 0.000000
------------------------------


------------------------------
Function apriori executed in 0.019451618194580078 seconds
------------------------------

Total number of candidate 1-itemsets: 5566
Size in GB: 0.000295
Number of frequent 1-itemsets: 166
Size in GB: 0.000005
------------------------------

Total number of candidate 2-itemsets: 29
Size in GB: 0.000001
Number of frequent 2-itemsets: 6
Size in GB: 0.000000
------------------------------


------------------------------
Function pcy executed in 0.07381987571716309 seconds
------------------------------


------------------------------
Function SON executed in 0.5472507476806641 seconds
-----------------------

In [32]:
#NOTE(review): basket_count is not defined in this cell; it is presumably the global left by the
#validation cell (the italian300 run). find_rules with the default k=2 never reads it, but confirm the intended value
for k,v in find_rules(freq_it_sets,basket_count).items():
  print("Rules {0} -> {1} has confidence {2}".format(k[0],k[1],v))

Rules ('would',) -> said has confidence 0.27342027039984657
Rules ('said',) -> would has confidence 0.06353857380802505


In [33]:
#rebuild the largest English dataset without the cap on items per basket
#NOTE(review): the printed file name is english999318.txt again, so this overwrites the file used by the experiment above
create_txt_dataset(['English'],1000000,remove_stopwords=True)

Dataset english999318.txt was created with 999318 baskets with 18 items on average
------------------------------


In [36]:
#number of baskets of english999318.txt (create_txt_dataset puts it in the file name)
n_baskets=999318
#support threshold: 0.8% of the baskets
freq_it_sets=apriori(iter_baskets_from_txt('english999318.txt'),s=int(0.8*(n_baskets//100)),max_k=2)

#pass the basket count of this dataset instead of the basket_count global left behind by an earlier cell
for k,v in find_rules(freq_it_sets,n_baskets).items():
  print("Rules {0} -> {1} has confidence {2}".format(k[0],k[1],v))

Total number of candidate 1-itemsets: 211748
Size in GB: 0.010486
Number of frequent 1-itemsets: 311
Size in GB: 0.000009
------------------------------

Total number of candidate 2-itemsets: 48205
Size in GB: 0.002622
Number of frequent 2-itemsets: 35
Size in GB: 0.000001
------------------------------


------------------------------
Function apriori executed in 60.58247208595276 seconds
------------------------------

Rules ('st',) -> louis has confidence 0.403933507296801
Rules ('louis',) -> st has confidence 0.9085869565217392
Rules ('new',) -> jersey has confidence 0.1289073794454975
Rules ('jersey',) -> new has confidence 0.7658173489836817
Rules ('said',) -> city has confidence 0.03721278499968986
Rules ('city',) -> said has confidence 0.23596673596673598
Rules ('said',) -> new has confidence 0.056308761109781925
Rules ('new',) -> said has confidence 0.20414752465705016
Rules ('said',) -> good has confidence 0.04432393155576823
Rules ('good',) -> said has confidence 0.365389532