In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
import warnings
from numba import jit
warnings.filterwarnings('ignore')

In [3]:
import collections
from distutils.version import LooseVersion as Version


# Always show DeprecationWarning. This filter is installed after (and therefore
# ahead of) the blanket `warnings.filterwarnings('ignore')` issued in the
# import cell, so deprecation notices are still displayed.
warnings.simplefilter("always", DeprecationWarning)


# def setup_fptree(df, min_support):
#     num_itemsets = len(df.index)  # number of itemsets in the database

#     is_sparse = False
#     if hasattr(df, "sparse"):
#         # DataFrame with SparseArray (pandas >= 0.24)
#         if df.size == 0:
#             itemsets = df.values
#         else:
#             itemsets = df.sparse.to_coo().tocsr()
#             is_sparse = True
#     else:
#         # dense DataFrame
#         itemsets = df.values

#     # support of each individual item
#     # if itemsets is sparse, np.sum returns an np.matrix of shape (1, N)
#     item_support = np.array(np.sum(itemsets, axis=0) / float(num_itemsets))
#     item_support = item_support.reshape(-1)

#     items = np.nonzero(item_support >= min_support/float(num_itemsets))[0]
#     # Define ordering on items for inserting into FPTree
#     print(items)
#     indices = item_support[items].argsort()
#     rank = {item: i for i, item in enumerate(items[indices])}

#     if is_sparse:
#         # Ensure that there are no zeros in sparse DataFrame
#         itemsets.eliminate_zeros()

#     # Building tree by inserting itemsets in sorted order
#     # Heuristic for reducing tree size is inserting in order
#     #   of most frequent to least frequent
#     tree = FPTree(rank)
#     for i in range(num_itemsets):
#         if is_sparse:
#             # itemsets has been converted to CSR format to speed-up the line
#             # below.  It has 3 attributes:
#             #  - itemsets.data contains non null values, shape(#nnz,)
#             #  - itemsets.indices contains the column number of non null
#             #    elements, shape(#nnz,)
#             #  - itemsets.indptr[i] contains the offset in itemset.indices of
#             #    the first non null element in row i, shape(1+#nrows,)
#             nonnull = itemsets.indices[itemsets.indptr[i] : itemsets.indptr[i + 1]]
#         else:
#             nonnull = np.where(itemsets[i, :])[0]
#         itemset = [item for item in nonnull if item in rank]
#         itemset.sort(key=rank.get, reverse=True)
#         tree.insert_itemset(itemset)

#     return tree, rank


# def generate_itemsets(generator, num_itemsets, colname_map):
#     itemsets = []
#     supports = []
#     for sup, iset in generator:
#         itemsets.append(frozenset(iset))
#         supports.append(sup / num_itemsets)

#     res_df = pd.DataFrame({"support": supports, "itemsets": itemsets})

#     if colname_map is not None:
#         res_df["itemsets"] = res_df["itemsets"].apply(
#             lambda x: frozenset([colname_map[i] for i in x])
#         )

#     return res_df


def valid_input_check(df):
    """Validate that ``df`` is an acceptable one-hot encoded DataFrame.

    Parameters
    -----------
    df : pandas DataFrame
      Dense or sparse-backed DataFrame whose cells should only contain
      True/False or 0/1.

    Returns
    -----------
    None. An empty DataFrame (``df.size == 0``) is accepted without
    further checks.

    Raises
    -----------
    TypeError
      If ``df`` is a legacy ``pandas.core.frame.SparseDataFrame``.
    ValueError
      If a sparse-backed frame has a first column label that is neither a
      string nor ``0``, or if any cell holds a value other than
      True, False, 0 or 1.

    Warns
    -----------
    DeprecationWarning
      If at least one column does not have a boolean dtype.

    """
    if f"{type(df)}" == "<class 'pandas.core.frame.SparseDataFrame'>":
        msg = (
            "SparseDataFrame support has been deprecated in pandas 1.0,"
            " and is no longer supported in mlxtend. "
            " Please"
            " see the pandas migration guide at"
            " https://pandas.pydata.org/pandas-docs/"
            "stable/user_guide/sparse.html#sparse-data-structures"
            " for supporting sparse data in DataFrames."
        )
        raise TypeError(msg)

    if df.size == 0:
        return

    # The `.sparse` accessor only exists on frames backed by sparse columns.
    is_sparse = hasattr(df, "sparse")
    if is_sparse:
        if not isinstance(df.columns[0], str) and df.columns[0] != 0:
            raise ValueError(
                "Due to current limitations in Pandas, "
                "if the sparse format has integer column "
                "names, please make sure they either start "
                "with `0` or cast them as string column names: "
                "`df.columns = [str(i) for i in df.columns]`."
            )

    # Fast path: if all columns are boolean, there is nothing to check
    all_bools = df.dtypes.apply(pd.api.types.is_bool_dtype).all()
    if not all_bools:
        warnings.warn(
            "DataFrames with non-bool types result in worse computational "
            "performance and their support might be discontinued in the future. "
            "Please use a DataFrame with bool type",
            DeprecationWarning,
        )
        # Pandas is much slower than numpy, so use np.where on Numpy arrays.
        # For sparse frames only the explicitly stored entries are inspected.
        if is_sparse:
            values = df.sparse.to_coo().data
        else:
            values = df.values
        idxs = np.where((values != 1) & (values != 0))
        if len(idxs[0]) > 0:
            # idxs has 1 dimension with sparse data and 2 with dense data;
            # report the first offending value only.
            val = values[tuple(loc[0] for loc in idxs)]
            s = (
                "The allowed values for a DataFrame"
                " are True, False, 0, 1. Found value %s" % (val)
            )
            raise ValueError(s)


# class FPTree(object):
#     def __init__(self, rank=None):
#         self.root = FPNode(None)
#         self.nodes = collections.defaultdict(list)
#         self.cond_items = []
#         self.rank = rank

#     def conditional_tree(self, cond_item, minsup):
#         """
#         Creates and returns the subtree of self conditioned on cond_item.

#         Parameters
#         ----------
#         cond_item : int | str
#             Item that the tree (self) will be conditioned on.
#         minsup : int
#             Minimum support threshold.

#         Returns
#         -------
#         cond_tree : FPtree
#         """
#         # Find all path from root node to nodes for item
#         branches = []
#         count = collections.defaultdict(int)
#         for node in self.nodes[cond_item]:
#             branch = node.itempath_from_root()
#             branches.append(branch)
#             for item in branch:
#                 count[item] += node.count

#         # Define new ordering or deep trees may have combinatorially explosion
#         items = [item for item in count if count[item] >= minsup]
#         items.sort(key=count.get)
#         rank = {item: i for i, item in enumerate(items)}

#         # Create conditional tree
#         cond_tree = FPTree(rank)
#         for idx, branch in enumerate(branches):
#             branch = sorted(
#                 [i for i in branch if i in rank], key=rank.get, reverse=True
#             )
#             cond_tree.insert_itemset(branch, self.nodes[cond_item][idx].count)
#         cond_tree.cond_items = self.cond_items + [cond_item]

#         return cond_tree

#     def insert_itemset(self, itemset, count=1):
#         """
#         Inserts a list of items into the tree.

#         Parameters
#         ----------
#         itemset : list
#             Items that will be inserted into the tree.
#         count : int
#             The number of occurrences of the itemset.
#         """
#         self.root.count += count

#         if len(itemset) == 0:
#             return

#         # Follow existing path in tree as long as possible
#         index = 0
#         node = self.root
#         for item in itemset:
#             if item in node.children:
#                 child = node.children[item]
#                 child.count += count
#                 node = child
#                 index += 1
#             else:
#                 break

#         # Insert any remaining items
#         for item in itemset[index:]:
#             child_node = FPNode(item, count, node)
#             self.nodes[item].append(child_node)
#             node = child_node

#     def is_path(self):
#         if len(self.root.children) > 1:
#             return False
#         for i in self.nodes:
#             if len(self.nodes[i]) > 1 or len(self.nodes[i][0].children) > 1:
#                 return False
#         return True

#     def print_status(self, count, colnames):
#         cond_items = [str(i) for i in self.cond_items]
#         if colnames:
#             cond_items = [str(colnames[i]) for i in self.cond_items]
#         cond_items = ", ".join(cond_items)
#         print(
#             "\r%d itemset(s) from tree conditioned on items (%s)" % (count, cond_items),
#             end="\n",
#         )


# class FPNode(object):
#     def __init__(self, item, count=0, parent=None):
#         self.item = item
#         self.count = count
#         self.parent = parent
#         self.children = collections.defaultdict(FPNode)

#         if parent is not None:
#             parent.children[item] = self

#     def itempath_from_root(self):
#         """Returns the top-down sequence of items from self to
#         (but not including) the root node."""
#         path = []
#         if self.item is None:
#             return path

#         node = self.parent
#         while node.item is not None:
#             path.append(node.item)
#             node = node.parent

#         path.reverse()
#         return path

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin


class TransactionEncoder(BaseEstimator, TransformerMixin):
    """Encoder class for transaction data in Python lists

    Parameters
    ------------
    None

    Attributes
    ------------
    columns_: list
      Sorted list of unique item names found in the `X` passed to `fit`.
    columns_mapping_: dict
      Maps each item name in `columns_` to its column index.

    Examples
    ------------
    For usage examples, please see
    https://rasbt.github.io/mlxtend/user_guide/preprocessing/TransactionEncoder/

    """

    def __init__(self):
        # The encoder has no hyperparameters; all state is learned in `fit`.
        pass

    def fit(self, X):
        """Learn the unique item names from a list of transactions.

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        Returns
        ------------
        self

        """
        unique_items = set()
        for transaction in X:
            unique_items.update(transaction)
        self.columns_ = sorted(unique_items)
        self.columns_mapping_ = {
            item: col_idx for col_idx, item in enumerate(self.columns_)
        }
        return self

    def transform(self, X, sparse=False):
        """Transform transactions into a one-hot encoded array.

        Parameters
        ------------
        X : list of lists
          A python list of lists, where the outer list stores the
          n transactions and the inner list stores the items in each
          transaction. Every item must have been seen during `fit`;
          an unknown item raises `KeyError`.

          For example,
          [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

        sparse: bool (default=False)
          If True, transform will return Compressed Sparse Row matrix
          instead of the regular one.

        Returns
        ------------
        array : NumPy array [n_transactions, n_unique_items]
           if sparse=False (default).
           Compressed Sparse Row matrix of the same shape otherwise.
           The one-hot encoded boolean array of the input transactions,
           where the columns represent the unique items found during
           `fit`, in sorted order.

           For example,
           array([[True , False, True , True , False, True ],
                  [True , False, True , False, False, True ],
                  [True , False, True , False, False, False],
                  [True , True , False, False, False, False],
                  [False, False, True , True , True , True ],
                  [False, False, True , False, True , True ],
                  [False, False, True , False, True , False],
                  [True , True , False, False, False, False]])
          The corresponding column labels are available as self.columns_, e.g.,
          ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']
        """
        if sparse:
            # Build the CSR structure directly: `indices` holds the column of
            # every stored True, `indptr[i]:indptr[i + 1]` delimits row i.
            indptr = [0]
            indices = []
            for transaction in X:
                # set is necessary because conversion to SparseDataFrame
                # will fail if there are duplicate items
                for item in set(transaction):
                    indices.append(self.columns_mapping_[item])
                indptr.append(len(indices))
            non_sparse_values = [True] * len(indices)
            # Pass the shape explicitly: without it the number of columns is
            # inferred from the largest index present, so the result could be
            # narrower than `columns_` (or fail outright for empty input).
            array = csr_matrix(
                (non_sparse_values, indices, indptr),
                shape=(len(indptr) - 1, len(self.columns_)),
                dtype=bool,
            )
        else:
            array = np.zeros((len(X), len(self.columns_)), dtype=bool)
            for row_idx, transaction in enumerate(X):
                for item in transaction:
                    col_idx = self.columns_mapping_[item]
                    array[row_idx, col_idx] = True
        return array

    def inverse_transform(self, array):
        """Transforms an encoded NumPy array back into transactions.

        Parameters
        ------------
        array : NumPy array [n_transactions, n_unique_items]
            The NumPy one-hot encoded boolean array of the input transactions,
            where the columns represent the unique items in `self.columns_`.

            For example,
            ```
            array([[True , False, True , True , False, True ],
                  [True , False, True , False, False, True ],
                  [True , False, True , False, False, False],
                  [True , True , False, False, False, False],
                  [False, False, True , True , True , True ],
                  [False, False, True , False, True , True ],
                  [False, False, True , False, True , False],
                  [True , True , False, False, False, False]])
            ```
            The corresponding column labels are available as self.columns_,
            e.g., ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']

        Returns
        ------------
        X : list of lists
            A python list of lists, where the outer list stores the
            n transactions and the inner list stores the items in each
            transaction, listed in column order.

          For example,
          ```
          [['Apple', 'Beer', 'Chicken', 'Rice'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Beer', 'Chicken', 'Milk', 'Rice'],
           ['Beer', 'Milk', 'Rice'],
           ['Beer', 'Milk'],
           ['Apple', 'Bananas']]
          ```

        """
        return [
            [self.columns_[idx] for idx, cell in enumerate(row) if cell]
            for row in array
        ]

    def fit_transform(self, X, sparse=False):
        """Fit a TransactionEncoder encoder and transform a dataset."""
        return self.fit(X).transform(X, sparse=sparse)

In [5]:

def generate_new_combinations(old_combinations):
    """
    Lazily produce the candidate itemsets for the next Apriori level.

    Parameters
    -----------
    old_combinations: np.array
        Matrix of the combinations that had enough support in the
        previous step. One row per combination, one column per item;
        each row lists item type ids in ascending order, e.g.
        ```
               0        1
        0      15       20
        1      15       22
        2      17       19
        ```

    Returns
    -----------
    Generator yielding a flat stream of item ids: for every old
    combination and every item id (taken from `old_combinations`) that is
    strictly greater than the combination's last id, the ids of the old
    combination are yielded followed by that extra id.

    Examples
    -----------
    For usage examples, please see
    https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori

    """

    known_items = np.unique(old_combinations.flatten())
    for row in old_combinations:
        prefix = tuple(row)
        # Extending only with larger ids keeps every candidate sorted and
        # prevents the same itemset from being emitted twice.
        for candidate in known_items[known_items > row[-1]]:
            yield from prefix
            yield candidate


def generate_new_combinations_low_memory(old_combinations, X, min_support, is_sparse):
    """
    Lazily produce the supported candidate itemsets for the next Apriori
    level, computing their transaction counts on the fly.

    Parameters
    -----------
    old_combinations: np.array
        Matrix of the combinations that had enough support in the
        previous step. One row per combination, one column per item;
        each row lists item type ids in ascending order, e.g.
        ```
               0        1
        0      15       20
        1      15       22
        2      17       19
        ```

    X: np.array or scipy sparse matrix
      One-hot transaction matrix (rows are transactions, columns are
      items). The allowed values are either 0/1 or True/False.

    min_support : float
      Minimum support; a candidate is kept when its transaction count is
      at least `min_support * X.shape[0]`.

    is_sparse : bool True if X is sparse

    Returns
    -----------
    Generator yielding a flat stream of numbers. For every retained
    candidate it yields the number of transactions containing it,
    followed by the item ids of the old combination and the id of the
    added item.

    Examples
    -----------
    For usage examples, please see
    https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/

    """

    known_items = np.unique(old_combinations.flatten())
    n_rows = X.shape[0]
    min_count = min_support * n_rows
    for row in old_combinations:
        # Only ids above the last one can extend this (sorted) combination.
        candidates = known_items[known_items > row[-1]]
        prefix = tuple(row)
        if is_sparse:
            # Densify just the columns needed for this combination.
            rows_with_prefix = X[:, prefix].toarray().all(axis=1)
            candidate_cols = X[:, candidates].toarray()
            counts = candidate_cols[rows_with_prefix].sum(axis=0)
        else:
            rows_with_prefix = X[:, prefix].all(axis=1)
            counts = X[rows_with_prefix][:, candidates].sum(axis=0)
        for pos in (counts >= min_count).nonzero()[0]:
            yield counts[pos]
            yield from prefix
            yield candidates[pos]


def apriori(
    df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, low_memory=False, count = False
):
    """Get frequent itemsets from a one-hot DataFrame

    Parameters
    -----------
    df : pandas DataFrame
      pandas DataFrame the encoded format. Also supports
      DataFrames with sparse data; for more info, please
      see (https://pandas.pydata.org/pandas-docs/stable/
           user_guide/sparse.html#sparse-data-structures)

      Please note that the old pandas SparseDataFrame format
      is no longer supported in mlxtend >= 0.17.2.

      The allowed values are either 0/1 or True/False.
      For example,

    ```
             Apple  Bananas   Beer  Chicken   Milk   Rice
        0     True    False   True     True  False   True
        1     True    False   True    False  False   True
        2     True    False   True    False  False  False
        3     True     True  False    False  False  False
        4    False    False   True     True   True   True
        5    False    False   True    False   True   True
        6    False    False   True    False   True  False
        7     True     True  False    False  False  False
    ```

    min_support : float (default: 0.5)
      Minimum support of the itemsets returned. Must be positive.
      With `count=False` it is a fraction between 0 and 1, where support
      is `transactions_where_item(s)_occur / total_transactions`.
      With `count=True` it is compared against the absolute number of
      transactions in which the itemset occurs.

    use_colnames : bool (default: False)
      If `True`, uses the DataFrames' column names in the returned DataFrame
      instead of column indices.

    max_len : int (default: None)
      Maximum length of the itemsets generated. If `None` (default) all
      possible itemsets lengths (under the apriori condition) are evaluated.

    verbose : int (default: 0)
      If >= 1, prints the size of the candidate array and the itemset
      length being processed at each iteration.

    low_memory : bool (default: False)
      If `True`, uses an iterator to search for combinations above
      `min_support`.
      Note that while `low_memory=True` should only be used for large dataset
      if memory resources are limited, because this implementation is approx.
      3-6x slower than the default.

    count : bool (default: False)
      If `True` (the literal boolean), absolute transaction counts are
      reported instead of fractional support, the first result column is
      named 'count', and `min_support` is interpreted as a minimum count.


    Returns
    -----------
    pandas DataFrame with columns ['support', 'itemsets'] (or
      ['count', 'itemsets'] when `count=True`) of all itemsets
      that are >= `min_support` and whose length is <= `max_len`
      (if `max_len` is not None).
      Each itemset in the 'itemsets' column is of type `frozenset`,
      which is a Python built-in type that behaves similarly to
      sets except that it is immutable
      (For more info, see
      https://docs.python.org/3.6/library/stdtypes.html#frozenset).

    Raises
    -----------
    ValueError
      If `min_support` is not positive.

    Examples
    -----------
    For usage examples, please see
    https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

    """

    def _support(_x, _n_rows, _is_sparse, count = False):
        """DRY private method to calculate support as the
        column-wise sum of values / number of rows (or the raw
        column-wise sum when `count` is True)

        Parameters
        -----------

        _x : matrix of bools or binary

        _n_rows : numeric, number of rows in _x

        _is_sparse : bool True if _x is sparse (currently unused)

        count : bool, return raw sums instead of fractions

        Returns
        -----------
        np.array, shape = (n_columns, )

        """
        n = 1 if count is True else _n_rows
        out = np.sum(_x, axis=0) / n
        # np.sum on a sparse matrix returns an np.matrix; flatten to 1-D.
        return np.array(out).reshape(-1)

    if min_support <= 0.0:
        raise ValueError(
            "`min_support` must be a positive "
            "number within the interval `(0, 1]`. "
            "Got %s." % min_support
        )

    valid_input_check(df)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False

    # Level 1: support (or count) of every single item.
    support = _support(X, X.shape[0], is_sparse, count)
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    all_ones = np.ones((int(rows_count), 1))

    as_count = count is True
    # Divisor turning raw transaction counts into the reported measure.
    support_divisor = 1.0 if as_count else rows_count
    # The low-memory generator keeps candidates whose count is
    # >= fraction * n_rows. In count mode the threshold is an absolute count,
    # so hand it an equivalent fraction. Counts are integers, hence
    # `count >= min_support` is the same as `count >= ceil(min_support)`;
    # aiming half a unit below that integer makes the comparison immune to
    # floating point round-off in the divide/multiply round trip.
    if as_count and rows_count > 0:
        low_memory_min_support = (np.ceil(min_support) - 0.5) / rows_count
    else:
        low_memory_min_support = min_support

    while max_itemset and max_itemset < (max_len or float("inf")):
        next_max_itemset = max_itemset + 1

        # With exceptionally large datasets, the matrix operations can use a
        # substantial amount of memory. For low memory applications or large
        # datasets, set `low_memory=True` to use a slower but more memory-
        # efficient implementation.
        if low_memory:
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, low_memory_min_support, is_sparse
            )
            # slightly faster than creating an array from a list of tuples
            combin = np.fromiter(combin, dtype=int)
            # Each record is: transaction count followed by the item ids.
            combin = combin.reshape(-1, next_max_itemset + 1)

            if combin.size == 0:
                break
            if verbose:
                print(
                    "\rProcessing %d combinations | Sampling itemset size %d"
                    % (combin.size, next_max_itemset),
                    end="",
                )

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = (
                combin[:, 0].astype(float) / support_divisor
            )
            max_itemset = next_max_itemset
        else:
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)

            if combin.size == 0:
                break
            if verbose:
                print(
                    "\rProcessing %d combinations | Sampling itemset size %d"
                    % (combin.size, next_max_itemset),
                    end="",
                )

            if is_sparse:
                _bools = X[:, combin[:, 0]] == all_ones
                for n in range(1, combin.shape[1]):
                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
            else:
                _bools = np.all(X[:, combin], axis=2)

            support = _support(np.array(_bools), rows_count, is_sparse, count)
            _mask = (support >= min_support).reshape(-1)
            if any(_mask):
                itemset_dict[next_max_itemset] = np.array(combin[_mask])
                support_dict[next_max_itemset] = np.array(support[_mask])
                max_itemset = next_max_itemset
            else:
                # Exit condition
                break

    # Assemble one (measure, itemset) frame per itemset length and stack them.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]], dtype="object")

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ["count" if as_count else "support", "itemsets"]
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df["itemsets"] = res_df["itemsets"].apply(
            lambda x: frozenset([mapping[i] for i in x])
        )
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used

    return res_df

In [6]:
# Toy one-hot basket data: one list per item, one position per transaction.
apples = [True, True, True, True, False, False, False, True]
bananas = [False, False, False, True, False, False, False, True]
beer = [True, True, True, False, True, True, True, False]
chicken = [True, False, False, False, True, False, False, False]
milk = [False, False, False, False, True, True, True, False]
rice = [True, True, False, False, True, True, False, False]
df = pd.DataFrame(
    {
        'Apple': apples,
        'Banana': bananas,
        'Beer': beer,
        'Chicken': chicken,
        'Milk': milk,
        'Rice': rice,
    }
)
df

Unnamed: 0,Apple,Banana,Beer,Chicken,Milk,Rice
0,True,False,True,True,False,True
1,True,False,True,False,False,True
2,True,False,True,False,False,False
3,True,True,False,False,False,False
4,False,False,True,True,True,True
5,False,False,True,False,True,True
6,False,False,True,False,True,False
7,True,True,False,False,False,False


In [19]:
#tree, rank = setup_fptree(df,1)

In [20]:
#rank

In [21]:
#np.nonzero(item_support >= min_support)

In [8]:
# The dense demo frame does not expose the `.sparse` accessor, so this prints
# False; `apriori` uses the same check to choose its dense code path.
print(hasattr(df, "sparse"))

False


In [46]:
# NOTE(review): with the dense `df` this raises the AttributeError shown below.
# It only works once `df` has been rebuilt from a sparse matrix; consider
# removing this cell so the notebook survives Restart & Run All.
df.sparse.to_coo()

AttributeError: Can only use the '.sparse' accessor with Sparse data.

In [7]:
# Pass the demo frame through get_dummies, then keep only its values as a
# SciPy CSR matrix.
# NOTE(review): `df_ohe` is rebound from a DataFrame to a csr_matrix here.
df_ohe = pd.get_dummies(df)
df_ohe = csr_matrix(df_ohe.values)

In [8]:
# Wrap the CSR matrix in a sparse-backed DataFrame and restore the item names.
# NOTE(review): this overwrites the dense `df` defined earlier, so cells that
# use `df` behave differently depending on execution order.
df = pd.DataFrame.sparse.from_spmatrix(df_ohe, columns = ['Apple','Banana','Beer','Chicken','Milk','Rice'])

In [9]:
# The sparse-backed frame is still reported as a regular pandas DataFrame
# (see the output below).
type(df)

pandas.core.frame.DataFrame

In [10]:
# count=True reports absolute transaction counts in a 'count' column instead
# of fractional support. In that mode the default (non low-memory) path
# compares `min_support` against those raw counts, which is why itemsets with
# a count of 1 appear in the output below.
apriori(df, min_support=0.5,use_colnames=True, count = True)

Unnamed: 0,count,itemsets
0,5.0,(Apple)
1,2.0,(Banana)
2,6.0,(Beer)
3,2.0,(Chicken)
4,3.0,(Milk)
5,4.0,(Rice)
6,2.0,"(Apple, Banana)"
7,3.0,"(Apple, Beer)"
8,1.0,"(Apple, Chicken)"
9,2.0,"(Apple, Rice)"


- big: https://www.kaggle.com/datasets/conorsully1/simulated-transactions
- small: https://www.kaggle.com/datasets/hunter0007/ecommerce-dataset-for-predictive-marketing-2023

In [11]:
from tqdm import tqdm

In [52]:
import time

# Stream the transaction file in 1,000,000-row chunks and mine itemset counts
# per chunk; the per-chunk counts are summed per itemset in a later cell.
# NOTE(review): baskets are built inside each chunk, so a (CUST_ID, DATE) pair
# whose rows straddle a chunk boundary is counted as two separate baskets.
reader = pd.read_csv('../Dataset/transactions.csv', chunksize = 1000000, parse_dates = ['DATE'])
preps = pd.DataFrame(columns = ['Bills and Utilities','Clothing','Education','Entertainment','Fines','Gambling',
                                'Groceries','Health','Housing','Motor/Travel','Savings','Tax'])
chunk_results = []  # one apriori result frame per chunk
total = 0           # running number of baskets across all chunks
start_time = time.time()
for chunk in tqdm(reader):
    prep = chunk[['CUST_ID', 'DATE', 'EXP_TYPE']]
    # One basket per customer and day, holding the distinct expense types.
    prep = prep.groupby(['CUST_ID','DATE'])['EXP_TYPE'].apply(set).apply(list).reset_index()
    # One-hot encode the baskets: explode to one row per item, encode, then
    # fold the rows of each basket back together.
    prep_ohe = pd.get_dummies(prep['EXP_TYPE'].explode()).groupby(level = 0).sum()
    prep_ohe2 = csr_matrix(prep_ohe.values)
    df4 = pd.DataFrame.sparse.from_spmatrix(prep_ohe2, columns = prep_ohe.columns)
    # With count=True the tiny threshold keeps every itemset that occurs at
    # least once in this chunk, so the per-chunk counts can be added up later.
    temp = apriori(df4, min_support=0.000001,use_colnames=True, count = True)
    chunk_results.append(temp)
    total+=len(df4)
# Concatenate once instead of re-copying the growing frame on every iteration.
preps = pd.concat([preps, *chunk_results])
prep_time_large = time.time()-start_time

0it [00:00, ?it/s]

18it [11:59, 41.79s/it]

In [162]:
# Elapsed wall-clock seconds for the chunked mining loop and the total number
# of baskets processed.
print(prep_time_large, total)

12295.005112409592 87861639


In [165]:
# Add up the per-chunk counts of every itemset, then express each total as a
# fraction of all baskets, rendered as a fixed 10-decimal string.
itemsets = (
    preps[['itemsets', 'count']]
    .groupby('itemsets')
    .sum()
    .reset_index()
)
itemsets['support'] = (itemsets['count'] / total).map('{:.10f}'.format)

In [146]:
# Persist the aggregated itemsets so the expensive mining loop need not be re-run.
# NOTE(review): this writes to the current working directory, whereas the
# reload cell reads '../Dataset/itemsets.csv' — confirm which path is intended.
itemsets.to_csv('itemsets.csv', encoding='utf-8', index=False)

In [13]:
import ast

# Reload the saved itemsets. `support` comes back as a number and is
# re-rendered as a fixed 10-decimal string.
itemsets = pd.read_csv('../Dataset/itemsets.csv')
itemsets['support'] = itemsets['support'].apply(lambda value: '{:.10f}'.format(value))
# Each stored itemset is text; take what sits between the first '{' and the
# following '}' (presumably the repr of a frozenset -- TODO confirm) and
# parse it as a Python list literal.
itemsets['itemsets'] = itemsets['itemsets'].map(
    lambda text: ast.literal_eval('[' + text.split("{")[1].split("}")[0] + ']')
)
itemsets

Unnamed: 0,itemsets,count,support
0,[Bills and Utilities],8373041.0,0.0952980288
1,"[Education, Bills and Utilities, Health, Enter...",72.0,0.0000008195
2,"[Gambling, Education, Bills and Utilities, Hea...",40.0,0.0000004553
3,"[Clothing, Education, Savings, Entertainment, ...",11.0,0.0000001252
4,"[Clothing, Gambling, Education, Motor/Travel, ...",24.0,0.0000002732
...,...,...,...
2530,"[Clothing, Gambling, Health, Savings, Groceries]",199.0,0.0000022649
2531,"[Clothing, Gambling, Motor/Travel, Housing, Gr...",573.0,0.0000065216
2532,"[Clothing, Gambling, Tax, Housing, Groceries]",232.0,0.0000026405
2533,"[Clothing, Gambling, Motor/Travel, Groceries, ...",1005.0,0.0000114384


In [14]:
# Twenty itemsets with the highest counts, renumbered from 0.
itemsets.sort_values('count', ascending=False).head(20).reset_index(drop=True)

Unnamed: 0,itemsets,count,support
0,[Entertainment],67857929.0,0.7723271472
1,[Groceries],35821790.0,0.4077068264
2,"[Entertainment, Groceries]",24745470.0,0.2816413429
3,[Motor/Travel],22958999.0,0.2613085672
4,"[Motor/Travel, Entertainment]",15859310.0,0.1805032342
5,"[Motor/Travel, Groceries]",8390830.0,0.0955004948
6,[Bills and Utilities],8373041.0,0.0952980288
7,[Health],7214011.0,0.0821064925
8,[Clothing],7127945.0,0.0811269296
9,"[Motor/Travel, Entertainment, Groceries]",5801228.0,0.0660268584


In [78]:
# NOTE(review): the output below is a CUST_ID / DATE / EXP_TYPE frame, but no
# live statement before this cell in this part of the notebook builds such a
# `df2`; the `df2` assigned further down is a one-hot frame of provinces.
# This cell appears to rely on kernel state from an earlier session --
# confirm, then re-create the frame explicitly or drop the cell.
df2

Unnamed: 0,CUST_ID,DATE,EXP_TYPE
0,CI6XLYUMQK,2015-09-11,Motor/Travel
1,CI6XLYUMQK,2017-02-08,Motor/Travel
2,CI6XLYUMQK,2015-08-01,Housing
3,CI6XLYUMQK,2019-03-16,Entertainment
4,CI6XLYUMQK,2015-05-15,Entertainment
...,...,...,...
261969714,CIQJZREHDP,2013-05-24,Groceries
261969715,CIQJZREHDP,2013-05-02,Groceries
261969716,CIQJZREHDP,2015-02-04,Groceries
261969717,CIQJZREHDP,2012-01-22,Bills and Utilities


In [16]:
# Build one basket per postal code: the distinct sub-categories sold there.
com = (
    pd.read_csv('../Dataset/SampleSuperstore.csv')
    .loc[:, ['Postal Code', 'Sub-Category']]
    .drop_duplicates()
    .groupby('Postal Code')
    .agg(list)
    .reset_index()
)
com.head()

Unnamed: 0,Postal Code,Sub-Category
0,1040,[Bookcases]
1,1453,"[Binders, Storage, Paper, Labels]"
2,1752,[Furnishings]
3,1810,"[Storage, Chairs, Paper, Binders]"
4,1841,"[Furnishings, Storage, Fasteners, Labels, Bind..."


In [17]:
# One-hot encode the per-postal-code baskets: one row per basket, one
# column per sub-category.
product = com['Sub-Category'].tolist()
te = TransactionEncoder()
te.fit(product)
te_ary = te.transform(product)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,Accessories,Appliances,Art,Binders,Bookcases,Chairs,Copiers,Envelopes,Fasteners,Furnishings,Labels,Machines,Paper,Phones,Storage,Supplies,Tables
0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False
4,False,True,True,True,False,True,True,False,True,True,True,False,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,False,False,False,False,False,True,False,False,False,True,False,False,True,True,True,False,False
627,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
628,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False
629,True,False,True,False,False,True,False,False,False,False,False,True,False,False,True,False,True


In [18]:
# Frequent sub-category itemsets that appear in at least 10% of the baskets.
apriori(
    df,
    min_support=0.1,
    use_colnames=True,
)

Unnamed: 0,support,itemsets
0,0.462758,(Accessories)
1,0.353407,(Appliances)
2,0.469097,(Art)
3,0.637084,(Binders)
4,0.218700,(Bookcases)
...,...,...
2368,0.101426,"(Chairs, Storage, Art, Furnishings, Binders, L..."
2369,0.110935,"(Chairs, Storage, Art, Furnishings, Binders, P..."
2370,0.112520,"(Accessories, Chairs, Appliances, Storage, Art..."
2371,0.104596,"(Accessories, Appliances, Storage, Art, Furnis..."


In [51]:
# NOTE(review): `food` is first assigned in the cell that follows this one,
# and this cell's execution count (In [51]) is higher than that cell's
# (In [19]). On a fresh top-to-bottom run this line fails unless `food` is
# defined earlier, outside the part of the notebook visible here -- confirm
# and move this check below the cell that builds `food`.
len(food)

1630

In [19]:
# Build one basket per restaurant name: the distinct provinces it appears in.
food = (
    pd.read_csv('../Dataset/Datafiniti_Fast_Food_Restaurants.csv')
    .loc[:, ['name', 'province']]
    .drop_duplicates()
    .groupby('name')
    .agg(list)
    .reset_index()
)
food

Unnamed: 0,name,province
0,7-Eleven,"[NY, TX, IL, MD, CA, DE, FL, UT, PA]"
1,90 Miles Cuban Cafe,[IL]
2,A&W/Long John Silvers,[PA]
3,A's Ace Burgers,[CA]
4,Abruzzi Pizza,[PA]
...,...,...
566,Zaxby's,[NC]
567,Zaxby's Chicken Fingers & Buffalo Wings,[GA]
568,Zio's Italian Kitchen,[OK]
569,b.good,[MA]


In [20]:
# One-hot encode the per-restaurant baskets: one row per restaurant name,
# one column per province.
product = food['province'].tolist()
te2 = TransactionEncoder()
te2.fit(product)
te_ary2 = te2.transform(product)
df2 = pd.DataFrame(te_ary2, columns=te2.columns_)
df2

Unnamed: 0,AK,AL,AR,AZ,CA,CO,CT,DE,FL,GA,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,False,False,False,False,True,False,False,True,True,False,...,False,False,True,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
567,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
568,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
569,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
# Frequent province itemsets that appear in at least 2.3% of the baskets.
apriori(
    df2,
    min_support=0.023,
    use_colnames=True,
)

Unnamed: 0,support,itemsets
0,0.042032,(AR)
1,0.080560,(AZ)
2,0.236427,(CA)
3,0.057793,(CO)
4,0.031524,(CT)
...,...,...
3335,0.024518,"(FL, KY, OK, AZ, TX, PA, CA, OH)"
3336,0.024518,"(FL, KY, OK, AZ, TX, VA, CA, OH)"
3337,0.024518,"(FL, NC, AZ, TX, PA, NJ, CA, OH)"
3338,0.024518,"(FL, IL, MN, NC, TX, MI, CA, OH)"
