# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz wirtualne środowisko:
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [4]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [5]:
# Constants used by the notebook.

# Path of the baskets CSV file handed to read_baskets().
PATH = './basket.csv'
# Minimum support threshold handed to get_supports(); product sets below it are not stored.
EPSILON = 0.001

In [6]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a CSV file.

    The first line is treated as a header and skipped. Every following
    non-empty line becomes one basket: the line is split on commas, empty
    fields are dropped and the remaining product names are lower-cased.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of product names per non-empty data line, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip the header row; harmless when the file is empty
        lines = [line.rstrip('\n') for line in f]
    return [
        {field.lower() for field in line.split(',') if field}
        for line in lines
        if line
    ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product appearing in *baskets*, sorted ascending."""
    distinct = {product for basket in baskets for product in basket}
    return sorted(distinct)

# Load the baskets from PATH and derive the sorted list of distinct products from them.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [15]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def stringify_tuple(tpl: tuple[str]):
    """Build the dictionary key for a group of products.

    The names are sorted before being joined with commas, so the same
    products always produce the same key regardless of their input order.
    """
    ordered = sorted(tpl)
    return ','.join(ordered)

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict[str, float]:
    """Compute the support of every product set whose support is at least *epsilon*.

    Support is the fraction of baskets that contain all products of a set.
    The search starts from single products and grows a set by one product
    at a time, but only while the grown set still reaches *epsilon*.

    Args:
        baskets: The baskets to count in; each basket is a collection of
            product names.
        all_products: The products to combine.
        epsilon: Minimum support a product set needs to be stored.

    Returns:
        A dictionary mapping ``stringify_tuple(product_set)`` to the support
        of that set. Keys are order-independent, so one product set has
        exactly one entry. Empty when *baskets* is empty.

    Side effects:
        Prints every stored product set together with its support.
    """
    result = {}
    if not baskets:
        # Nothing to count; also avoids dividing by zero below.
        return result

    total_baskets = len(baskets)

    def recursive_support(current_products: tuple[str], start: int, matching_baskets):
        # A set is extended only with products that come later in
        # all_products, so each combination is evaluated once instead of
        # once per ordering. Nothing is lost: every subset of a set that
        # reaches epsilon reaches epsilon too, so the ordered prefixes of
        # such a set are always explored.
        for index in range(start, len(all_products)):
            product = all_products[index]
            if product in current_products:  # skip already used product
                continue

            new_products = current_products + (product,)
            new_products_key = stringify_tuple(new_products)
            if new_products_key in result:  # already calculated
                continue

            # matching_baskets holds exactly the baskets containing
            # current_products, so only the new product has to be checked.
            containing = [basket for basket in matching_baskets if product in basket]
            calculated_support = len(containing) / total_baskets
            if calculated_support >= epsilon:
                result[new_products_key] = calculated_support
                print(new_products, calculated_support)
                recursive_support(new_products, index + 1, containing)

    recursive_support((), 0, baskets)

    return result
    
# Compute the supports of all product sets reaching EPSILON; the bare name on the
# last line makes the notebook display the resulting dictionary.
supports = get_supports(baskets, products, EPSILON)
supports

('abrasive cleaner',) 0.0014702933903628951
('artif. sweetener',) 0.0019381140145692708
('baking powder',) 0.008086613646995923
('bathroom cleaner',) 0.0011361358016440553
('beef',) 0.03395041101383412
('beef', 'bottled beer') 0.0010693042839002875
('beef', 'bottled water') 0.0013366303548753592
('beef', 'brown bread') 0.0015371249081066632
('beef', 'butter') 0.0011361358016440553
('beef', 'canned beer') 0.0010024727661565194
('beef', 'citrus fruit') 0.001804450979081735
('beef', 'curd') 0.0012697988371315912
('beef', 'domestic eggs') 0.0011361358016440553
('beef', 'frankfurter') 0.0010024727661565194
('beef', 'frozen vegetables') 0.0012697988371315912
('beef', 'fruit/vegetable juice') 0.0010693042839002875
('beef', 'margarine') 0.001403461872619127
('beef', 'newspapers') 0.001670787943594199
('beef', 'other vegetables') 0.002806923745238254
('beef', 'pastry') 0.0012029673193878234
('beef', 'rolls/buns') 0.001603956425850431
('beef', 'root vegetables') 0.001670787943594199
('beef', 'sh

{'abrasive cleaner': 0.0014702933903628951,
 'artif. sweetener': 0.0019381140145692708,
 'baking powder': 0.008086613646995923,
 'bathroom cleaner': 0.0011361358016440553,
 'beef': 0.03395041101383412,
 'beef,bottled beer': 0.0010693042839002875,
 'beef,bottled water': 0.0013366303548753592,
 'beef,brown bread': 0.0015371249081066632,
 'beef,butter': 0.0011361358016440553,
 'beef,canned beer': 0.0010024727661565194,
 'beef,citrus fruit': 0.001804450979081735,
 'beef,curd': 0.0012697988371315912,
 'beef,domestic eggs': 0.0011361358016440553,
 'beef,frankfurter': 0.0010024727661565194,
 'beef,frozen vegetables': 0.0012697988371315912,
 'beef,fruit/vegetable juice': 0.0010693042839002875,
 'beef,margarine': 0.001403461872619127,
 'beef,newspapers': 0.001670787943594199,
 'beef,other vegetables': 0.002806923745238254,
 'beef,pastry': 0.0012029673193878234,
 'beef,rolls/buns': 0.001603956425850431,
 'beef,root vegetables': 0.001670787943594199,
 'beef,shopping bags': 0.0012697988371315912,


In [None]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    """Look up the stored support of *products*.

    Returns 0.0 when the product set has no entry in *supports*.
    """
    return supports.get(stringify_tuple(products), 0.0)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule *prior_products* -> *following_products*.

    Confidence is the support of both product groups together divided by
    the support of *prior_products*.

    Args:
        supports: Dictionary of supports keyed by ``stringify_tuple``.
        prior_products: Products on the left-hand side of the rule.
        following_products: Products on the right-hand side of the rule.

    Returns:
        The confidence, or 0.0 when the combined product set has no entry
        in *supports* or the support of *prior_products* is zero.
    """
    # Convert both sides to tuples before joining them: callers also pass
    # sets, and sets cannot be combined with `+`.
    new_products = tuple(prior_products) + tuple(following_products)
    joint_support = supports.get(stringify_tuple(new_products))
    prior_support = support(supports, prior_products)
    if joint_support is None or not prior_support:
        # Unknown rule, or a zero denominator (e.g. an empty left-hand side).
        return 0.0
    return joint_support / prior_support
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule *prior_products* -> *following_products*.

    Lift is the support of both product groups together divided by the
    product of their individual supports.

    Args:
        supports: Dictionary of supports keyed by ``stringify_tuple``.
        prior_products: Products on the left-hand side of the rule.
        following_products: Products on the right-hand side of the rule.

    Returns:
        The lift, or 0.0 when the combined product set has no entry in
        *supports* or either individual support is zero.
    """
    # Convert both sides to tuples before joining them: callers also pass
    # sets, and sets cannot be combined with `+`.
    new_products = tuple(prior_products) + tuple(following_products)
    joint_support = supports.get(stringify_tuple(new_products))
    denominator = support(supports, prior_products) * support(supports, following_products)
    if joint_support is None or not denominator:
        # Unknown rule, or a zero denominator (e.g. an empty side).
        return 0.0
    return joint_support / denominator

In [None]:
# Try the three measures on the sample rule {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

## Część 3. - generowanie rekomendacji

In [None]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Suggest products to add to *basket*, based on association rules.

    For every non-empty sub-basket of *basket* and every product not yet in
    the basket, the rule ``sub-basket -> product`` is scored with confidence
    and lift. Only rules with lift greater than 1 are kept.

    Args:
        basket: Products already chosen.
        products: Products that may be recommended.
        supports: Dictionary of supports whose keys are the product names
            of a set, sorted and joined with commas.

    Returns:
        A list of ``(item, subbasket, confidence, lift)`` tuples ordered by
        confidence, then lift, both descending. Empty when no rule
        qualifies.
    """
    from itertools import combinations

    def key_of(items):
        # Same key format as the entries of `supports`.
        return ','.join(sorted(items))

    basket_items = sorted(set(basket))

    # Collect the sub-baskets that have a stored, non-zero support; only
    # those can serve as the left-hand side of a rule.
    antecedents = []
    for size in range(1, len(basket_items) + 1):
        found_any = False
        for subbasket in combinations(basket_items, size):
            subbasket_support = supports.get(key_of(subbasket), 0.0)
            if subbasket_support > 0:
                antecedents.append((subbasket, subbasket_support))
                found_any = True
        if not found_any:
            # Assumes `supports` contains every subset of each stored set
            # (true for a support threshold search): once no sub-basket of
            # this size is stored, no larger one can be.
            break

    candidates = []
    for item in products:
        if item in basket_items:
            continue
        item_support = supports.get(key_of((item,)), 0.0)
        if item_support <= 0:
            continue
        for subbasket, subbasket_support in antecedents:
            joint_support = supports.get(key_of(subbasket + (item,)))
            if joint_support is None:
                continue
            rule_confidence = joint_support / subbasket_support
            rule_lift = joint_support / (subbasket_support * item_support)
            if rule_lift > 1:
                candidates.append((item, subbasket, rule_confidence, rule_lift))

    # Highest confidence first; lift breaks ties.
    candidates.sort(key=lambda candidate: (-candidate[2], -candidate[3]))
    return candidates

In [None]:
# Show the basket at index 1 and the candidates generated for it.
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

In [None]:
# Show the basket at index 33 and the candidates generated for it.
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)