# Avtomatski izbor artiklov za personaliziran e-katalog

## Branje datotek
Incializacija in deklaracija seznamov, ki hranijo objekte za vsako vrstico iz csv datotek.

In [1]:
class OrderProduct:
    def __init__(self, order_id, product_id, add_to_cart_order, reordered):
        """
        Initialization of an order product

        :param str order_id: Id of an order
        :param str product_id: Id of a product
        :param str add_to_cart_order: Sequence in which products were added to cart
        :param str reordered: Prouct is reordered (1-yes, 0-no)
        """
        self.order_id = order_id
        self.product_id = product_id
        self.add_to_cart_order = add_to_cart_order
        self.reordered = reordered
        
class Order:
    def __init__(self, order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order):
        """
        Initialization of an order

        :param str order_id: Id of an order
        :param str user_id: Id of an user
        :param str eval_set: Train or test or prior set
        :param str order_number: The sequence of the user's purchase
        :param str order_dow: Day the product was purchased (0-6)
        :param str order_hour_of_day: Hour the product was purchased (00-23)
        :param str days_since_prior_order: Days from the previous order
        """
        self.order_id = order_id
        self.user_id = user_id
        self.eval_set = eval_set
        self.order_number = order_number
        self.order_dow = order_dow
        self.order_hour_of_day = order_hour_of_day
        self.days_since_prior_order = days_since_prior_order
        
class Product:
    def __init__(self, product_id, product_name, aisle_id, department_id):
        """
        Initialization of a product

        :param str product_id: Id of a product
        :param str product_name: Name of a product
        :param str aisle_id: Id of an aisle
        :param str department_id: Id of a department
        """
        self.product_id = product_id
        self.product_name = product_name
        self.aisle_id = aisle_id
        self.department_id = department_id
        
class Aisle:
    def __init__(self, aisle_id, aisle):
        """
        Initialization of an aisle

        :param str aisle_id: Id of an aisle
        :param str aisle: Name of an aisle
        """
        self.aisle_id = aisle_id
        self.aisle = aisle

class Department:
    def __init__(self, department_id, department):
        """
        Initialization of a department

        :param str department_id: Id of an department
        :param str department: Name of a department
        """
        self.department_id = department_id
        self.department = department

def make_list(path, class_type):
    """
    Makes a list of objects from csv file
    
    :param str path: Path of a csv file
    :param str class_type: Determines which objects are stored to the list
    :return: List of objects
    """
    csv_list = []
    with open(path, mode="r", encoding="utf-8") as f:
        for line in f.readlines()[1:]:
            # Adds an object to the list with as many arguments as there are cells in the row provided by the csv file
            line_split = line.replace("\n", "").split(",")
            csv_list.append(class_type(*line_split))
    return csv_list

# Lists of objects
_order_products_prior = make_list("Data/order_products_prior.csv", OrderProduct)
_order_products_last = make_list("Data/order_products_last.csv", OrderProduct)
_orders = make_list("Data/orders.csv", Order)
_products = make_list("Data/products.csv", Product)
_aisles = make_list("Data/aisles.csv", Aisle)
_departments = make_list("Data/departments.csv", Department)

## Pomožne funkcije
- Izpis besedila na podlagi danega ID-ja
- Filter za izključitev izdelkov, ki so že priporočeni uporabnikom
- Povezava med naročilom in uporabnikom za hitrejše izvajanje programa
- Zadnje naročilo uporabnikov
- Slovar vseh izdelkov z vrednostmi oddelka in prehoda

In [2]:
import random
import math

def get_product_name(product_id):
    """
    Finds the product name according to its id
    
    :param str product_id: Id of a product
    :return: Name of a product or text if doesn't exist
    """
    for product in _products:
        if product_id == product.product_id:
            return product.product_name
    return "Product id doesn't exist!"

def get_aisle_name(aisle_id):
    """
    Finds the aisle name according to its id
    
    :param str aisle_id: Id of an aisle
    :return: Name of an aisle or text if doesn't exist
    """
    for aisle in _aisles:
        if aisle_id == aisle.aisle_id:
            return aisle.aisle
    return "Aisle id doesn't exist!"

def get_department_name(department_id):
    """
    Finds the department name according to its id
    
    :param str department_id: Id of a department
    :return: Name of a product or text if doesn't exist
    """
    for department in _departments:
        if department_id == department.department_id:
            return department.department
    return "Department id doesn't exist!"

def create_filter():
    """
    Creates a filter to exclude products
    
    :return: Filter dictionary with user ids as keys and product ids as values
    """
    filter_dict = {}
    for order in _orders:
        filter_dict[order.user_id] = set()
    return filter_dict

def create_order_user():
    """
    Connects orders with user ids to speed up the loops
    
    :return: Dictionary with order ids as keys and user ids as values
    """
    order_user_dict = {}
    for order in _orders:
        order_user_dict[order.order_id] = order.user_id
    return order_user_dict

def create_user_last_order():
    """
    Creates a dictionary of users' last orders
    
    :return: Dictionary with user ids as keys and set of purchased products (product ids) as values
    """
    last_order_dict = {}
    for order_product in _order_products_last:
        user_id = _order_user[order_product.order_id]
        last_order_dict[user_id] = last_order_dict.get(user_id, set()).union({order_product.product_id})
    return last_order_dict

def create_product_category():
    """
    Creates a double dictionary of product ids
    
    :return: Dictionary with product ids as keys and dictionary of its aisle and department as value
    """
    product_dict = {}
    for product in _products:
        product_dict[product.product_id] = {"aisle": product.aisle_id, "department": product.department_id}
    return product_dict

## Izdelava kataloga
Uporabljene metode:
- Najbolj prodajani izdelki
- Gradientno pospeševanje - Izdelki, ki jih bo uporabnik kupil v nasledjnem naročilu
- Skupinsko filtriranje - Mogoče bi vas zanimalo tudi
- Dvonivojska napoved - Izbor po kategorijah

## Najbolj prodajani izdelki
Metodi za priporočnje najbolj prodajanih izdelkov.

Najbolj prodajani izdelki med vsemi uporabniki.

In [3]:
class BestSellingAll:
    def __init__(self):
        """
        Initialization of a method for best selling products
        """
        self.predicted_products = {}
    
    def fit(self, number_predict, allow_filter = False, update_filter = False):
        """
        Trains the model
        
        :param int number_predict: Number of predicted products
        :param allow_filter: Allow the filter to interfere with the model
        :param update_filter: Update the filter when model is trained
        :type allow_filter: Boolean
        :type update_filter: Boolean
        """
        # Check the number of predictions
        if number_predict <= 0:
            print("The number of predictions must be greater than zero.")
        product_scores = {}
        unavailable_products = {}
        
        # Get unavailable products for all users
        if allow_filter:
            for user_id, products in _filter.items():
                unavailable_products.update(products)
        
        # Adds the number of purchases for each product
        for order_product in _order_products_prior:
            product_id = order_product.product_id
            
            if not (allow_filter and product_id in unavailable_products):
                product_scores[product_id] = product_scores.get(product_id, 0) + 1
        
        # Sorts the dictionary by values from most purchases to least purchases of a product
        product_scores = {k: v for k, v in sorted(product_scores.items(), key=lambda item: item[1], reverse=True)}
        
        # Save predicted products
        count_predict = 0
        for product_id in product_scores:
            self.predicted_products[product_id] = product_scores[product_id]
            
            if update_filter:
                for user_id in _filter:
                    _filter[user_id].add(product_id)
            
            # Check the number of stored products
            count_predict += 1
            if count_predict >= number_predict:
                break
    
    def predict(self, user_id, printout = False):
        """
        Returns best selling products
        
        :param str user_id: Id of an user
        :param printout: Print best selling products and the number of their purchases
        :type printout: Boolean
        :return: Product ids with highest score (best selling products)
        """
        if user_id not in _user_last_order:
            return None
        
        if printout:
            print("Best selling products")
            print("-"*21)
            for product_id, product_purchases in self.predicted_products.items():
                print(get_product_name(product_id), "-", product_purchases)
        
        return set(self.predicted_products.keys())
    
    def evaluate(self, printout = False):
        """
        Evaluation of BestSellingAll method
        
        :param printout: Print precision and recall of BestSellingAll method
        :type printout: Boolean
        :return: precision and recall of BestSellingAll method
        """
        precision_all = []
        recall_all = []
        for user_id in _user_last_order:
            intersect = _user_last_order[user_id].intersection(self.predicted_products)
            precision_all.append(len(intersect) / len(self.predicted_products))
            recall_all.append(len(intersect) / len(_user_last_order[user_id]))
            
        precision = sum(precision_all) / len(precision_all)
        recall = sum(recall_all) / len(recall_all)
        if printout:
            print("Precision:", precision)
            print("Recall:", recall)
            print("F-score:", 2 * precision * recall / (precision + recall))
        
        return (precision, recall)

Najbolj prodajani izdelki za posameznega uporabnika.

In [4]:
class BestSelling:
    def __init__(self):
        """
        Initialization of a method for best selling products
        """
        self.predicted_products = {}
    
    def fit(self, number_predict, allow_filter = False, update_filter = False):
        """
        Trains the model
        
        :param int number_predict: Number of predicted products
        :param allow_filter: Allow the filter to interfere with the model
        :param update_filter: Update the filter when model is trained
        :type allow_filter: Boolean
        :type update_filter: Boolean
        """
        # Check the number of predictions
        if number_predict <= 0:
            print("The number of predictions must be greater than zero.")
        product_scores = {}
        
        # For each user, it adds the number of purchases for each product
        for order_product in _order_products_prior:            
            product_id = order_product.product_id
            user_id = _order_user[order_product.order_id]
            if user_id not in product_scores:
                product_scores[user_id] = {}
            
            if not (allow_filter and product_id in _filter[user_id]):
                product_scores[user_id][product_id] = product_scores[user_id].get(product_id, 0) + 1
        
        # For each user, it sorts the dictionary by values from most purchases to least purchases of a product
        for user_id in product_scores:
            product_scores[user_id] = {k: v for k, v in sorted(product_scores[user_id].items(), key=lambda item: item[1], reverse=True)}
        
        # Save predicted products
        for user_id in product_scores:
            count_predict = 0
            self.predicted_products[user_id] = {}
            
            for product_id in product_scores[user_id]:
                self.predicted_products[user_id][product_id] = product_scores[user_id][product_id]

                if update_filter:
                    _filter[user_id].add(product_id)

                # Check the number of stored products
                count_predict += 1
                if count_predict >= number_predict:
                    break
            
            # Add products if there aren't enough predictions
            if count_predict < number_predict:
                while count_predict < number_predict:
                    if allow_filter:
                        available_products = _product_category.keys() - _filter[user_id].union(self.predicted_products[user_id])
                    else:
                        available_products = _product_category.keys() - self.predicted_products[user_id]
                    random_product_id = random.choice(list(available_products))
                    self.predicted_products[user_id][random_product_id] = 0
                    
                    if update_filter:
                        _filter[user_id].add(random_product_id)
                    
                    count_predict += 1
    
    def predict(self, user_id, printout = False):
        """
        Returns best selling products
        
        :param str user_id: Id of an user
        :param printout: Print best selling products and the number of their purchases
        :type printout: Boolean
        :return: Product ids with highest score (best selling products)
        """
        if user_id not in _user_last_order:
            return None
        
        if printout:
            print("User id:", user_id)
            print("Best selling products")
            print("-"*21)
            
            for product_id, product_purchases in self.predicted_products[user_id].items():
                print(get_product_name(product_id), "-", product_purchases)
        
        return set(self.predicted_products[user_id].keys())
    
    def evaluate(self, printout = False):
        """
        Evaluation of BestSelling method
        
        :param printout: Print precision and recall of BestSelling method
        :type printout: Boolean
        :return: precision and recall of BestSelling method
        """
        precision_all = []
        recall_all = []
        for user_id in _user_last_order:
            intersect = _user_last_order[user_id].intersection(self.predicted_products[user_id])
            precision_all.append(len(intersect) / len(self.predicted_products[user_id]))
            recall_all.append(len(intersect) / len(_user_last_order[user_id]))
            
        precision = sum(precision_all) / len(precision_all)
        recall = sum(recall_all) / len(recall_all)
        if printout:
            print("Precision:", precision)
            print("Recall:", recall)
            print("F-score:", 2 * precision * recall / (precision + recall))
        
        return (precision, recall)

## Gradientno pospeševanje
Metoda za napoved izdelkov, ki jih bo uporabnik kupil v naslednjem naročilu.

Branje datotek in inicializacija ter deklaracija seznamov.

In [5]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import lightgbm as lgb

_orders_gb = pd.read_csv("data/orders.csv", dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

_products_gb = pd.read_csv("data/products.csv", dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

_prior_gb = pd.read_csv("data/order_products_prior.csv", dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

_last_gb = pd.read_csv("data/order_products_last.csv", dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

Ustvarjanje značilk oziroma atributov za dodaten opis kupljenega izdelka.

In [6]:
# Product attributes
temp_products_gb = pd.DataFrame()
temp_products_gb['orders'] = _prior_gb.groupby(_prior_gb.product_id).size().astype(np.int32) # Number of each products purchased
temp_products_gb['reorders'] = _prior_gb['reordered'].groupby(_prior_gb.product_id).sum().astype(np.float32) # Number of each product reorders
temp_products_gb['reorder_rate'] = (temp_products_gb.reorders / temp_products_gb.orders).astype(np.float32) # Ratio of each product reorders
_products_gb = _products_gb.join(temp_products_gb, on='product_id')
_products_gb.set_index('product_id', drop=False, inplace=True)
del temp_products_gb

# Orders and Order products (prior) merge
_orders_gb.set_index('order_id', inplace=True, drop=False)
_prior_gb = _prior_gb.join(_orders_gb, on='order_id', rsuffix='_')
_prior_gb.drop('order_id_', inplace=True, axis=1)

# User attributes
temp_users_gb = pd.DataFrame()
temp_users_gb['average_days_between_orders'] = _orders_gb.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32) # Average days between orders for each user
temp_users_gb['nb_orders'] = _orders_gb.groupby('user_id').size().astype(np.int16) # Number of orders for each user
_users_gb = pd.DataFrame()
_users_gb['total_items'] = _prior_gb.groupby('user_id').size().astype(np.int16) # Number of products bought for each user
_users_gb['all_products'] = _prior_gb.groupby('user_id')['product_id'].apply(set) # Set of products bought (product ids) for each user
_users_gb['total_distinct_items'] = (_users_gb.all_products.map(len)).astype(np.int16) # Number of distinct products bought for each user
_users_gb = _users_gb.join(temp_users_gb)
del temp_users_gb
_users_gb['average_basket'] = (_users_gb.total_items / _users_gb.nb_orders).astype(np.float32) # Average number of products in order

# Atributes of merged users and products
_prior_gb['user_product'] = _prior_gb.product_id + _prior_gb.user_id * 100000 # Merged user ids and product ids
_prior_gb['user_max_order_num'] = _prior_gb.groupby('user_id')['order_number'].transform('max') # Number of a user's last prior order

def create_userXproduct_attributes():
    """
    Creates attributes for each users individually purchased product
    
    :return: Dictionary with merged user ids and product ids as keys and array of attributes as values
    """
    d = dict()
    for row in _prior_gb.itertuples():
        z = row.user_product
        if z not in d:
            d[z] = (1,
                    (row.order_number, row.order_id),
                    row.add_to_cart_order, 1 if row.order_number > row.user_max_order_num-5 else 0)
        else:
            d[z] = (d[z][0] + 1,
                    max(d[z][1], (row.order_number, row.order_id)),
                    d[z][2] + row.add_to_cart_order, d[z][3]+1 if row.order_number > row.user_max_order_num-5 else d[z][3])
    
    return d

_userXproduct_gb = pd.DataFrame.from_dict(create_userXproduct_attributes(), orient='index')
_userXproduct_gb.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'total_buys_n5']
_userXproduct_gb.nb_orders = _userXproduct_gb.nb_orders.astype(np.int16) # The number of individual products purchased for each user
_userXproduct_gb.last_order_id = _userXproduct_gb.last_order_id.map(lambda x: x[1]).astype(np.int32) # Last order containing this product
_userXproduct_gb.sum_pos_in_cart = _userXproduct_gb.sum_pos_in_cart.astype(np.int16) # The sum of the product places in the cart
_userXproduct_gb.total_buys_n5 = _userXproduct_gb.total_buys_n5.astype(np.int16) # The sum of the purchased products in the last five orders
del _prior_gb

# Index for last orders
_last_gb.set_index(['order_id', 'product_id'], inplace=True, drop=False)

Razdelitev zadnjih naročil v učno in testno množico.

In [7]:
def create_data():
    """
    Creates train and test data for Gradient boosting model
    
    :return: Data frame for train and test data
    """
    # Get last orders
    last_orders = _orders_gb[_orders_gb.eval_set == 'last']

    # Split into train and test dataset
    user_list = last_orders['user_id'].unique()
    msk = np.random.rand(len(user_list)) < 0.9
    train_orders = last_orders.loc[last_orders['user_id'].isin(user_list[msk])]
    test_orders = last_orders.loc[last_orders['user_id'].isin(user_list[~msk])]
    
    return (train_orders, test_orders)

Zgradba seznama z individualno združenimi naročili in izdelki na podlagi ustvarjenih značilk.

In [8]:
def create_main_gb_data(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    train_index = set(_last_gb.index)
    
    for row in selected_orders.itertuples():
        user_id = row.user_id
        order_id = row.order_id
        user_prods = _users_gb['all_products'][user_id]
        product_list += user_prods
        order_list += [order_id] * len(user_prods)
        if labels_given:
            labels += [(order_id, prod) in train_index for prod in user_prods]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    # User attributes
    df['user_id'] = df.order_id.map(_orders_gb.user_id)
    df['user_total_orders'] = df.user_id.map(_users_gb.nb_orders)
    df['user_total_items'] = df.user_id.map(_users_gb.total_items)
    df['total_distinct_items'] = df.user_id.map(_users_gb.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(_users_gb.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(_users_gb.average_basket)
    
    # Order attributes
    df['order_hour_of_day'] = df.order_id.map(_orders_gb.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(_orders_gb.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    # Product attributes
    df['aisle_id'] = df.product_id.map(_products_gb.aisle_id)
    df['department_id'] = df.product_id.map(_products_gb.department_id)
    df['product_orders'] = df.product_id.map(_products_gb.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(_products_gb.reorders)
    df['product_reorder_rate'] = df.product_id.map(_products_gb.reorder_rate)

    # user X product attributes
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(_userXproduct_gb.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(_userXproduct_gb.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(_userXproduct_gb.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(_orders_gb.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(_orders_gb.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_total_buys_n5'] = df.z.map(_userXproduct_gb.total_buys_n5)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)

    return (df, labels)

Metoda za gradientno pospeševanje.

In [9]:
class GradientBoosting:
    def __init__(self):
        """
        Initialization of a method for predicting next products purchased
        """
        self.params = {
            'task': 'train',
            'boosting_type': 'gbdt', 
            'objective': 'binary',
            'metric': {'binary_logloss'},
            'num_leaves': 64,
            'max_depth': 10,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.9,
            'bagging_freq': 5,
            'verbose': -1,
        }
        self.rounds = 100
        self.predicted_products = {}
        self.test_users = set()
    
    def fit(self, number_predict, allow_filter = False, update_filter = False):
        """
        Trains the model

        :param int number_predict: Number of predicted products
        :param allow_filter: Allow the filter to interfere with the model
        :param update_filter: Update the filter when model is trained
        :type allow_filter: Boolean
        :type update_filter: Boolean
        """
        # Check the number of predictions
        if number_predict <= 0:
            print("The number of predictions must be greater than zero.")
        product_scores = {}
        
        # Create train and test dataset
        train_orders, test_orders = create_data()
        gb_train, gb_labels = create_main_gb_data(train_orders, labels_given=True)
        gb_test, _ = create_main_gb_data(test_orders)
        gb_attributes = ['user_total_orders', 'user_total_items', 'total_distinct_items',
                'user_average_days_between_orders', 'user_average_basket',
                'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
                'department_id', 'product_orders', 'product_reorders',
                'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
                'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
                'UP_delta_hour_vs_last', 'UP_total_buys_n5'] # Excluded: aisle_id
        
        # Last order for test users
        for row in test_orders.itertuples():
            user_id = _order_user[str(row.order_id)]
            self.test_users.add(user_id)
        
        # Gradient boosting
        gb_dataset = lgb.Dataset(gb_train[gb_attributes],
                      label=gb_labels,
                      categorical_feature=['department_id'])
        gb_trained_data = lgb.train(self.params, gb_dataset, self.rounds)
        gb_predicted_data = gb_trained_data.predict(gb_test[gb_attributes])
        gb_test['pred'] = gb_predicted_data
        
        # For each user, it adds the predicted value of purchase for each product
        for row in gb_test.itertuples():
            user_id = _order_user[str(row.order_id)]
            product_id = str(row.product_id)
            if user_id not in product_scores:
                product_scores[user_id] = {}
            
            if not (allow_filter and product_id in _filter[user_id]):
                product_scores[user_id][product_id] = row.pred
        
        # For each user, it sorts the dictionary by values from most purchases to least purchases of a product
        for user_id in product_scores:
            product_scores[user_id] = {k: v for k, v in sorted(product_scores[user_id].items(), key=lambda item: item[1], reverse=True)}
        
        # Save predicted products
        for user_id in product_scores:
            count_predict = 0
            self.predicted_products[user_id] = {}
            
            for product_id in product_scores[user_id]:
                self.predicted_products[user_id][product_id] = product_scores[user_id][product_id]

                if update_filter:
                    _filter[user_id].add(product_id)

                # Check the number of stored products
                count_predict += 1
                if count_predict >= number_predict:
                    break
            
            # Add products if there aren't enough predictions
            if count_predict < number_predict:
                while count_predict < number_predict:
                    if allow_filter:
                        available_products = _product_category.keys() - _filter[user_id].union(self.predicted_products[user_id])
                    else:
                        available_products = _product_category.keys() - self.predicted_products[user_id]
                    random_product_id = random.choice(list(available_products))
                    self.predicted_products[user_id][random_product_id] = 0
                    
                    if update_filter:
                        _filter[user_id].add(random_product_id)
                    
                    count_predict += 1
    
    def predict(self, user_id, printout = False):
        """
        Predict values

        :param str user_id: Id of an user
        :param printout: Print products that the user will buy in the next order
        :type printout: Boolean
        :return: Predicted product ids for specified user
        """
        if user_id not in self.test_users:
            return None
        
        if printout:
            print("User id:", user_id)
            print("Products for the next order")
            print("-"*27)

            for product_id, product_prediction in self.predicted_products[user_id].items():
                print(get_product_name(product_id), "-", product_prediction)
        
        return set(self.predicted_products[user_id].keys())
    
    def evaluate(self, printout = False):
        """
        Evaluation of GradientBoosting method
        
        :param printout: Print precision and recall of GradientBoosting method
        :type printout: Boolean
        :return: precision and recall of GradientBoosting method
        """
        precision_all = []
        recall_all = []
        for user_id in self.test_users:
            intersect = _user_last_order[user_id].intersection(self.predicted_products[user_id])
            precision_all.append(len(intersect) / len(self.predicted_products[user_id]))
            recall_all.append(len(intersect) / len(_user_last_order[user_id]))
            
        precision = sum(precision_all) / len(precision_all)
        recall = sum(recall_all) / len(recall_all)
        if printout:
            print("Precision:", precision)
            print("Recall:", recall)
            print("F-score:", 2 * precision * recall / (precision + recall))
        
        return (precision, recall)
    
    def get_test_users(self):
        """
        Get a set of test users
        
        :return: Set of test user ids
        """
        return self.test_users

## Skupinsko filtriranje
Metoda za priporočanje izdelkov, ki jih uporabnik še ni kupil (Mogoče bi vas zanimalo tudi...).

In [10]:
class ItemBased:
    def __init__(self):
        """
        Initialization of a method to predict products that the user has not yet purchased
        """
        self.predicted_products = {}
    
    def fit(self, number_predict, allow_filter = False, update_filter = False):
        """
        Trains the model

        :param int number_predict: Number of predicted products
        :param allow_filter: Allow the filter to interfere with the model
        :param update_filter: Update the filter when model is trained
        :type allow_filter: Boolean
        :type update_filter: Boolean
        """
        # Check the number of predictions
        if number_predict <= 0:
            print("The number of predictions must be greater than zero.")
        user_products = {}
        product_users = {}
        similarity = {}
        product_scores = {}
        
        # For each user, it adds the number of purchases for each product
        for order_product in _order_products_prior:            
            product_id = order_product.product_id
            user_id = _order_user[order_product.order_id]
            if product_id not in product_users:
                product_users[product_id] = set()
            if user_id not in user_products:
                user_products[user_id] = {}
            
            product_users[product_id].add(user_id)
            user_products[user_id][product_id] = user_products[user_id].get(product_id, 0) + 1
        
        # Calculate the cosine similarity between two products
        product_users_list = list(product_users)
        list_index = 0
        for product_id_one in product_users_list:
            list_index += 1
            for product_id_two in product_users_list[list_index:]:
                numerator = 0
                denominator_product_one = 0
                denominator_product_two = 0
                
                # Users who purchased both products
                for user_id in product_users[product_id_one].intersection(product_users[product_id_two]):
                    product_score_one = user_products[user_id][product_id_one]
                    product_score_two = user_products[user_id][product_id_two]
                    numerator += product_score_one * product_score_two
                    denominator_product_one += product_score_one * product_score_one
                    denominator_product_two += product_score_two * product_score_two
                
                denominator = math.sqrt(denominator_product_one) + math.sqrt(denominator_product_two)
                similarity[(product_id_one, product_id_two)] = numerator / denominator if denominator else 0
        
        # Calculate the predictions
        for user_id in user_products:
            # Products that the user has not yet purchased
            for product_id in (product_users.keys() - user_products[user_id].keys()):
                numerator = 0
                denominator = 0
                
                # Products that the user has already purchased
                for user_product_id in user_products[user_id]:
                    if (product_id, user_product_id) in similarity:
                        numerator += user_products[user_id][user_product_id] * similarity[(product_id, user_product_id)]
                        denominator += similarity[(product_id, user_product_id)]
                    else:
                        numerator += user_products[user_id][user_product_id] * similarity[(user_product_id, product_id)]
                        denominator += similarity[(user_product_id, product_id)]
                
                if user_id not in product_scores:
                    product_scores[user_id] = {}
                    
                if not (allow_filter and product_id in _filter[user_id]):
                    product_scores[user_id][product_id] = numerator / denominator if denominator else 0
        
        # For each user, it sorts the dictionary by values from most purchases to least purchases of a product
        for user_id in product_scores:
            product_scores[user_id] = {k: v for k, v in sorted(product_scores[user_id].items(), key=lambda item: item[1], reverse=True)}
        
        # Save predicted products
        for user_id in product_scores:
            count_predict = 0
            self.predicted_products[user_id] = {}
            
            for product_id in product_scores[user_id]:
                self.predicted_products[user_id][product_id] = product_scores[user_id][product_id]

                if update_filter:
                    _filter[user_id].add(product_id)

                # Check the number of stored products
                count_predict += 1
                if count_predict >= number_predict:
                    break
        
    def predict(self, user_id, printout = False):
        """
        Predict values

        :param str user_id: Id of an user
        :param printout: Print products that the user has not yet purchased
        :type printout: Boolean
        :return: Predicted product ids for specified user
        """
        if user_id not in _user_last_order:
            return None
        
        if printout:
            print("User id:", user_id)
            print("Products of interest")
            print("-"*20)
            
            for product_id, interest in self.predicted_products[user_id].items():
                print(get_product_name(product_id), "-", interest)
        
        return set(self.predicted_products[user_id].keys())
    
    def evaluate(self, printout = False):
        """
        Evaluation of ItemBased method
        
        :param printout: Print precision and recall of ItemBased method
        :type printout: Boolean
        :return: precision and recall of ItemBased method
        """
        precision_all = []
        recall_all = []
        for user_id in _user_last_order:
            intersect = _user_last_order[user_id].intersection(self.predicted_products[user_id])
            precision_all.append(len(intersect) / len(self.predicted_products[user_id]))
            recall_all.append(len(intersect) / len(_user_last_order[user_id]))
            
        precision = sum(precision_all) / len(precision_all)
        recall = sum(recall_all) / len(recall_all)
        if printout:
            print("Precision:", precision)
            print("Recall:", recall)
            print("F-score:", 2 * precision * recall / (precision + recall))
        
        return (precision, recall)

## Dvonivojska napoved
Metoda za priporočanje najbolj prodajanih izdelkov iz kategorij, v katerih posamezni uporabnik največ kupuje (Izbor po kategorijah).

In [11]:
class CategorySelection:
    def __init__(self):
        """
        Initialization of a method to predict products in the categories that the user buys the most
        """
        self.predicted_products = {}
        self.categories_count = 0
    
    def fit(self, number_predict, number_categories, allow_filter = False, update_filter = False):
        """
        Trains the model

        :param int number_predict: Number of predicted products
        :param int number_categories: Number of predicted categories
        :param allow_filter: Allow the filter to interfere with the model
        :param update_filter: Update the filter when model is trained
        :type allow_filter: Boolean
        :type update_filter: Boolean
        """
        # Check the number of predictions and categories
        if number_predict <= 0:
            print("The number of predictions must be greater than zero.")
        if number_categories <= 0:
            print("The number of categories must be greater than zero.")
        self.categories_count = number_categories
        category_scores = {}
        product_scores = {}
        
        # For each user, it adds the number of purchases for each category
        # and for each category, it adds the number of purchases for each product
        for order_product in _order_products_prior:            
            product_id = order_product.product_id
            user_id = _order_user[order_product.order_id]
            category_id = _product_category[product_id]["department"] # Alternative: ["aisle"]
            if user_id not in category_scores:
                category_scores[user_id] = {}
            if category_id not in product_scores:
                product_scores[category_id] = {}
            
            category_scores[user_id][category_id] = category_scores[user_id].get(category_id, 0) + 1
            product_scores[category_id][product_id] = product_scores[category_id].get(product_id, 0) + 1
        
        # Add the missing products to the category dictionary
        for product in _products:
            product_id = product.product_id
            category_id = product.department_id # Alternative: aisle_id
            if category_id not in product_scores:
                product_scores[category_id] = {}
            if product_id not in product_scores[category_id]:
                product_scores[category_id][product_id] = 0
        
        # For each user, it sorts the dictionary by values from most purchases to least purchases in a category
        for user_id in category_scores:
            category_scores[user_id] = {k: v for k, v in sorted(category_scores[user_id].items(), key=lambda item: item[1], reverse=True)}
        
        # Sort the dictionary by value from the most products in a category to the fewest products in a category
        product_scores = {k: v for k, v in sorted(product_scores.items(), key=lambda item: len(item[1]), reverse=True)}
        # For each category, it sorts the dictionary by values from most purchases to least purchases of a product
        for category_id in product_scores:
            product_scores[category_id] = {k: v for k, v in sorted(product_scores[category_id].items(), key=lambda item: item[1], reverse=True)}
        
        # Save predicted products
        for user_id in category_scores:
            count_categories = 0
            self.predicted_products[user_id] = {}
            # INFO: Other categories are added at the end of the user's categories (in order from most products
            # to least products) so that there can be enough predicted categories for the user
            for category_id in (list(category_scores[user_id]) + list(product_scores.keys() - category_scores[user_id].keys())):
                available_products = set(product_scores[category_id].keys())
                
                if allow_filter:
                    available_products = available_products.difference(_filter[user_id])
                
                # Check if the category has enough products to predict
                if len(available_products) < number_predict:
                    continue
                
                count_predict = 0
                self.predicted_products[user_id][category_id] = {}

                for product_id in product_scores[category_id]:
                    if product_id not in available_products:
                        continue
                    
                    self.predicted_products[user_id][category_id][product_id] = product_scores[category_id][product_id]

                    if update_filter:
                        _filter[user_id].add(product_id)

                    # Check the number of stored products
                    count_predict += 1
                    if count_predict >= number_predict:
                        break

                # Check the number of stored categories
                count_categories += 1
                if count_categories >= number_categories:
                    break
    
    def predict(self, user_id, printout = False):
        """
        Predict values

        :param str user_id: Id of an user
        :param printout: Print products from the categories in which the user buys the most
        :type printout: Boolean
        :return: Predicted product ids from specific categories for a given user
        """
        if user_id not in _user_last_order:
            return None
        
        if printout:
            print("User id:", user_id)
            print("Best selling products in categories")
            
            for category_id in self.predicted_products[user_id]:
                category_name = get_department_name(category_id) # Alternative: get_aisle_name()
                print("-"*35)
                print("Category:", category_name) 
                print("-"*(10 + len(category_name)))
                for product_id, product_purchases in self.predicted_products[user_id][category_id].items():
                    print(get_product_name(product_id), "-", product_purchases)
        
        return {category_id: set(self.predicted_products[user_id][category_id].keys()) for category_id in self.predicted_products[user_id]}
    
    def evaluate(self, printout = False):
        """
        Evaluation of CategorySelection method
        
        :param printout: Print precision and recall of CategorySelection method
        :type printout: Boolean
        :return: precision and recall of CategorySelection method
        """
        evaluations = {}
        for category_count in range(self.categories_count):
            precision_all = []
            recall_all = []
            for user_id in _user_last_order:
                category_id = list(self.predicted_products[user_id])[category_count]
                intersect = _user_last_order[user_id].intersection(self.predicted_products[user_id][category_id])
                precision_all.append(len(intersect) / len(self.predicted_products[user_id][category_id]))
                recall_all.append(len(intersect) / len(_user_last_order[user_id]))

            precision = sum(precision_all) / len(precision_all)
            recall = sum(recall_all) / len(recall_all)
            if printout:
                print(f"{category_count+1}. category:")
                print("Precision:", precision)
                print("Recall:", recall)
                print("F-score:", 2 * precision * recall / (precision + recall))
            
            evaluations[category_count+1] = (precision, recall)
        
        return evaluations

## Evalvacija in rezultati
Izpis rezultatov funkcij in prikaz natančnosti in priklicov uporabljenih metod.

Inicializacija in deklaracija pomožnih spremenljivk ter izpis količine podatkov.

In [12]:
_filter = create_filter()
_order_user = create_order_user()
_user_last_order = create_user_last_order()
_product_category = create_product_category()

print(f'''List lengths
{"-"*12}
Order products Prior: {len(_order_products_prior)}
Order products Last: {len(_order_products_last)}
Orders: {len(_orders)}
Products: {len(_products)}
Aisles: {len(_aisles)}
Departments: {len(_departments)}\n''')

print(f'''Lengths
{"-"*7}
Filter: {len(_filter)}
Order user: {len(_order_user)}
Users last order: {len(_user_last_order)}
Products with categories: {len(_product_category)}''')

List lengths
------------
Order products Prior: 148395
Order products Last: 19055
Orders: 21204
Products: 10000
Aisles: 134
Departments: 21

Lengths
-------
Filter: 2000
Order user: 21204
Users last order: 2000
Products with categories: 10000


Evalvacija metod.

In [13]:
def evaluateMethods(predictions_number, categories_number, use_filter, test_filter_methods):
    """
    Evaluation of methods

    :param int predictions_number: Number of predicted products
    :param int categories_number: Number of predicted categories
    :param boolean use_filter: Use product filter on methods
    :param boolean test_filter_methods: Test the filter and methods
    """
    # Check the number of predictions and categories
    if predictions_number <= 0:
        print("The number of predictions must be greater than zero.")
    if categories_number <= 0:
        print("The number of categories must be greater than zero.")
    precision_sum = 0
    recall_sum = 0
    
    print("Best selling all:")
    best_selling_all = BestSellingAll()
    best_selling_all.fit(predictions_number, use_filter, use_filter)
    evaluation_temp = best_selling_all.evaluate(True)
    precision_sum += evaluation_temp[0]
    recall_sum += evaluation_temp[1]
    
    print("\nBest selling:")
    best_selling = BestSelling()
    best_selling.fit(predictions_number, use_filter, use_filter)
    evaluation_temp = best_selling.evaluate(True)
    precision_sum += evaluation_temp[0]
    recall_sum += evaluation_temp[1]

    print("\nGradient boosting:")
    gradient_boosting = GradientBoosting()
    gradient_boosting.fit(predictions_number, use_filter, use_filter)
    evaluation_temp = gradient_boosting.evaluate(True)
    precision_sum += evaluation_temp[0]
    recall_sum += evaluation_temp[1]

    print("\nItem based:")
    item_based = ItemBased()
    item_based.fit(predictions_number, use_filter, use_filter)
    evaluation_temp = item_based.evaluate(True)
    precision_sum += evaluation_temp[0]
    recall_sum += evaluation_temp[1]
    
    print("\nCategory selection:")
    category_selection = CategorySelection()
    category_selection.fit(predictions_number, categories_number, use_filter, use_filter)
    evaluation_temp = category_selection.evaluate(True)
    for category_eavluation in evaluation_temp:
        precision_sum += evaluation_temp[category_eavluation][0]
        recall_sum += evaluation_temp[category_eavluation][1]
    
    print("\nAverage evaluation:")
    precision_avg = precision_sum / (4 + categories_number)
    recall_avg = recall_sum / (4 + categories_number)
    print("Precision:", precision_avg)
    print("Recall:", recall_avg)
    print("F-score:", 2 * precision_avg * recall_avg / (precision_avg + recall_avg))
    
    print("\nCombined predictions:")
    precision_all = []
    recall_all = []
    for user_id in gradient_boosting.get_test_users():
        predictions = set()
        
        predictions.update(best_selling.predict(user_id))
        predictions.update(best_selling_all.predict(user_id))
        predictions.update(item_based.predict(user_id))
        predictions.update(gradient_boosting.predict(user_id))
        for category_id, predicted_products in category_selection.predict(user_id).items():
            predictions.update(predicted_products)
        
        intersect = _user_last_order[user_id].intersection(predictions)
        precision_all.append(len(intersect) / len(predictions))
        recall_all.append(len(intersect) / len(_user_last_order[user_id]))
    
    precision_all_avg = sum(precision_all) / len(precision_all)
    recall_all_avg = sum(recall_all) / len(recall_all)
    print("Precision:", precision_all_avg)
    print("Recall:", recall_all_avg)
    print("F-score:", 2 * precision_all_avg * recall_all_avg / (precision_all_avg + recall_all_avg))
    
    # Test filter and methods
    if use_filter and test_filter_methods:
        best_selling_all_count = 0
        best_selling_all_users = set()
        
        best_selling_count = 0
        best_selling_users = set()
        
        item_based_count = 0
        item_based_users = set()
        
        gradient_boosting_count = 0
        gradient_boosting_test_users_count = 0
        gradient_boosting_users = set()
        
        category_keys_count = 0
        category_value_count = 0
        category_users = set()
        
        filter_products = {}
        filter_count_test = 0
        filter_count_actual = 0
        
        for user_id in _user_last_order:
            filter_products[user_id] = set()
            
            temp = best_selling_all.predict(user_id)
            best_selling_all_count = len(temp)
            if len(temp) < predictions_number:
                best_selling_all_users.add(user_id)
            filter_products[user_id].update(temp)
            
            temp = best_selling.predict(user_id)
            best_selling_count += len(temp)
            if len(temp) < predictions_number:
                best_selling_users.add(user_id)
            filter_products[user_id].update(temp)
            
            temp = item_based.predict(user_id)
            item_based_count += len(temp)
            if len(temp) < predictions_number:
                item_based_users.add(user_id)
            filter_products[user_id].update(temp)
            
            temp = gradient_boosting.predict(user_id)
            if temp:
                gradient_boosting_test_users_count += 1
            
            temp_dict = category_selection.predict(user_id)
            category_keys_count += len(temp_dict)
            for category_id, category_products in temp_dict.items():
                category_value_count += len(category_products)
                if len(category_products) < predictions_number:
                    category_users.add(user_id)
                filter_products[user_id].update(category_products)
        
        for user_id in gradient_boosting.get_test_users():
            temp = gradient_boosting.predict(user_id)
            gradient_boosting_count += len(temp)
            if len(temp) < predictions_number:
                gradient_boosting_users.add(user_id)
            filter_products[user_id].update(temp)
        
        for user_id in filter_products:
            if user_id in gradient_boosting.get_test_users():
                filter_count_test += len(filter_products[user_id])
        
        for user_id in _filter:
            if user_id in gradient_boosting.get_test_users():
                filter_count_actual += len(_filter[user_id])
        
        print("\nTest")
        print("-"*4)
                
        print("best_selling_all missing users:", len(best_selling_all_users))
        print("best_selling_all products:", best_selling_all_count)
        
        print("\nbest_selling missing users:", len(best_selling_users))
        print("best_selling products:", best_selling_count / len(_user_last_order))
        
        print("\nitem_based missing users:", len(item_based_users))
        print("item_based products:", item_based_count / len(_user_last_order))
        
        print("\ngradient_boosting missing users:", len(gradient_boosting_users))
        print("gradient_boosting test users:", gradient_boosting_test_users_count)
        print("gradient_boosting actual users:", len(gradient_boosting.get_test_users()))
        print("gradient_boosting products:", gradient_boosting_count / len(gradient_boosting.get_test_users()))
        
        print("\ncategory_selection missing users:", len(category_users))
        print("category_selection categories:", category_keys_count / len(_user_last_order))
        print("category_selection products:", category_value_count / category_keys_count)
        
        print("\nFilter test:", filter_count_test / len(gradient_boosting.get_test_users()))
        print("Filter actual:", filter_count_actual / len(gradient_boosting.get_test_users()))

Rezultati metod.

In [14]:
evaluateMethods(5, 3, True, True)

Best selling all:
Precision: 0.12829999999999878
Recall: 0.06949710733050268
F-score: 0.09015782880590627

Best selling:
Precision: 0.35889999999999983
Recall: 0.2344378522892806
F-score: 0.283614958533273

Gradient boosting:
Precision: 0.25816326530612266
Recall: 0.15418364755001615
F-score: 0.1930634262912874

Item based:
Precision: 0.0002
Recall: 5.555555555555555e-05
F-score: 8.695652173913041e-05

Category selection:
1. category:
Precision: 0.04150000000000026
Recall: 0.02021706059154047
F-score: 0.027188852045359195
2. category:
Precision: 0.018499999999999996
Recall: 0.008775518882803228
F-score: 0.011904235444937175
3. category:
Precision: 0.015699999999999967
Recall: 0.007573437469926066
F-score: 0.010217912023652333

Average evaluation:
Precision: 0.11732332361516021
Recall: 0.0706771685242321
F-score: 0.08821338945031659

Combined predictions:
Precision: 0.1157434402332359
Recall: 0.4750129414158832
F-score: 0.18613301084045886

Test
----
best_selling_all missing users: 0
be

In [15]:
_filter = create_filter()
evaluateMethods(10, 3, True, True)

Best selling all:
Precision: 0.09779999999999862
Recall: 0.10665138283728531
F-score: 0.10203408846383445

Best selling:
Precision: 0.2655500000000006
Recall: 0.32336582179102935
F-score: 0.291619925290708

Gradient boosting:
Precision: 0.1433497536945814
Recall: 0.13703454988616248
F-score: 0.1401210319048567

Item based:
Precision: 0.0002
Recall: 0.00014052287581699347
F-score: 0.0001650671785028791

Category selection:
1. category:
Precision: 0.029150000000000214
Recall: 0.026653231527789728
F-score: 0.027845760102554643
2. category:
Precision: 0.010800000000000011
Recall: 0.009736525732216419
F-score: 0.010240727110231464
3. category:
Precision: 0.00915
Recall: 0.008352637566928016
F-score: 0.008733156182334797

Average evaluation:
Precision: 0.07942853624208299
Recall: 0.08741923888817547
F-score: 0.08323254150514006

Combined predictions:
Precision: 0.08078817733990132
Recall: 0.6102815153523815
F-score: 0.14268758074883664

Test
----
best_selling_all missing users: 0
best_sellin

In [16]:
_filter = create_filter()
evaluateMethods(15, 3, True, True)

Best selling all:
Precision: 0.08136666666666617
Recall: 0.13100180287125576
F-score: 0.10038382863661786

Best selling:
Precision: 0.21469999999999648
Recall: 0.3855648389675391
F-score: 0.27581415919251057

Gradient boosting:
Precision: 0.07119155354449468
Recall: 0.09732287332562539
F-score: 0.08223113802363544

Item based:
Precision: 0.00016666666666666666
Recall: 0.00022385620915032678
F-score: 0.00019107391910739188

Category selection:
1. category:
Precision: 0.020200000000000086
Recall: 0.02630665760920758
F-score: 0.022852404839378816
2. category:
Precision: 0.00889999999999998
Recall: 0.012243264984808783
F-score: 0.010307306694882582
3. category:
Precision: 0.006633333333333323
Recall: 0.009262672336162474
F-score: 0.007730544948297048

Average evaluation:
Precision: 0.05759403145873677
Recall: 0.09456085232910706
F-score: 0.07158680113615472

Combined predictions:
Precision: 0.0544710191769014
Recall: 0.6661629201814433
F-score: 0.10070736671784773

Test
----
best_selling_a

In [17]:
_filter = create_filter()
evaluateMethods(20, 3, True, True)

Best selling all:
Precision: 0.072675
Recall: 0.15293687660582087
F-score: 0.09852927669004875

Best selling:
Precision: 0.1814750000000007
Recall: 0.4294141251010482
F-score: 0.2551295321874415

Gradient boosting:
Precision: 0.03333333333333331
Recall: 0.06234464081746968
F-score: 0.04344060819348386

Item based:
Precision: 0.00015
Recall: 0.00025326797385620916
F-score: 0.00018841166936790925

Category selection:
1. category:
Precision: 0.015400000000000101
Recall: 0.02793678583950549
F-score: 0.01985502586747885
2. category:
Precision: 0.0070500000000000215
Recall: 0.012402475756590542
F-score: 0.008989853546465877
3. category:
Precision: 0.005275000000000006
Recall: 0.009992468599966624
F-score: 0.006904913086238698

Average evaluation:
Precision: 0.04505119047619059
Recall: 0.09932580581346538
F-score: 0.06198696346232472

Combined predictions:
Precision: 0.043997317236753854
Recall: 0.6801111086412132
F-score: 0.08264802102488092

Test
----
best_selling_all missing users: 0
best_