In [1]:




# %load_ext autoreload
# %autoreload 2

import pandas as pd

hist_data = pd.read_csv('../datasets/hist_data.csv.gz')
test_data = pd.read_csv('../datasets/test.csv.gz')


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


# метрики оцениваются для вектора релевантности. пример:
# реальные item_id, которые приобрел покупатель: [1 ,4, 5, 69]
# рекомендованные алгоритмом item_id: [4, 6, 7, 8, 1, 2, 67, 90]
# тогда вектор релеватности будет выглядеть следующим образом: [1, 0, 0, 0, 1, 0, 0, 0]
# и уже по не му будет расчитываться ndcg
def dcg(y_relevance):
    return np.sum([(2**i - 1) / np.log2(k + 1) for (k, i) in enumerate(y_relevance, start=1)])

def ndcg(y_relevance, k):
    if y_relevance.sum() == 0:
        return 0.0
    DCG = dcg(y_relevance[:k])
    IDCG = dcg(-np.sort(-y_relevance)[:k])
    return DCG / IDCG

def apply_relevance(x):
    return [int(item in x['basket']) for item in x['preds']]

def create_relevance(pred):
    d = pred.copy()
    d['basket'] = d['basket'].apply(set)
    d = d.apply(apply_relevance, axis=1)
    return d

def ndcg_full_dataset(d):
    dd = pd.DataFrame(d.to_list()).fillna(0).to_numpy()
    k = dd.shape[1]
    scores = [ndcg(dd[i], k) for i in range(len(dd))]
    return np.mean(scores)

def compute_ndcg_score(pred):
    relevance = create_relevance(pred)
    return ndcg_full_dataset(relevance)




def split_data(data, test_size=0.3):
    orders_sort = data[['pav_order_id', 'created']].drop_duplicates().sort_values(by=['created', 'pav_order_id'])
    train_orders, test_orders = train_test_split(orders_sort['pav_order_id'].tolist(), test_size=test_size, shuffle=False)
    train_orders, test_orders = set(train_orders), set(test_orders)
    train = data[data['pav_order_id'].apply(lambda x: x in train_orders)]
    test = data[data['pav_order_id'].apply(lambda x: x in test_orders)]
    return train, test, orders_sort, train_orders, test_orders


class CartModel(object):
    
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        
    def param(self, name, default=None):
        if name in self.kwargs: 
            return self.kwargs[name]
        else: 
            return default
    
    def train(self, X_train):
        raise RuntimeException("Not Implemented")
    
    def predict(self, X_test):
        raise RuntimeException("Not Implemented")
    
    
    def to_basket(self, X_test, addcols=['buyer_id']):
        basket = X_test.groupby([*addcols, 'pav_order_id'])['item_id'].agg([('basket', list)])
        return basket
    
    def quality(self, X_val):
        basket = self.predict(X_val)
        score = compute_ndcg_score(basket)
        return score

In [3]:

import pandas as pd
import numpy as np
from collections import Counter 

def softmax(x):
    w = np.exp(x)
    rz = w / np.sum(w)
    return rz

class LazyBayesModel(CartModel):

    def __init__(self, **kwargs):
        CartModel.__init__(self, **kwargs)
        self.top_n = self.param("top_n", 10)
        self.most_freq_dict = None
        self.eps = self.param("eps", 1e-5)
    
    def train(self, X_train, C0 = 50, C1 = 2):
        top_n = self.top_n
        pairs = X_train[['item_id', 'pav_order_id']]\
                .sort_values(['item_id', 'pav_order_id'])\
                .merge(X_train[['item_id', 'pav_order_id']], 
                       how='left', on=['pav_order_id'], suffixes=('', '_left'))
        
        q = pairs.item_id != pairs.item_id_left
        
        # совстречаемость товаров c(xy)
        pairs['cnt'] = 1.
        xy = pairs[q][['item_id', 'item_id_left', 'cnt']]\
                    .groupby(['item_id', 'item_id_left'])[["cnt"]]\
                    .count()\
                    .rename({'cnt':'xy'}, axis=1)\
                    .reset_index()
        
        q = xy['xy'] >= C0
        xy = xy.loc[q]
        
        # частотность товара в парах c(x)
        x = pairs.groupby(["item_id"])[["cnt"]]\
                    .count()\
                    .rename({'cnt':'x'}, axis=1)\
                    .reset_index()
        
        # если купили товар X, грубо оцениваем вероятность
        # C - shrinkage term
        # p(y|x) = с(xy) / с(x)
        ptrs1 = xy.merge(x, on="item_id")
        ptrs1["y|x"] = ptrs1["xy"] / (ptrs1["x"])
        
        # buyers = X_train[["buyer_id", "item_id", "count"]]#.drop_duplicates()
        # buyers = np.exp(-(buyers['count'].max() - buyers['count']) / (buyers['count'].mean()))
        
        buyers = X_train[["buyer_id", "item_id"]] #.drop_duplicates()
        buyers["count"] = 1
        u = buyers.groupby(["buyer_id"])[["count"]]\
                    .count()\
                    .rename({'count':'u'}, axis=1)\
                    .reset_index()
        
        xu = buyers.groupby(["item_id", "buyer_id"])[["count"]]\
                    .count()\
                    .rename({'count':'xu'}, axis=1)\
                    .reset_index()
        
        q = xu['xu'] >= C1
        ptrs2 = xu.merge(u, on="buyer_id")
        ptrs2["x|u"] = ptrs2["xu"] / (ptrs2["u"])
        
        self.ptrs1 = ptrs1[ ptrs1["y|x"] >= self.eps]
        self.ptrs2 = ptrs2[ ptrs2["x|u"] >= self.eps]
        self.ptrs2 = self.ptrs2.rename({"item_id":"item_id_left"}, axis=1)
    
    def predict(self, X_test):
        # add "recently_bought" feature
        preds = self.to_basket(X_test)
        X_test["cnt"] = 1
        # support = X_test.groupby(["item_id"])[["cnt"]]\
        #             .count()\
        #             .reset_index()\
        #             .rename({'cnt':'x1', 'item_id':'item_id_left'}, axis=1)
        
        basket_items = X_test[["buyer_id", "pav_order_id", "item_id"]]
        basket_recs = basket_items.merge(self.ptrs1, on=["item_id"])
        basket_recs1 = basket_recs.groupby(
            ["buyer_id", "pav_order_id", "item_id_left"], as_index=False)[["y|x"]].sum()
        
        user_recs = basket_items.merge(self.ptrs2, on=["buyer_id"])
        basket_recs2 = user_recs.groupby(
            ["buyer_id", "pav_order_id", "item_id_left"], as_index=False)[["x|u"]].sum()
        
        idx = preds.reset_index()
        idx = idx[["buyer_id", "pav_order_id"]]
        relevance = idx.merge(basket_recs1, on=["buyer_id", "pav_order_id"], how="left")\
                       .merge(basket_recs2, on=["buyer_id", "pav_order_id", "item_id_left"], how="left")\
                       .fillna(self.eps)
        
        
        relevance["score"] = relevance["y|x"] + relevance["x|u"]
        relevance = relevance.merge(preds, on=["buyer_id", "pav_order_id"])
        def select(df):
            q = ~df.item_id_left.isin(df.basket)
            df = df[q].sort_values(by="score", ascending=False).head(self.top_n)
            return [*df.item_id_left]
        
        predicted = relevance.groupby(["buyer_id", "pav_order_id"]).apply(select)
        preds["preds"] = predicted
        
        preds = preds.sort_values(by="pav_order_id")
        
        return preds


    



In [4]:

from models.core import split_data


# разобьем историю в отношении 70 на 30 для трейна и валидации
train_data_split, test_data_split, orders_sort, train_orders, test_orders = split_data(hist_data)




In [5]:
model = LazyBayesModel(top_n = 15)
model.train(train_data_split)
model.quality(test_data_split)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  buyers["count"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["cnt"] = 1


0.5011284959981711

### Prepare Submission File

In [6]:
hist_data = pd.read_csv('../datasets/hist_data.csv.gz')
test_data = pd.read_csv('../datasets/test.csv.gz')

In [7]:
model = LazyBayesModel(top_n = 20)

In [8]:
# model.train(hist_data.sample(1000))
model.train(hist_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  buyers["count"] = 1


In [9]:
test = model.predict(test_data)

In [10]:
test.reset_index()[['pav_order_id', 'preds']]\
    .sort_values(by='pav_order_id').to_csv('pred.csv', index=False)

In [None]:
model.quality(test_data)