In [73]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [71]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [3]:
articles = pd.read_csv('../data/articles.csv')

customers = pd.read_csv('../data/customers.csv')

transactions_train = pd.read_csv('../data/transactions_train.csv')
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])

In [12]:
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [13]:
customers.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


In [57]:
train = transactions_train.query('t_dat < "2020-09-16"')

test = transactions_train.query('t_dat > "2020-09-15"')\
    .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()

In [85]:
def time_series_split(transactions_train=transactions_train):
    # пытаюсь сделать кросс валидацию по временному таргету с учетом сезонности
    train = None
    test = None
    for part in range(6):
        # part1
        if part == 0:
            train = transactions_train.query('t_dat < "2020-09-16"')
            test = transactions_train.query('t_dat >= "2020-09-16"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        # part2
        elif part == 1:
            train = transactions_train.query('t_dat < "2020-09-09"')
            test = transactions_train.query('t_dat >= "2020-09-09"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        # part3
        elif part == 2:
            train = transactions_train.query('t_dat < "2020-09-02"')
            test = transactions_train.query('t_dat >= "2020-09-02"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        # part4
        elif part == 3:
            train = transactions_train.query('t_dat < "2019-09-16"')
            test = transactions_train.query('t_dat >= "2019-09-16"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        # part5
        elif part == 4:
            train = transactions_train.query('t_dat < "2019-09-09"')
            test = transactions_train.query('t_dat >= "2019-09-09"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        # part6
        elif part == 5:
            train = transactions_train.query('t_dat < "2019-09-02"')
            test = transactions_train.query('t_dat >= "2019-09-02"')\
                .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()
        yield train, test
for train, test in time_series_split:
    print(train, test)

TypeError: 'function' object is not iterable

In [None]:
train = transactions_train.query('t_dat < "2019-09-02"')
test = transactions_train.query('t_dat >= "2019-09-02"')\
    .groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()

In [91]:
arr = train.groupby('article_id').size().sort_values(ascending=False)[:12].index.values
arr

array([706016001, 706016002, 372860001, 610776002, 759871002, 464297007,
       372860002, 610776001, 399223001, 720125001, 706016003, 156231001])

In [92]:
arr = train.groupby('article_id').size().sort_values(ascending=False)[:12].index.values
pred = list(arr)

test['predict'] = test.apply(lambda _: pred, axis=1)
test

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[706016001, 706016002, 372860001, 610776002, 7..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[706016001, 706016002, 372860001, 610776002, 7..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[706016001, 706016002, 372860001, 610776002, 7..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],"[706016001, 706016002, 372860001, 610776002, 7..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[706016001, 706016002, 372860001, 610776002, 7..."
...,...,...,...
68979,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,"[874816003, 911870004]","[706016001, 706016002, 372860001, 610776002, 7..."
68980,fffa7d7799eb390a76308454cbdd76e473d65b1497fbe4...,"[861803014, 849886010]","[706016001, 706016002, 372860001, 610776002, 7..."
68981,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[396135007, 817472007, 715624050, 817472003, 8...","[706016001, 706016002, 372860001, 610776002, 7..."
68982,fffd870c6324ad3bda24e4d6aeae221c199479086bfdfd...,"[750423010, 761269001]","[706016001, 706016002, 372860001, 610776002, 7..."


In [74]:
mapk(test['target'].to_list(), test['predict'].to_list())

0.002649039658359146