In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

In [2]:
%%time

# Load the three input tables used throughout the notebook from parquet files
# in the current working directory.
transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 2.88 s, sys: 1.83 s, total: 4.71 s
Wall time: 1.65 s


In [3]:
# Bucket customers by age. Missing ages are filled with 22, which lands in the
# '16-24' bucket; bins are right-inclusive, so ages outside (15, 100] get NaN.
customers['age_bucket'] = pd.cut(
    x=customers['age'].fillna(22),
    bins=[15, 24, 35, 50, 100],
    labels=['16-24', '25-35', '36-50', '51-100'],
)

In [4]:
VALID = True

if VALID:
    # Hold out every transaction dated 2020-09-16 or later as the validation
    # set and keep only earlier rows for training.
    # Note: when VALID is False, `valid_df` is not created by this cell.
    valid_df = transactions.loc[transactions['t_dat'] >= '2020-09-16']
    transactions = transactions.loc[transactions['t_dat'] < '2020-09-16']

In [5]:
# The week to predict is the one right after the latest week in the data.
test_week = transactions['week'].max() + 1
# `test_week - 1` is the latest observed week, so `test_week - 11` equals
# "latest week minus 10": only the ten most recent week numbers are kept.
transactions = transactions.loc[transactions['week'] > test_week - 11]

In [6]:
# One row per distinct (customer, week, article) combination.
weekly_purchases = (
    transactions
    .loc[:, ['customer_id', 'week', 'article_id']]
    .drop_duplicates()
)

In [7]:
# Number of transaction rows per (week, customer, article).
purchase_count = (
    transactions[['customer_id', 'week', 'article_id']]
    .groupby(['week', 'customer_id'])['article_id']
    .value_counts()
    .reset_index(name='ArticleCount')
)

# Attach that count to the de-duplicated weekly purchases.
weekly_purchases = pd.merge(
    weekly_purchases,
    purchase_count,
    on=['week', 'customer_id', 'article_id'],
)

In [8]:
# Add 'ArticleCountGlobal': the number of `weekly_purchases` rows for each
# (week, article) pair, joined back onto every matching row.
weekly_purchases = pd.merge(
    weekly_purchases,
    (
        weekly_purchases[['week', 'article_id']]
        .groupby(['week'])['article_id']
        .value_counts()
        .reset_index(name='ArticleCountGlobal')
    ),
    on=['week', 'article_id'],
)

In [9]:
# Shift every purchase forward by one week so that rows labelled week W
# describe what was bought in week W - 1; the per-customer count is dropped.
last_week_purchases = (
    weekly_purchases
    .drop(columns='ArticleCount')
    .assign(week=lambda frame: frame['week'] + 1)
)

In [10]:
# Article ids of the 100 rows with the highest 'ArticleCountGlobal' among the
# purchases labelled week 104 (ascending sort, so the tail holds the largest).
k = (
    last_week_purchases
    .loc[last_week_purchases['week'] == 104, ['article_id', 'ArticleCountGlobal']]
    .drop_duplicates()
    .sort_values(['ArticleCountGlobal'])
    .tail(100)['article_id']
    .tolist()
)

In [11]:
def score_and_best(actual, predict, Lookback):
    """Print the recall and candidate-to-hit ratio of `predict` against `actual`.

    A hit is a (customer_id, article_id) pair that appears in both frames.
    Hits are counted with an inner join, so duplicated pairs on either side
    are counted once per matching row combination.

    Parameters
    ----------
    actual : pandas.DataFrame
        Ground-truth purchases; must contain `customer_id` and `article_id`.
    predict : pandas.DataFrame
        Candidate recommendations; must contain `customer_id` and `article_id`.
    Lookback : object
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    act_tot = len(actual)
    pre_tot = len(predict)
    correct = len(actual.merge(predict, on=['customer_id', 'article_id'], how='inner'))

    # Guard both divisions: an empty `actual` or a prediction set with no hits
    # used to raise ZeroDivisionError instead of reporting the result.
    recall_pct = correct / act_tot * 100 if act_tot else 0.0
    multiple_factor = pre_tot // correct if correct else 'n/a'

    print(f"[+] Recall = {recall_pct:.1f}% ({correct}/{act_tot})")
    print(f"[+] Multiple Factor = {multiple_factor} ({pre_tot}/{correct})")

In [12]:
# Score the week-104 candidate rows restricted to the article ids in `k`
# against the held-out transactions.
score_and_best(
    valid_df,
    last_week_purchases[
        last_week_purchases['article_id'].isin(k) & (last_week_purchases['week'] == 104)
    ],
    'Lookback',
)

[+] Recall = 0.3% (673/240311)
[+] Multiple Factor = 44 (30187/673)


In [21]:
# Raw article metadata from the CSV file, kept separate from the parquet
# `articles` table loaded earlier.
art_orig = pd.read_csv(filepath_or_buffer='articles.csv')

In [23]:
# Distinct product type names present in the raw article metadata.
art_orig['product_type_name'].unique()

array(['Vest top', 'Bra', 'Underwear Tights', 'Socks', 'Leggings/Tights',
       'Sweater', 'Top', 'Trousers', 'Hair clip', 'Umbrella',
       'Pyjama jumpsuit/playsuit', 'Bodysuit', 'Hair string', 'Unknown',
       'Hoodie', 'Sleep Bag', 'Hair/alice band', 'Belt', 'Boots',
       'Bikini top', 'Swimwear bottom', 'Underwear bottom', 'Swimsuit',
       'Skirt', 'T-shirt', 'Dress', 'Hat/beanie', 'Kids Underwear top',
       'Shorts', 'Shirt', 'Cap/peaked', 'Pyjama set', 'Sneakers',
       'Sunglasses', 'Cardigan', 'Gloves', 'Earring', 'Bag', 'Blazer',
       'Other shoe', 'Jumpsuit/Playsuit', 'Sandals', 'Jacket', 'Costumes',
       'Robe', 'Scarf', 'Coat', 'Other accessories', 'Polo shirt',
       'Slippers', 'Night gown', 'Alice band', 'Straw hat', 'Hat/brim',
       'Tailored Waistcoat', 'Necklace', 'Ballerinas', 'Tie',
       'Pyjama bottom', 'Felt hat', 'Bracelet', 'Blouse',
       'Outdoor overall', 'Watch', 'Underwear body', 'Beanie', 'Giftbox',
       'Sleeping sack', 'Dungarees',

In [26]:
# Load the article table stored in 'articles_new.parquet'.
art_new = pd.read_parquet(path='articles_new.parquet')

In [31]:
# Show the raw metadata row(s) for article 456163086.
art_orig.loc[art_orig['article_id'] == 456163086]

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
3520,456163086,456163,Woody(1),308,Hoodie,Garment Upper body,1010016,Solid,93,Dark Green,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Wide top in sweatshirt fabric with a lined dra...


In [32]:
# Show the raw metadata row(s) for article 295873004.
art_orig.loc[art_orig['article_id'] == 295873004]

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
830,295873004,295873,Sune leather glove,71,Gloves,Accessories,1010016,Solid,23,Dark Yellow,...,Accessories,F,Menswear,3,Menswear,25,Men Accessories,1019,Accessories,Leather gloves with decorative stitching on th...


In [30]:
# Show every article whose `clusters` value is 2.
art_new.loc[art_new['clusters'] == 2]

Unnamed: 0,article_id,prod_name_cat_cat_code,product_code,department_no,colour_group_name_cat_cat_code,ct,clusters
0,456163086.0,45144,456163.0,1676.0,6,1287.0,2
1,457039006.0,5002,457039.0,1919.0,24,3.0,2
2,456641003.0,9091,456641.0,1338.0,1,1.0,2
3,457292020.0,8695,457292.0,1344.0,5,377.0,2
4,456163087.0,45144,456163.0,1676.0,0,522.0,2
...,...,...,...,...,...,...,...
100705,295873004.0,40932,295873.0,9985.0,13,16.0,2
100706,293510007.0,258,293510.0,7931.0,5,109.0,2
100707,294008056.0,18379,294008.0,1919.0,16,190.0,2
100708,294008005.0,18379,294008.0,1919.0,47,2497.0,2


In [13]:
def _most_sold_article(sales):
    """Return the article_id with the most rows in `sales` (must be non-empty)."""
    return sales.groupby('week')['article_id'].value_counts().head(1).index.values[0][1]


def _colour_and_appearance(article_id):
    """Return (colour_group_name, graphical_appearance_name) of an article from `articles`."""
    row = articles[articles.article_id == article_id]
    return row['colour_group_name'].iloc[0], row['graphical_appearance_name'].iloc[0]


def find_alternate_items_all(base_itm, base_wk, base_prod):
    """Pick up to four alternates for an article among same-named products sold in one week.

    Reads the module-level frames `transactions` (columns `article_id`,
    `prod_name`, `week`, `price`, `colour_group_name`,
    `graphical_appearance_name`) and `articles` (columns `article_id`,
    `colour_group_name`, `graphical_appearance_name`).

    Only transactions with `prod_name == base_prod` and `week == base_wk` are
    considered. Each pick is the most-sold article passing its filter and is
    excluded from all later picks:

    * item_2 - most sold article other than `base_itm`.
    * item_3 - article whose price is below the lower of the mean prices of
      `base_itm` and item_2. If none exists: an article in a colour not yet
      used; failing that, one with a graphical appearance not yet used.
    * item_4 - only attempted when item_3 came from the price filter: an
      article in a colour not yet used; failing that, one with a graphical
      appearance not yet used.
    * item_5 - only attempted when item_4 came from the colour filter: an
      article with a graphical appearance not yet used.

    Parameters
    ----------
    base_itm : article id of the reference article (must exist in `articles`).
    base_wk : week number to look at.
    base_prod : product name shared by the reference article and its alternates.

    Returns
    -------
    tuple
        `(item_2, item_3, item_4, item_5)`; a slot that could not be filled
        holds the empty string `''`.
    """
    item_2 = item_3 = item_4 = item_5 = ''

    # Every step works on the same product/week slice, so filter the full
    # transactions frame once instead of re-scanning it for each step.
    week_sales = transactions[(transactions.prod_name == base_prod) &
                              (transactions.week == base_wk)]

    color_1, g_1 = _colour_and_appearance(base_itm)
    p1 = week_sales[week_sales.article_id == base_itm]['price'].mean()

    others = week_sales[week_sales.article_id != base_itm]
    if len(others) == 0:
        return item_2, item_3, item_4, item_5

    item_2 = _most_sold_article(others)
    color_2, g_2 = _colour_and_appearance(item_2)
    p2 = week_sales[week_sales.article_id == item_2]['price'].mean()

    # p1 is NaN when the base article has no sales in this week. min(NaN, p2)
    # evaluates to NaN, which made every `price < p_low` test False and
    # silently skipped the lower-price step; compare against p2 alone instead.
    p_low = p2 if pd.isna(p1) else min(p1, p2)

    cheaper = week_sales[~week_sales.article_id.isin([base_itm, item_2]) &
                         (week_sales.price < p_low)]

    if len(cheaper) > 0:
        item_3 = _most_sold_article(cheaper)
        color_3, g_3 = _colour_and_appearance(item_3)

        new_colour = week_sales[~week_sales.article_id.isin([base_itm, item_2, item_3]) &
                                ~week_sales.colour_group_name.isin([color_1, color_2, color_3])]

        if len(new_colour) > 0:
            item_4 = _most_sold_article(new_colour)
            _, g_4 = _colour_and_appearance(item_4)

            new_look = week_sales[
                ~week_sales.article_id.isin([base_itm, item_2, item_3, item_4]) &
                ~week_sales.graphical_appearance_name.isin([g_1, g_2, g_3, g_4])]

            if len(new_look) > 0:
                item_5 = _most_sold_article(new_look)
        else:
            # No unused colour: fill the fourth slot by appearance instead.
            new_look = week_sales[
                ~week_sales.article_id.isin([base_itm, item_2, item_3]) &
                ~week_sales.graphical_appearance_name.isin([g_1, g_2, g_3])]

            if len(new_look) > 0:
                item_4 = _most_sold_article(new_look)
    else:
        # Nothing cheaper: fill the third slot by colour, then by appearance.
        new_colour = week_sales[~week_sales.article_id.isin([base_itm, item_2]) &
                                ~week_sales.colour_group_name.isin([color_1, color_2])]

        if len(new_colour) > 0:
            item_3 = _most_sold_article(new_colour)
        else:
            new_look = week_sales[
                ~week_sales.article_id.isin([base_itm, item_2]) &
                ~week_sales.graphical_appearance_name.isin([g_1, g_2])]

            if len(new_look) > 0:
                item_3 = _most_sold_article(new_look)

    return item_2, item_3, item_4, item_5


In [14]:
# Reload the full transaction log and attach each article's product name,
# colour group and graphical appearance (inner join on article_id).
# NOTE(review): this reload discards the date filter and the last-10-weeks
# filter applied to `transactions` earlier in the notebook, so rows from the
# validation period are present again. Confirm this is intended.
transactions = pd.merge(
    pd.read_parquet('transactions_train.parquet'),
    articles[['article_id', 'prod_name', 'colour_group_name', 'graphical_appearance_name']],
    on='article_id',
)

In [15]:
# One row per best-selling article id in `k`, tagged with week 104 and joined
# with its product name.
best_sell_df = (
    pd.DataFrame({'article_id': k})
    .assign(week=104)
    .merge(articles[['article_id', 'prod_name']], on='article_id')
)

In [16]:
# Empty frame with the best-seller columns; its columns are filled in a later cell.
alternate_best_sell_df_all = pd.DataFrame(columns=best_sell_df.columns)

wk_list, art_list, bs_list, rt_list = [], [], [], []

# For every best seller, look up its four alternate slots and record them
# together with the week and a label per slot.
# NOTE(review): the labels are assigned by position; find_alternate_items_all
# can fill a slot through a fallback criterion, so a label may not describe
# how that article was actually chosen.
for i in range(len(best_sell_df)):

    bs_itm = best_sell_df.loc[i, 'article_id']
    bs_wk = best_sell_df.loc[i, 'week']
    bs_prod = best_sell_df.loc[i, 'prod_name']

    a, b, c, d = find_alternate_items_all(bs_itm, bs_wk, bs_prod)

    wk_list += [bs_wk] * 4
    art_list += [a, b, c, d]
    rt_list += ['SecondMost', 'LowerPrice', 'AnotherColor', 'AnotherAppearance']

In [17]:
# Fill the (initially empty) alternates frame column by column; the first
# assignment establishes its row index.
alternate_best_sell_df_all['week'] = wk_list
alternate_best_sell_df_all['article_id'] = art_list
alternate_best_sell_df_all['RankType'] = rt_list

# The lists are no longer needed once copied into the frame.
del wk_list, art_list, rt_list

# Stack the original best sellers on top of their alternates.
best_sell_df = pd.concat(
    [
        best_sell_df.drop(columns='prod_name'),
        alternate_best_sell_df_all.drop(columns='prod_name'),
    ],
    axis=0,
)

# '' marks an alternate slot that could not be filled; drop those rows.
best_sell_df = best_sell_df[best_sell_df['article_id'] != '']

In [18]:
# Best sellers plus their alternates.
k1 = best_sell_df['article_id'].tolist()

# Score the week-104 candidate rows restricted to the extended article list.
score_and_best(
    valid_df,
    last_week_purchases[
        last_week_purchases['article_id'].isin(k1) & (last_week_purchases['week'] == 104)
    ],
    'Lookback',
)

[+] Recall = 0.4% (850/240311)
[+] Multiple Factor = 46 (39767/850)
