# H&M Recommendation: Popularity Baseline

#### What we'll be doing here:
This is a bare minimum notebook. As we have already found out, popularity and repetition is king in this competition. We'll combine these two to create a good enough baseline.

In [1]:
import numpy as np
import pandas as pd
import os
import glob
#import reco
from tqdm import tqdm
import datetime

### Forming Train Set

In [2]:
data = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={'article_id':str})
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


We'll drop everything except the last few(up for experimentation) days. The info from previous months are not coming of much use. 
We'll keep 2 weeks as train and the last week as validation.

In [3]:
print("All Transactions Date Range: {} to {}".format(data['t_dat'].min(), data['t_dat'].max()))

data["t_dat"] = pd.to_datetime(data["t_dat"])
data = data.loc[data["t_dat"] >= datetime.datetime(2020,9,1)]

split_date = datetime.datetime(2020,9,15)
val = data.loc[data['t_dat'] > split_date, :]
train = data.loc[data['t_dat'] <= split_date, :]

All Transactions Date Range: 2018-09-20 to 2020-09-22


Items which an user has bought in our train set time.

In [4]:
# List of all purchases per user (has repetitions)
positive_items_per_user = train.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user

customer_id
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657                                         [0568601043]
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318                                         [0794321007]
0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a                             [0719530003, 0448509014]
000172a9c322560c849754ffbdfdb2180d408aa7176b943f957804686be8e1f0                 [0685814001, 0685814001, 0685814001]
0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37e011580a479e80aa94    [0777148006, 0835801001, 0923134005, 086592900...
                                                                                          ...                        
fffcc686584b3388a3afe410e3555e3557f556382b2f60c221d51914093e167b     [0817067002, 0840604001, 0723469001, 0416961006]
fffd0248a95c2e49fee876ff93598e2e20839e51b9b7678aab75d9e8f9f3c6c8    [0509091057, 0859737002, 0573085028, 074568600...
fffef3b6b73545df065b521e19f64bf6fe93bfd450ab

Next we do exponential weighting based popularity for items. This leads to items bought more recently having more weight in the popularity list.

In [5]:
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/np.exp((datetime.datetime(2020,9,16) - x).days))
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()
_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

train['pop_factor'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


count    5.579580e+05
mean     3.002452e-02
std      8.126532e-02
min      3.059023e-07
25%      6.144212e-06
50%      3.354626e-04
75%      6.737947e-03
max      3.678794e-01
Name: pop_factor, dtype: float64

### Moving on to Validation ...

In [6]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

Items bought by users in the validation period. Similar as the one for train set.

In [7]:
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)
positive_items_val, len(positive_items_val)

(customer_id
 00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793                                         [0624486001]
 0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55                                         [0827487003]
 000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9                 [0757926001, 0788575004, 0640021019]
 000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565                                         [0874110016]
 00077dbd5c4a4991e092e63893ccf29294a9d5c46e85010e95f2fc10bf9437a4    [0903762001, 0879189005, 0158340001, 086796600...
                                                                                           ...                        
 fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82bd11d746ddb223dff26                             [0874816003, 0911870004]
 fffa7d7799eb390a76308454cbdd76e473d65b1497fbe44fe8cf95effea0bed7                             [0861803014, 0849886010]
 fffae8eb3a282d8c43c77dd2ca0621703b

In [8]:
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []

for i,user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))

68984it [00:00, 150889.45it/s]

Total users in validation: 68984





We'll now validate our algo on the validation set.

In [9]:
from collections import Counter
outputs = []
cnt = 0

for user in tqdm(val_users):
    if user not in positive_items_per_user.keys():
        cnt+=1
        outputs.append(popular_items[0:12])
        continue
    
    most_common_items_of_user = [k for k, v in Counter(positive_items_per_user[user]).most_common()]
    
    user_output = list(most_common_items_of_user[:12])
            
    user_output = user_output + list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
print("mAP Score on Validation set:", mapk(val_items, outputs))

100%|██████████| 68984/68984 [00:00<00:00, 131066.18it/s]


mAP Score on Validation set: 0.02249134163309251


0.0225 mAP. Not bad! We'll now create submissions by adjusting the date ranges of the train set.

### Prediction on Test Set

In [10]:
data = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={'article_id':str})

data["t_dat"] = pd.to_datetime(data["t_dat"])
train = data.loc[data["t_dat"] >= datetime.datetime(2020,9,7)]

positive_items_per_user = train.groupby(['customer_id'])['article_id'].apply(list)

train['pop_factor'] = train['t_dat'].apply(lambda x: 1/np.exp((datetime.datetime(2020,9,23) - x).days))
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()
_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
submission = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [12]:
from collections import Counter
outputs = []
cnt = 0

for user in tqdm(submission['customer_id']):
    if user not in positive_items_per_user.keys():
        cnt+=1
        outputs.append(popular_items[0:12])
        continue
    
    most_common_items_of_user = [k for k, v in Counter(positive_items_per_user[user]).most_common()]
    
    user_output = list(most_common_items_of_user[:12])
            
    user_output = user_output + list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
str_outputs = []
for output in outputs:
    str_outputs.append(" ".join([str(x) for x in output]))

100%|██████████| 1371980/1371980 [00:06<00:00, 223987.45it/s]


In [13]:
submission['prediction'] = str_outputs
submission.to_csv("submissions.csv", index=False)

In [14]:
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243002 0448509014 0751471001 0866731001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243002 0448509014 0751471001 0866731001 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243002 0448509014 0751471001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243002 0448509014 0751471001 0866731001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243002 0448509014 0751471001 0866731001 07...
