# Setting up

In [1]:
import pandas as pd
import numpy as np

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

import tensorflow as tf
import tensorflow_recommenders as tfrs

import seaborn as sns
from matplotlib import pyplot as plt

from tqdm.notebook import tqdm
from pathlib import Path
from typing import Dict, Text
import random
from collections import Counter

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Getting data

## Inspecting data

In [3]:
pd.read_csv("../data/../data/hmdata/articles.csv.zip", nrows=5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [4]:
pd.read_csv("../data/../data/hmdata/customers.csv.zip", nrows=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [5]:
pd.read_csv('../data/../data/hmdata/transactions_train.csv.zip', nrows=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## Splitting data

In [6]:
def get_train_test_dates(all_data_df):
    all_data_df['t_dat'] = pd.to_datetime(all_data_df['t_dat'])
    test_end_date = (all_data_df['t_dat'].max() + pd.Timedelta(days=1))
    test_start_date = test_end_date - pd.Timedelta(days=7)
    train_start_date = test_start_date - pd.Timedelta(days=7*6)
    
    print('train start date:', train_start_date)
    print('test start date:', test_start_date)
    print('test end date:', test_end_date)
    
    return train_start_date, test_start_date, test_end_date

In [7]:
def get_df(
    start_date,
    end_date
):
    df = all_data_df.loc[
        (all_data_df['t_dat'] >= start_date)
        & (all_data_df['t_dat'] < end_date),
        :
    ]
    
    return df

def get_train_df():
    train_start_date, test_start_date, test_end_date = get_train_test_dates(all_data_df)
    train_df = get_df(train_start_date, test_start_date)
    
    return train_df

def get_test_df():
    train_start_date, test_start_date, test_end_date = get_train_test_dates(all_data_df)
    test_df = get_df(test_start_date, test_end_date)
    
    return test_df

## Cleaning data

In [8]:
def clean_article_id(
    df
):
    df = df.copy()
    df['article_id'] = df['article_id'].astype(str)
    # zfill adds zeros (0) at the beginning of the string, until it reaches the specified length
    df['article_id'] = df['article_id'].apply(lambda x: x.zfill(10))
    
    return df

## Munging data

In [9]:
article_columns = [
    'garment_group_name'
]

def add_article_features(
    df
):
    df = df.join(
        article_df.set_index('article_id')[article_columns],
        on='article_id'
    )
    
    return df

In [10]:
customer_columns = [
    'age'
]

def add_customer_features(
    df
):
    df = df.join(
        customer_df.set_index('customer_id')[customer_columns],
        on='customer_id'
    )
    
    return df

In [11]:
def add_features(
    df
):
    df = add_article_features(df)
    df = add_customer_features(df)
    
    return df

## Preparing data

In [12]:
article_df = pd.read_csv("../data/../data/hmdata/articles.csv.zip")
article_df = clean_article_id(article_df)

In [13]:
customer_df = pd.read_csv("../data/../data/hmdata/customers.csv.zip")

In [14]:
all_data_df = pd.read_csv('../data/../data/hmdata/transactions_train.csv.zip')

In [15]:
train_df = get_train_df()
train_df = clean_article_id(train_df)
train_df = add_features(train_df)

train_df.head()

train start date: 2020-08-05 00:00:00
test start date: 2020-09-16 00:00:00
test end date: 2020-09-23 00:00:00


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,garment_group_name,age
29933918,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,751471042,0.032746,2,Trousers,22.0
29933919,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,824499006,0.065525,2,Dressed,22.0
29933920,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,824499006,0.065525,2,Dressed,22.0
29933921,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,913688001,0.032746,2,Blouses,22.0
29933922,2020-08-05,0013bde09d10db6b0a6a3b0987ac60b643013dfc6f924b...,776237020,0.025407,2,Swimwear,27.0


In [16]:
test_df = get_test_df()
test_df = clean_article_id(test_df)
test_df = add_features(test_df)

test_df.head()

train start date: 2020-08-05 00:00:00
test start date: 2020-09-16 00:00:00
test end date: 2020-09-23 00:00:00


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,garment_group_name,age
31548013,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,786022008,0.048441,2,Dresses Ladies,42.0
31548014,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,913272003,0.032288,2,Knitwear,42.0
31548015,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,889669006,0.056508,2,Trousers,42.0
31548016,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,237347060,0.033881,1,Jersey Basic,25.0
31548017,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,562245001,0.013542,1,Trousers,25.0


In [17]:
print('number of training samples:', len(train_df))
print('number of test samples:', len(test_df))

number of training samples: 1614095
number of test samples: 240311


# Building candidate generators

## Helpers

In [18]:
all_train_articles = set(train_df['article_id'])
len(all_train_articles)

32792

In [19]:
all_train_customers = set(train_df['customer_id'])
len(all_train_customers)

312215

## Random

In [20]:
def get_random_candidates(
    customer_id,
    num_candidates,
    all_articles=None
):
    if not all_articles:
        all_articles = set(train_df['article_id'])
    
    random_articles = list(random.sample(all_articles, num_candidates))
    
    return random_articles

In [21]:
# with all articles specified
get_random_candidates(
    customer_id=None,
    num_candidates=3,
    all_articles=all_train_articles
)

['0794321002', '0901452003', '0894766003']

In [22]:
# with all articles calculated
get_random_candidates(
    customer_id=None,
    num_candidates=3
)

['0863308002', '0841668001', '0589662003']

## Most popular

In [23]:
def get_most_popular_articles(
    num_articles
):
    articles_counter = Counter(train_df['article_id'])
    most_popular_articles = [article_id for article_id, _ in articles_counter.most_common(num_articles)]
    
    return most_popular_articles

In [24]:
most_popular_articles_3 = get_most_popular_articles(3)
most_popular_articles_100 = get_most_popular_articles(100)
most_popular_articles_1000 = get_most_popular_articles(1000)

In [25]:
def get_most_popular_candidates(
    customer_id,
    num_candidates,
    most_popular_articles=None
):
    if not most_popular_articles:
        most_popular_articles = get_most_popular_articles(num_candidates)
    
    assert len(most_popular_articles) == num_candidates,\
    f'most_popular_articles wrong length: {len(most_popular_articles)} != {num_candidates}'
    
    return most_popular_articles

In [26]:
# with most popular articles specified
get_most_popular_candidates(
    customer_id=None,
    num_candidates=3,
    most_popular_articles=most_popular_articles_3
)

['0751471001', '0706016001', '0918292001']

In [27]:
# with most popular articles calculated
get_most_popular_candidates(
    customer_id=None,
    num_candidates=3
)

['0751471001', '0706016001', '0918292001']

In [28]:
# with most popular articles incorrectly specified
try:
    get_most_popular_candidates(
        customer_id=None,
        num_candidates=3,
        most_popular_articles=[1, 2]
    )
except AssertionError as e:
    print(e)

most_popular_articles wrong length: 2 != 3


## Most popular customer category

- Find the categories that the customer has previously purchased from
- Find the most popular articles from these categories

In [29]:
def get_most_popular_customer_categories(
    customer_id,
    num_candidates,
    most_popular_articles=None
):
    customer_train_df = train_df.loc[train_df['customer_id'] == customer_id, :]
    if len(customer_train_df) == 0:
        raise Exception('no customer training data')
    
    customer_categories = set(customer_train_df['garment_group_name'])
    categories_train_df = train_df.loc[train_df['garment_group_name'].isin(customer_categories), :]
    
    articles_counter = Counter(categories_train_df['article_id'])
    most_popular_articles = [article_id for article_id, _ in articles_counter.most_common(num_candidates)]
    
    return most_popular_articles

In [30]:
get_most_popular_customer_categories(
    customer_id=train_df['customer_id'].sample(1).iloc[0],
    num_candidates=3
)

['0758034001', '0812668001', '0464297007']

In [31]:
def get_most_popular_customer_category_candidates(
    customer_id,
    num_candidates,
    most_popular_articles=None
):
    
    try:
        most_popular_category_articles = get_most_popular_customer_categories(customer_id, num_candidates)
        
        return most_popular_category_articles
    except:
        most_popular_articles = get_most_popular_candidates(
            customer_id,
            num_candidates,
            most_popular_articles
        )
        
        return most_popular_articles

In [32]:
get_most_popular_customer_category_candidates(
    customer_id=train_df['customer_id'].sample(1).iloc[0],
    num_candidates=3
)

['0918292001', '0916468003', '0915526001']

## Most popular customer price range

- Find the mean price of articles the customer has previously purchased
- Set the customer's price range to be within x% of this
- Find the most popular articles within the customer's price range

In [33]:
def get_most_popular_customer_price(
    customer_id,
    num_candidates,
    most_popular_articles=None
):
    customer_train_df = train_df.loc[train_df['customer_id'] == customer_id, :]
    if len(customer_train_df) == 0:
        raise Exception('no customer training data')
    
    customer_mean_price = customer_train_df['price'].mean()
    customer_lower_price = customer_mean_price * 0.8
    customer_upper_price = customer_mean_price * 1.2
    
    price_train_df = train_df.loc[
        train_df['price'].between(customer_lower_price, customer_upper_price), :]
    
    articles_counter = Counter(price_train_df['article_id'])
    most_popular_articles = [article_id for article_id, _ in articles_counter.most_common(num_candidates)]
    
    return most_popular_articles

In [34]:
get_most_popular_customer_price(
    customer_id=train_df['customer_id'].sample(1).iloc[0],
    num_candidates=3
)

['0751471001', '0706016001', '0915526001']

In [35]:
def get_most_popular_customer_price_candidates(
    customer_id,
    num_candidates,
    most_popular_articles=None
):
    
    try:
        most_popular_category_articles = get_most_popular_customer_price(customer_id, num_candidates)
        
        return most_popular_category_articles
    except:
        most_popular_articles = get_most_popular_candidates(
            customer_id,
            num_candidates,
            most_popular_articles
        )
        
        return most_popular_articles

In [36]:
get_most_popular_customer_price_candidates(
    customer_id=train_df['customer_id'].sample(1).iloc[0],
    num_candidates=3
)

['0751471001', '0918292001', '0706016001']

## Two tower model

- Two tower model using customer and article embeddings only

### Getting data

In [37]:
two_tower_features = [
    'customer_id',
    'article_id'
]

batch_size = 2048

tf.random.set_seed(42)

train_ds = tf.data.Dataset.from_tensor_slices(dict(train_df[two_tower_features]))
train_ds = train_ds.shuffle(100_000).batch(batch_size)

test_ds = tf.data.Dataset.from_tensor_slices(dict(test_df[two_tower_features]))
test_ds = test_ds.batch(batch_size)

In [38]:
articles = test_ds.map(lambda x: x['article_id'])

### Building model

In [96]:
customer_article_embedding_dimension = 64

In [97]:
customer_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary=np.array(list(all_train_customers))
    ),
    tf.keras.layers.Embedding(
        input_dim=len(all_train_customers) + 1,
        output_dim=customer_article_embedding_dimension
    ),
    # Removed non-linear fully-connected layer
])

In [98]:
article_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary=np.array(list(all_train_articles))
    ),
    tf.keras.layers.Embedding(
        input_dim=len(all_train_articles) + 1,
        output_dim=customer_article_embedding_dimension
    ),
    # Removed non-linear fully-connected layer
])

In [99]:
class TwoTowerModel(tfrs.Model):
    
    def __init__(
        self,
        customer_model,
        article_model
    ):
        super().__init__()
        
        self.customer_model: tf.keras.Model = customer_model
        self.article_model: tf.keras.Model = article_model
            
        self.task = tfrs.tasks.Retrieval()
    
    def compute_loss(
        self,
        features: Dict[Text, tf.Tensor],
        training=False
    ):
        customer_embeddings = self.customer_model(features['customer_id'])
        article_embeddings = self.article_model(features['article_id'])
        
        loss = self.task(
            query_embeddings=customer_embeddings,
            candidate_embeddings=article_embeddings
        )
        
        return loss

### Training model

In [100]:
two_tower_model = TwoTowerModel(customer_model, article_model)
two_tower_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01))

In [101]:
two_tower_history = two_tower_model.fit(
    train_ds,
    epochs=4,
    validation_data=test_ds
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


### Making recommendations

In [102]:
def get_two_tower_candidates(
    customer_id,
    num_candidates
):
    index = tfrs.layers.factorized_top_k.BruteForce(
        query_model=two_tower_model.customer_model,
        k=num_candidates
    )
    
    index.index_from_dataset(
        candidates=tf.data.Dataset.zip((articles, articles.map(two_tower_model.article_model)))
    )
    
    _, candidates_tensor = index(tf.constant([customer_id]))
    candidates = [i.decode('utf-8') for i in candidates_tensor.numpy().tolist()[0]]
    
    
    return candidates

In [103]:
get_two_tower_candidates(
    customer_id=train_df['customer_id'].sample(1).iloc[0],
    num_candidates=3
)

['0556255001', '0556255001', '0556255001']

# Evaluating candidate generators

## Helpers

In [104]:
def get_customer_test_purchases(
    customer_id
):
    test_purchases = set(test_df.loc[test_df['customer_id'] == customer_id, 'article_id'])

    return test_purchases

In [105]:
def calculate_recall(
    candidates,
    test_purchases
):
    true_positives = set(candidates).intersection(set(test_purchases))
    recall = len(true_positives) / len(test_purchases)
    
    return recall

In [106]:
all_train_customers = set(train_df['customer_id'])
all_test_customers = set(test_df['customer_id'])

common_customers = all_train_customers.intersection(all_test_customers)

print('number of common customers:', len(common_customers))

number of common customers: 37522


In [107]:
sampled_common_customers_100 = random.sample(common_customers, 100)
sampled_common_customers_1000 = random.sample(common_customers, 1000)

In [108]:
def calculate_candidate_generator_recall(
    get_candidate_fn,
    num_candidates,
    common_customers,
    get_candidate_fn_kwargs=None,
):
    recall_values = []
    for customer_id in common_customers:
        if get_candidate_fn_kwargs:
            candidates = get_candidate_fn(customer_id, num_candidates, **get_candidate_fn_kwargs)
        else:
            candidates = get_candidate_fn(customer_id, num_candidates)
        
        test_purchases = get_customer_test_purchases(customer_id)
        recall = calculate_recall(candidates, test_purchases)
        recall_values.append(recall)
    
    average_recall = np.mean(recall_values)
    
    return average_recall

## Random

### num_candidates = 100

In [109]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_random_candidates,
    num_candidates=100,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(all_articles=all_train_articles),
)

0.002302631578947368

### num_candidates = 1000

In [110]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_random_candidates,
    num_candidates=1000,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(all_articles=all_train_articles),
)

0.027884784921820525

## Most popular

### num_candidates = 100

In [111]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_candidates,
    num_candidates=100,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_100),
)

0.08124936561976036

### num_candidates = 1000

In [112]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_candidates,
    num_candidates=1000,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_1000),
)

0.3248333492230164

## Most popular customer category

### num_candidates = 100

In [113]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_customer_category_candidates,
    num_candidates=100,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_100),
)

0.08022992570295201

### num_candidates = 1000

In [114]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_customer_category_candidates,
    num_candidates=1000,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_1000),
)

0.2687018737841106

## Most popular customer price range

### num_candidates = 100

In [115]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_customer_price_candidates,
    num_candidates=100,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_100),
)

0.061017261758709126

### num_candidates = 1000

In [116]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_most_popular_customer_price_candidates,
    num_candidates=1000,
    common_customers=sampled_common_customers_1000,
    get_candidate_fn_kwargs=dict(most_popular_articles=most_popular_articles_1000),
)

0.20214366551491164

## Two tower model

### num_candidates = 100

In [73]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_two_tower_candidates,
    num_candidates=100,
    common_customers=sampled_common_customers_1000
)

0.0

### num_candidates = 1000

In [74]:
calculate_candidate_generator_recall(
    get_candidate_fn=get_two_tower_candidates,
    num_candidates=1000,
    common_customers=sampled_common_customers_1000
)

0.002