In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

### Preparing the data

We need to restrict the data respect to a minimum transaction date. In that way, we reduce the dimensionality of the problem and we get rid of transactions that are not important in terms of the time decaying popularity.

Also, we are getting rid of articles that have not been bought enough. (Minimum 10 purchases are required)

In [2]:
#transactions = pd.read_csv('transactions_train.csv', dtype={'article_id':str})
transactions = pd.read_csv('transactions_train.csv')
transactions.drop(['sales_channel_id', 'price'], inplace=True, axis=1)
transactions['bought'] = 1

In [3]:
transactions.head(5)

Unnamed: 0,t_dat,customer_id,article_id,bought
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,1
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,1
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,1
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,1


In [4]:
start_date = datetime.datetime(2020,9,1)
# Filter transactions by date
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

In [5]:
print(f'earliest purchase date is: {transactions["t_dat"].min()}')
print(f'latest purchase date is: {transactions["t_dat"].max()}')

earliest purchase date is: 2018-09-20 00:00:00
latest purchase date is: 2020-09-22 00:00:00


In [6]:
transactions = transactions.loc[transactions["t_dat"] >= start_date]
# Filter transactions by number of an article has been bought
article_bought_count = transactions[['article_id', 't_dat']].groupby('article_id').count().reset_index().rename(columns={'t_dat': 'count'})
most_bought_articles = article_bought_count[article_bought_count['count']>10]['article_id'].values
transactions = transactions[transactions['article_id'].isin(most_bought_articles)]

In [15]:
most_bought_articles

array([111565001, 111586001, 111593001, ..., 953450001, 953763001,
       956217002], dtype=int64)

In [7]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,bought
30990057,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923134005,1
30990058,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,865929003,1
30990059,2020-09-01,0005ed68483efa39644c45185550a82dd09acb07622acb...,863646004,1
30990060,2020-09-01,0008f66f5804877333ab9942c880b4a222f5b1a9f6ce27...,574109042,1
30990061,2020-09-01,0008f66f5804877333ab9942c880b4a222f5b1a9f6ce27...,797079002,1
...,...,...,...,...
31788318,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,856440002,1
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,1
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,1


Due to the big amount of items, we can not consider the whole matrix in order to train. Therefore, we need to generate some negative samples: transactions that have never occured.

In [8]:
# Generate negative samples
np.random.seed(0)

negative_samples = pd.DataFrame({
    'article_id': np.random.choice(transactions.article_id.unique(), transactions.shape[0]),
    'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
    'bought': np.zeros(transactions.shape[0])
})

### Defining the model

Model will be based on recommendations computed through the time decaying popularity and the most similar items to those items bought the most times by each user. Similarity among items is computed through cosine distance.

In [9]:
customers = transactions.customer_id.values
articles = transactions.article_id.values
customer_id2index = {c: i for i, c in enumerate(np.unique(customers))}
article_id2index = {a: i for i, a in enumerate(np.unique(articles))}
len(customer_id2index)

185063

In [16]:
customer_id2index

{'00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657': 0,
 '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318': 1,
 '0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a': 2,
 '000172a9c322560c849754ffbdfdb2180d408aa7176b943f957804686be8e1f0': 3,
 '0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37e011580a479e80aa94': 4,
 '0001f8cef6b9702d54abf66fd89eb21014bf98567065a9b5e42f37bc99528cf5': 5,
 '0002cca4cc68601e894ab62839428e5f0696417fe0f9e84551c6827a7629d441': 6,
 '00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793': 7,
 '0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55': 8,
 '00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c': 9,
 '000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9': 10,
 '000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565': 11,
 '0005340aa69bb5a28d98712a36d8f669024bce137e3c82a618ea49032b9023c6': 12,
 '0005ed68483efa39644c45185550a82dd09acb07622acb17cff1304ed64

In [28]:
n_samples = transactions.shape[0]
training_indices = np.arange(n_samples)
np.random.shuffle(training_indices)
training_indices
x = np.random.rand(5,10)
y = np.random.rand(5,10)
similarity_matrix = cosine_similarity(x,y)
print(similarity_matrix)
similarity_matrix = np.argsort(similarity_matrix, axis=1)
print(similarity_matrix)
similarity_matrix = similarity_matrix[:, -12:]
print(similarity_matrix)


[[0.78052694 0.69801984 0.53506939 0.67392171 0.69830783]
 [0.86648361 0.7626745  0.83327608 0.72920463 0.68934503]
 [0.88678583 0.84517197 0.69845382 0.75061213 0.80928817]
 [0.57913918 0.72177055 0.43674513 0.53536101 0.85140563]
 [0.74979291 0.6936791  0.64349722 0.70908465 0.76772711]]
[[2 3 1 4 0]
 [4 3 1 2 0]
 [2 3 4 1 0]
 [2 3 0 1 4]
 [2 1 3 0 4]]
[[2 3 1 4 0]
 [4 3 1 2 0]
 [2 3 4 1 0]
 [2 3 0 1 4]
 [2 1 3 0 4]]


In [31]:
transactions_by_customer = transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
transactions_by_customer.loc[transactions_by_customer.groupby('customer_id').bought.idxmax()]['article_id'].values
len(transactions_by_customer)

653427

In [10]:
from sklearn.metrics.pairwise import cosine_similarity


class ItemBased_RecSys:
    ''' Collaborative filtering using a custom sim(u,u'). '''

    def __init__(self, positive_transactions, negative_transactions, num_components=10):
        ''' Constructor '''
        self.positive_transactions = positive_transactions
        self.transactions = pd.concat([positive_transactions, negative_transactions])
        self.customers = self.transactions.customer_id.values
        self.articles = self.transactions.article_id.values
        self.bought = self.transactions.bought.values
        self.num_components = num_components

        self.customer_id2index = {c: i for i, c in enumerate(np.unique(self.customers))}
        self.article_id2index = {a: i for i, a in enumerate(np.unique(self.articles))}
        
    def __sdg__(self):
        for idx in tqdm(self.training_indices):
            # Get the current sample
            customer_id = self.customers[idx]
            article_id = self.articles[idx]
            bought = self.bought[idx]

            # Get the index of the user and the article
            customer_index = self.customer_id2index[customer_id]
            article_index = self.article_id2index[article_id]

            # Compute the prediction and the error
            prediction = self.predict_single(customer_index, article_index)
            error = (bought - prediction) # error
            
            # Update latent factors in terms of the learning rate and the observed error
            self.customers_latent_matrix[customer_index] += self.learning_rate * \
                                    (error * self.articles_latent_matrix[article_index] - \
                                     self.lmbda * self.customers_latent_matrix[customer_index])
            self.articles_latent_matrix[article_index] += self.learning_rate * \
                                    (error * self.customers_latent_matrix[customer_index] - \
                                     self.lmbda * self.articles_latent_matrix[article_index])
                
                
    def fit(self, n_epochs=10, learning_rate=0.001, lmbda=0.1):
        ''' Compute the matrix factorization R = P x Q '''
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        n_samples = self.transactions.shape[0]
        
        # Initialize latent matrices
        self.customers_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.customers)), self.num_components))
        self.articles_latent_matrix = np.random.normal(scale=1., size=(len(np.unique(self.articles)), self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))
            self.training_indices = np.arange(n_samples)
            
            # Shuffle training samples and follow stochastic gradient descent
            np.random.shuffle(self.training_indices)
            self.__sdg__()

    def predict_single(self, customer_index, article_index):
        ''' Make a prediction for an specific user and article '''
        prediction = np.dot(self.customers_latent_matrix[customer_index], self.articles_latent_matrix[article_index])
        prediction = np.clip(prediction, 0, 1)
        
        return prediction

    def default_recommendation(self):
        ''' Calculate time decaying popularity '''
        # Calculate time decaying popularity. This leads to items bought more recently having more weight in the popularity list.
        # In simple words, item A bought 5 times on the first day of the train period is inferior than item B bought 4 times on the last day of the train period.
        self.positive_transactions['pop_factor'] = self.positive_transactions['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
        transactions_by_article = self.positive_transactions[['article_id', 'pop_factor']].groupby('article_id').sum().reset_index()
        return transactions_by_article.sort_values(by='pop_factor', ascending=False)['article_id'].values[:12]


    def predict(self, customers):
        ''' Make recommendations '''
        recommendations = []

        # Compute similarity matrix (cosine)
        similarity_matrix = cosine_similarity(self.articles_latent_matrix, self.articles_latent_matrix, dense_output=False)

        # Convert similarity matrix into a matrix containing the 12 most similar items' index for each item
        similarity_matrix = np.argsort(similarity_matrix, axis=1)
        similarity_matrix = similarity_matrix[:, -12:]

        # Get default recommendation (time decay popularity)
        default_recommendation = self.default_recommendation()

        # Group articles by user and articles to compute the number of times each article has been bought by each user
        transactions_by_customer = self.positive_transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
        most_bought_article = transactions_by_customer.loc[transactions_by_customer.groupby('customer_id').bought.idxmax()]['article_id'].values

        # Make predictions
        for customer in tqdm(customers):
            try:
                rec_aux1 = []
                rec_aux2 = []
                aux = []

                # Retrieve the most bought article by customer
                user_most_bought_article_id = most_bought_article[self.customer_id2index[customer]]

                # Using the similarity matrix, get the 6 most similar articles
                rec_aux1 = self.articles[similarity_matrix[self.article_id2index[user_most_bought_article_id]]]
                # Return the half of the default recommendation
                rec_aux2 = default_recommendation

                # Merge half of both recommendation lists
                for rec_idx in range(6):
                    aux.append(rec_aux2[rec_idx])
                    aux.append(rec_aux1[rec_idx])

                recommendations.append(' '.join(aux))
            except:
                # Return the default recommendation
                recommendations.append(' '.join(default_recommendation))
        
        return pd.DataFrame({
            'customer_id': customers,
            'prediction': recommendations,
        })

### Training

Define your hyperparameters and fit the model. Take into account that there are more customizable parameters in the data processing section.

In [11]:
rec = ItemBased_RecSys(transactions, negative_samples, num_components=1000)
rec.fit(n_epochs=20)

  0%|                                                                                      | 0/1485990 [00:00<?, ?it/s]

Epoch: 0


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:21<00:00, 18129.62it/s]
  0%|                                                                        | 1340/1485990 [00:00<02:05, 11845.99it/s]

Epoch: 1


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:19<00:00, 18598.09it/s]
  0%|                                                                        | 1642/1485990 [00:00<01:43, 14277.67it/s]

Epoch: 2


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:21<00:00, 18244.17it/s]
  0%|                                                                        | 2045/1485990 [00:00<01:23, 17695.55it/s]

Epoch: 3


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:17<00:00, 19105.40it/s]
  0%|                                                                        | 2257/1485990 [00:00<01:13, 20120.56it/s]

Epoch: 4


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:19<00:00, 18723.40it/s]
  0%|                                                                        | 2164/1485990 [00:00<01:15, 19670.78it/s]

Epoch: 5


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:16<00:00, 19548.42it/s]
  0%|                                                                        | 2090/1485990 [00:00<01:17, 19052.17it/s]

Epoch: 6


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:16<00:00, 19391.02it/s]
  0%|                                                                        | 1475/1485990 [00:00<01:54, 12990.34it/s]

Epoch: 7


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:16<00:00, 19515.25it/s]
  0%|                                                                        | 1886/1485990 [00:00<01:30, 16327.93it/s]

Epoch: 8


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:16<00:00, 19467.42it/s]
  0%|                                                                        | 2274/1485990 [00:00<01:15, 19712.81it/s]

Epoch: 9


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:18<00:00, 18951.97it/s]
  0%|                                                                                      | 0/1485990 [00:00<?, ?it/s]

Epoch: 10


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:25<00:00, 17481.52it/s]
  0%|                                                                        | 2111/1485990 [00:00<01:16, 19398.04it/s]

Epoch: 11


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:15<00:00, 19678.91it/s]
  0%|                                                                        | 2176/1485990 [00:00<01:11, 20749.01it/s]

Epoch: 12


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:20<00:00, 18428.54it/s]
  0%|                                                                        | 1799/1485990 [00:00<01:28, 16806.25it/s]

Epoch: 13


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:31<00:00, 16268.80it/s]
  0%|                                                                        | 1685/1485990 [00:00<01:41, 14598.09it/s]

Epoch: 14


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:37<00:00, 15204.06it/s]
  0%|                                                                        | 1901/1485990 [00:00<01:26, 17171.96it/s]

Epoch: 15


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:17<00:00, 19270.16it/s]
  0%|                                                                        | 2209/1485990 [00:00<01:15, 19753.87it/s]

Epoch: 16


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:16<00:00, 19412.25it/s]
  0%|                                                                        | 2225/1485990 [00:00<01:14, 19879.28it/s]

Epoch: 17


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:15<00:00, 19681.94it/s]
  0%|                                                                        | 2027/1485990 [00:00<01:22, 17962.21it/s]

Epoch: 18


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:15<00:00, 19591.06it/s]
  0%|                                                                        | 1992/1485990 [00:00<01:23, 17674.57it/s]

Epoch: 19


100%|█████████████████████████████████████████████████████████████████████| 1485990/1485990 [01:15<00:00, 19702.66it/s]


### Generating a submission

In [12]:
customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').customer_id.unique()

FileNotFoundError: [Errno 2] No such file or directory: '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'

In [None]:
recommendations = rec.predict(customers)

In [None]:
recommendations.to_csv('submission.csv', index=False)

In [None]:
print(recommendations)