### 4.4 Method 4: Customers-based Recommendation

In [5]:
#Due to the large size of the dataset, we only use 0.05% of the transactions for modelling.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
tqdm.pandas()


#Load the raw transactions and the customer table.
transactions_df = pd.read_csv('./data/transactions_train.csv')
customer_df = pd.read_csv('./data/customers.csv')
#Keep a 0.05% random sample of the transactions. The seed is fixed so that the
#sample (and every number derived from it below) is reproducible between runs.
discarded_transactions, selected_transactions = train_test_split(
    transactions_df, test_size=0.0005, random_state=42
)
#Work on an explicit copy so the column assignment below does not act on a slice
#of transactions_df (which triggers pandas' SettingWithCopyWarning).
selected_transactions = selected_transactions.copy()

#Change string to datetime format
from datetime import datetime, timedelta
#Vectorised parse of the 'YYYY-MM-DD' strings instead of one strptime call per row.
selected_transactions['t_dat'] = pd.to_datetime(selected_transactions['t_dat'], format='%Y-%m-%d')

#import customer dataframe, and create a column called age_group based on customer age
def agegroup(x):
    """Map a customer age to an age-group label.

    Parameters
    ----------
    x : number
        Customer age.

    Returns
    -------
    int
        1 for ages up to 18, 2 for ages above 18 up to 35, 3 for ages above 35
        up to 50, and 4 otherwise. A value that fails every comparison
        (for example NaN) also ends up in group 4.
    """
    #Bounds are inclusive and checked in ascending order, so the first match wins.
    for group, upper_bound in ((1, 18), (2, 35), (3, 50)):
        if x <= upper_bound:
            return group
    return 4
#Label every customer with an age group (see agegroup) and keep only the join
#key and that label.
customer_df['age_group'] = customer_df['age'].apply(agegroup)
customer_df = customer_df[['customer_id', 'age_group']]

#merge age_group to selected_transactions
#(pd.merge defaults to an inner join, so only transactions whose customer_id is
#present in customer_df are kept)
selected_transactions = pd.merge(selected_transactions, customer_df, on='customer_id')

selected_transactions = selected_transactions[['t_dat','customer_id','article_id','age_group']]
#Series.max() instead of the builtin max(), which iterates the whole
#transaction column element by element in Python.
print(transactions_df['t_dat'].max())
#maximum date is '2020-09-22'

#Use the transactions in the latest 6 months as test set
#The cutoff is defined once so the test and training filters cannot drift apart.
cutoff_date = datetime.strptime('2020-03-22', '%Y-%m-%d')
#.copy(): the article_id column of test_set is overwritten during evaluation, so
#it must be an independent frame rather than a filtered slice.
test_set = selected_transactions[selected_transactions['t_dat'] >= cutoff_date].copy()
#Test set is around 24.9% of the selected dataset.

#Everything strictly before the cutoff is training data, collapsed to one row per
#(customer, article, age_group) with the number of transactions in 'counts'.
training_set = selected_transactions[selected_transactions['t_dat'] < cutoff_date]
training_set = training_set.groupby(["customer_id","article_id", "age_group"])["t_dat"].count().reset_index(name='counts')

#Build the customer x article matrix from the training counts: one row per
#customer_id, one column per article_id. Pairs that never occur in the training
#set are filled with 0, so a non-zero cell marks a purchase.
similarity_matrix = training_set.pivot_table(
    values='counts', index='customer_id', columns='article_id'
).fillna(0)

#Pairwise cosine similarity between the customer rows. The diagonal is zeroed so
#that a customer is never ranked as their own neighbour.
cosine = cosine_similarity(similarity_matrix)
np.fill_diagonal(cosine, 0)
similarity = pd.DataFrame(
    cosine, index=similarity_matrix.index, columns=similarity_matrix.index
)

#Find the top 20 similar customers for each customer
top_labels = ['top{}'.format(rank) for rank in range(1, 21)]


def _top_neighbours(row):
    """Return the ids of the 20 customers with the highest similarity in ``row``."""
    ranked = row.sort_values(ascending=False)
    return pd.Series(ranked.iloc[:20].index, index=top_labels)


#One row per customer, columns top1..top20 holding the neighbour customer_ids.
#NOTE(review): the 20 slots are always filled, so neighbours with a similarity
#of 0 can be included when fewer than 20 customers share a purchase.
neighbours = similarity.apply(_top_neighbours, axis=1)


#Total number of purchases of each article within each age group.
purchase_counts = (
    training_set
    .groupby(["age_group", "article_id"])['counts']
    .sum()
    .reset_index(name='counts')
)

#One dataframe per age group (labels 1 to 4), in label order.
age_group1, age_group2, age_group3, age_group4 = [
    purchase_counts.loc[purchase_counts['age_group'] == group_id]
    for group_id in (1, 2, 3, 4)
]

def popular_products(age_group):
    """Return the article_ids of the 12 most purchased products.

    Parameters
    ----------
    age_group : pandas.DataFrame
        Frame with at least the columns 'article_id' and 'counts'.

    Returns
    -------
    list
        Up to 12 article_ids, ordered by 'counts' from highest to lowest.
    """
    ranked = age_group.sort_values(by=['counts'], ascending=False)
    return ranked['article_id'].iloc[:12].tolist()

#Top 12 most popular products per age group. Position i in the list holds the
#products of age group i+1.
popular_product = [
    popular_products(group_frame)
    for group_frame in (age_group1, age_group2, age_group3, age_group4)
]


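#purchased_products takes in the customer-product matrix and a customer_id, and returns the list of products that customer has purchased.
#get_kitems_customer_recommendations (defined below) takes in the customer_id and the number of products to predict, and returns a list of product_id.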
def purchased_products(similarity_matrix, customer_id):
    """Return the products that ``customer_id`` has purchased.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Customer-by-product matrix: customers on the index, products on the
        columns. Any non-zero cell counts as a purchase.
    customer_id : hashable
        Index label of the customer to look up.

    Returns
    -------
    list
        Column labels whose value in that customer's row is non-zero.

    Raises
    ------
    KeyError
        If ``customer_id`` is not in the index of ``similarity_matrix``.
    """
    customer_rows = similarity_matrix.loc[[customer_id],]
    has_purchase = (customer_rows != 0).any()
    return customer_rows.columns[has_purchase].tolist()


def get_kitems_customer_recommendations(topk, user_to_recommend, neighbours, popular_products, similarity_matrix):
    """Recommend up to ``topk`` products for one customer.

    Products bought by the customer's neighbours come first, in neighbour
    order, skipping anything the customer already bought. If that gives
    fewer than ``topk`` products, the list is topped up from
    ``popular_products``. A customer without an entry in ``neighbours``
    simply gets the first ``topk`` popular products.

    Parameters
    ----------
    topk : int
        Maximum number of products to return.
    user_to_recommend : hashable
        customer_id of the customer to recommend products for.
    neighbours : pandas.DataFrame
        One row per customer (customer_id on the index); each row lists the
        customer_ids of that customer's neighbours, most similar first.
    popular_products : list
        Most popular products for the customer's age group, best first.
    similarity_matrix : pandas.DataFrame
        Customer-by-product purchase matrix, see ``purchased_products``.

    Returns
    -------
    list
        At most ``topk`` product ids without duplicates (as long as
        ``popular_products`` itself has none). The list is shorter than
        ``topk`` only when neighbours and popular products together do not
        provide enough distinct products.
    """
    if user_to_recommend not in neighbours.index:
        return popular_products[:topk]

    neighbour_customers = neighbours.loc[[user_to_recommend],].values[0]
    #A set makes the "already purchased" check O(1) per candidate.
    already_purchased = set(purchased_products(similarity_matrix, user_to_recommend))

    recommendation = []
    chosen = set()
    for neighbour in neighbour_customers:
        for item in purchased_products(similarity_matrix, neighbour):
            #Several neighbours may have bought the same product; keep only
            #its first occurrence so it does not use up more than one slot.
            if item in already_purchased or item in chosen:
                continue
            chosen.add(item)
            recommendation.append(item)
    recommendation = recommendation[:topk]

    #Top up with popular products. Products that are already recommended are
    #skipped, and a popular list shorter than the gap just ends the padding
    #instead of raising IndexError.
    for item in popular_products:
        if len(recommendation) >= topk:
            break
        if item in chosen:
            continue
        chosen.add(item)
        recommendation.append(item)
    return recommendation
    


2020-09-22


### 5.4 Model Evaluation

In [None]:
import warnings
#NOTE(review): this silences every warning for the rest of the session; kept
#because the surrounding cells may rely on it.
warnings.filterwarnings('ignore')
#for evaluation, we take the first 6 digits of article_id, which represents the product_code
#A customer's recommendations depend only on the customer and their age group,
#so they are computed once per distinct test customer rather than once per
#test transaction.
test_customers = test_set[['customer_id', 'age_group']].drop_duplicates()
#Rows are collected in a plain list and turned into a DataFrame once:
#DataFrame.append copied the whole frame on every call and no longer exists in
#pandas 2.x.
records = []
for customer, age_group in zip(test_customers['customer_id'], test_customers['age_group']):
    recommendations = get_kitems_customer_recommendations(12, customer, neighbours, popular_product[age_group-1], similarity_matrix)
    records.extend(
        {'customer_id': customer, 'article_id': str(product)[:6], 'predicted_y': 1}
        for product in recommendations
    )
results = pd.DataFrame(records, columns=['customer_id', 'article_id', 'predicted_y'])
#Reduce the true article_ids to the same 6-character product_code string.
test_set['article_id'] = test_set['article_id'].astype(str).str[:6]

def update_user_choices(row):
    """Flag whether the customer/article pair of ``row`` occurs in ``test_set``.

    Parameters
    ----------
    row : mapping
        Must provide 'customer_id' and 'article_id'.

    Returns
    -------
    int
        1 if the module-level ``test_set`` has at least one row with the same
        customer_id and article_id, otherwise 0.
    """
    same_customer = test_set.customer_id == row['customer_id']
    same_article = test_set.article_id == row['article_id']
    return int((same_customer & same_article).any())

#Compute MAP for the model
#Only customer_id and article_id are needed from here on. The per-row actual_y
#flag used to be computed and then dropped right away without being read, so
#it is no longer computed; errors="ignore" keeps the drop valid either way.
results = results.drop(columns=["predicted_y", "actual_y"], errors="ignore")


def _as_tokens(items):
    """Join items into one whitespace-separated string (inverse of ``str.split``)."""
    return ' '.join(str(item) for item in items)


#The item lists are serialised with single spaces. The earlier str(list)[1:-1]
#form kept the quotes and commas ("'123456', '234567'"), so after .split() a
#token such as "'123456'," never equalled "'123456'" and real hits were missed.
train_unq = results.groupby('customer_id')['article_id'].apply(list).reset_index()
train_unq['valid_pred'] = train_unq['article_id'].map(_as_tokens)
valid_unq = test_set.groupby('customer_id')['article_id'].apply(list).reset_index()
#dict.fromkeys removes repeated purchases of the same product while keeping
#order; duplicates would only inflate the denominator of the average precision.
valid_unq['valid_true'] = valid_unq['article_id'].map(lambda x: _as_tokens(dict.fromkeys(x)))

#Pair each customer's predictions with their true purchases and keep only the
#customers that have at least one true purchase.
merged = pd.merge(train_unq, valid_unq, on='customer_id', how='left').fillna('')
merged = merged.drop(["article_id_x","article_id_y"],axis=1)
merged = merged[merged['valid_true']!=''].reset_index(drop=True)

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            ``actual`` is empty or ``k`` is not positive
    """
    # Nothing can be hit in these cases, and the normalisation below would
    # otherwise divide by zero.
    if k <= 0 or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    The i-th entry of ``actual`` is scored against the i-th entry of
    ``predicted`` with ``apk``, and the per-pair scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_pair_scores = []
    for true_items, predicted_items in zip(actual, predicted):
        per_pair_scores.append(apk(true_items, predicted_items, k))
    return np.mean(per_pair_scores)



# MAP calculation
# Each customer's true and predicted strings are split on whitespace into item
# lists and scored at k=12. The call stays a bare final expression, so its value
# is the cell's result.
mapk(
    merged['valid_true'].map(str.split),
    merged['valid_pred'].map(str.split),
    k=12,
)
