In [1]:
# library
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: this silences every warning category for the rest of the session,
# including NumPy RuntimeWarnings (divide-by-zero / invalid value), so numerical
# problems such as NaN scores will not be reported. Narrow the filter when debugging.
warnings.filterwarnings('ignore')

## Collaborative Filtering Implementation

In [None]:
class CollaborativeFiltering:
    """User-based collaborative filtering on a member x item purchase-count matrix.

    The transactions are pivoted into a utility matrix (one row per member,
    one column per item, values = number of times the member bought the item).
    An item's score for a member is the similarity-weighted average of the
    purchase counts of the members who bought that item, using cosine
    similarity between the members' rows.
    """

    def __init__(self, data, pattern=None):
        """Load the transactions and build the utility matrix.

        Parameters
        ----------
        data : str, path-like or file-like
            Passed straight to ``pd.read_csv``. The table must contain the
            columns ``Member_number`` and ``itemDescription``.
        pattern : optional
            Not used; kept so existing callers keep working.
        """
        self.data = pd.read_csv(data)
        self.data_processing()

    def data_processing(self):
        """Build the utility matrix and the lookup structures derived from it.

        Sets ``utility_matrix`` (DataFrame), ``utility_matrix_numpy`` (ndarray),
        ``member_id_to_np_index_map`` (member id -> row index) and
        ``item_description`` (column index -> item name).

        Returns
        -------
        pandas.DataFrame
            The utility matrix.
        """
        # Each raw row is one purchased item, so every row contributes a count of 1.
        self.data['count'] = 1

        # Wide form: rows are members, columns are items, values are summed counts.
        self.utility_matrix = pd.pivot_table(
            self.data,
            index='Member_number',
            columns='itemDescription',
            values='count',
            aggfunc='sum',
            fill_value=0,
        )

        # NumPy copy of the matrix for fast numeric work.
        self.utility_matrix_numpy = self.utility_matrix.to_numpy()

        # Member id -> row position in the NumPy matrix.
        self.member_id_to_np_index_map = {
            member_id: row for row, member_id in enumerate(self.utility_matrix.index)
        }

        # Column position -> item name.
        self.item_description = self.utility_matrix.columns.to_list()

        return self.utility_matrix

    def cosine_similarity(self, user_1, user_2):
        """Return the cosine similarity between two purchase-count vectors.

        Returns 0.0 when either vector has zero length, instead of the NaN
        that a 0/0 division would produce.
        """
        dot_product = np.dot(user_1, user_2)
        magnitude_of_vector = np.sqrt(np.sum(user_1 ** 2) * np.sum(user_2 ** 2))
        if magnitude_of_vector == 0:
            return 0.0
        return dot_product / magnitude_of_vector

    def _similarity_to_all(self, member_index):
        """Cosine similarity between one member's row and every row of the matrix."""
        matrix = self.utility_matrix_numpy
        squared_norms = np.sum(matrix ** 2, axis=1)
        dot_products = matrix @ matrix[member_index]
        magnitudes = np.sqrt(squared_norms * squared_norms[member_index])

        # Rows with zero magnitude keep a similarity of 0 instead of becoming NaN.
        similarities = np.zeros(matrix.shape[0], dtype=float)
        np.divide(dot_products, magnitudes, out=similarities, where=magnitudes != 0)
        return similarities

    def _score_item(self, item_index, similarities):
        """Similarity-weighted average purchase count of one item among its buyers."""
        item_counts = self.utility_matrix_numpy[:, item_index]
        purchaser_indices = np.flatnonzero(item_counts)
        purchaser_similarity = similarities[purchaser_indices]

        total_similarity = np.sum(purchaser_similarity)
        if total_similarity == 0:
            # No buyer of this item shares anything with the member: score it 0
            # rather than NaN, which would corrupt the ranking done by sorted().
            return 0.0
        return np.dot(purchaser_similarity, item_counts[purchaser_indices]) / total_similarity

    def predict_purchase(self, item_index, member_index):
        """Predict a purchase score of an item for a member.

        Parameters
        ----------
        item_index : int
            Column index of the item in the utility matrix.
        member_index : int
            Row index of the member in the utility matrix.

        Returns
        -------
        float
            Weighted average of the buyers' purchase counts, weighted by their
            cosine similarity to the member; 0.0 when all weights are zero.
        """
        return self._score_item(item_index, self._similarity_to_all(member_index))

    def make_recommendation(self, member_number, recommend_top_k):
        """Recommend the highest-scoring items the member has not bought yet.

        Parameters
        ----------
        member_number : hashable
            Member id as it appears in ``Member_number``.
        recommend_top_k : int
            Maximum number of items to return.

        Returns
        -------
        list
            Item names ordered by decreasing predicted score.

        Raises
        ------
        KeyError
            If ``member_number`` is not present in the utility matrix.
        """
        member_index = self.member_id_to_np_index_map[member_number]

        # Similarities depend only on the member, so compute them once
        # instead of once per candidate item.
        similarities = self._similarity_to_all(member_index)

        item_not_bought_by_member_indices = np.flatnonzero(
            self.utility_matrix_numpy[member_index] == 0
        )
        weighted_scores = [
            (item_index, self._score_item(item_index, similarities))
            for item_index in item_not_bought_by_member_indices
        ]

        # sorted() is stable, so equally scored items keep their column order.
        top_items_to_recommend = sorted(
            weighted_scores, key=lambda pair: pair[1], reverse=True
        )[:recommend_top_k]
        return [self.item_description[item_index] for item_index, _ in top_items_to_recommend]
    


In [3]:
# Recommend items for a single member.
data = 'Groceries data train.csv'  # CSV file the recommender is built from
member_number = 1000               # member to recommend for
recommend_item = 5                 # how many items to return

# Build the recommender (reads the CSV and creates the utility matrix).
recommend_system = CollaborativeFiltering(data)

# Last expression of the cell: the recommended item names.
recommend_system.make_recommendation(
    member_number=member_number,
    recommend_top_k=recommend_item,
)

['whole milk',
 'other vegetables',
 'rolls/buns',
 'root vegetables',
 'tropical fruit']

## Evaluate Performance

In [4]:
# Load the test transactions and preview the first five rows.
test_data = pd.read_csv('Groceries data test.csv')
test_data.head(5)

Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,3481,8/03/2015,candy,2015,3,8,6
1,1254,19/04/2015,white wine,2015,4,19,6
2,2835,28/01/2014,domestic eggs,2014,1,28,1
3,2854,2/08/2015,coffee,2015,8,2,6
4,4637,12/08/2014,bottled water,2014,8,12,1


In [5]:
# Distinct member IDs that appear in the test data.
# A set is unordered, so sorting the list before building it had no effect;
# the redundant sort is dropped.
test_member_id = set(test_data['Member_number'].to_list())
print(f'Member in the test data: {len(test_member_id)}')

Member in the test data: 3566


In [6]:
def precision_at_k(recommended, ground_truth, k):
    """Return precision@k: the share of the top-k recommendations that are relevant.

    Parameters
    ----------
    recommended : sequence
        Recommended items, best first. Only the first ``k`` are considered.
    ground_truth : iterable
        Items that are actually relevant for the user.
    k : int
        Cut-off; must be positive. The hit count is always divided by ``k``,
        even when fewer than ``k`` items were recommended.

    Returns
    -------
    float
        Number of distinct relevant items among the top ``k``, divided by ``k``.

    Raises
    ------
    ValueError
        If ``k`` is not positive (a negative ``k`` would otherwise slice from
        the end of the list and yield a negative "precision").
    """
    if k <= 0:
        raise ValueError(f'k must be a positive integer, got {k!r}')
    top_k = recommended[:k]
    hits = len(set(top_k).intersection(ground_truth))
    return hits / k

In [11]:
# Evaluate precision@k for every test member that is also known to the recommender.
TOP_K = 5

# The utility matrix is built when the recommender is created; reuse it instead of
# rebuilding the whole pivot table on every loop iteration.
train_members = recommend_system.utility_matrix.index

# Group the ground-truth items per member once, instead of filtering the
# full test frame for each member inside the loop.
true_items_by_member = (
    test_data.groupby('Member_number')['itemDescription'].apply(list).to_dict()
)

precision_score_list = []
for member_id in test_member_id:
    # Members absent from the training data cannot be scored; skip them.
    if member_id not in train_members:
        continue
    true_items = true_items_by_member.get(member_id, [])
    recommended_items_for_user = recommend_system.make_recommendation(member_id, TOP_K)
    precision_score = precision_at_k(recommended_items_for_user, true_items, TOP_K)
    precision_score_list.append(precision_score)

In [14]:
# Mean precision@5 over the test members that also appear in the training data.
np.mean(precision_score_list)

0.08598870056497175