# User based collaborative filtering to make recommendtaion system 

## Recommendation System — Use of Collaborative Filtering and Hybrid Collaborative — Content in Retail using LightFM library on Market basket dataset

 The "user-item interaction matrix" defines the interaction between the user (customer) to the item (product), and in the reatail case we take 
“number of purchases” as the rating. 
EG:  If a customer A bought product B 10 times, then we can say customer A rated product B with rating 10. You can also take into account binary ratings where 1 refers to customer A had bought or 0 as had never bought product B.

The "item-feature interaction matrix" defines the features of the items. Item to features can be represented as a product to its metadata such as the product’s category, sub-category, or even any pieces of information.

In [87]:
import pandas as pd # pandas for data manipulation
import numpy as np # numpy for sure
from scipy.sparse import coo_matrix # for constructing sparse matrix
# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import auc_score
# timing
import time

### Importing data

In [12]:
#IMPORTING DATA
maret_basket_analysis = pd.read_csv (r'H:\intern_lovelocal\ML_Project\store_product_orders_data_modified.csv')
maret_basket_analysis= maret_basket_analysis[['customer_id','product_name','product_category_name']] #Taking required columns

In [88]:
maret_basket_analysis.head()

Unnamed: 0,customer_id,product_name,product_category_name
0,4941c8cacc20474cae212b33518e1fe5,Gemini Refined Sunflower Oil Jar (5 L),Edible Oils & Ghee
1,4941c8cacc20474cae212b33518e1fe5,Aashirvaad Superior MP Atta (1 kg),"Foodgrains, Oil & Masala"
2,4b8e778049aa4801b33978eba68e863d,Mix Dryfruits 500 Gm,Rice & Rice Products
3,4e5cbd6b7660492286cf5934a651384d,McCain Smiles (1.25 kg),Dairy
4,4e5cbd6b7660492286cf5934a651384d,Svt Amla Fizz 300Ml,Beverages


In [22]:
len(maret_basket_analysis)

83895

## Data preparation for LightFM library

### Users         (unique customer_id list)
### Items         (all unique product names list)
### Features      (all unique product name category)

In [165]:
users= np.sort(maret_basket_analysis['customer_id'].unique())
items= np.sort(maret_basket_analysis['product_name'].unique())
features= np.sort(maret_basket_analysis['product_category_name'].unique())

In [166]:
users

array(['0001370ecc044bbdaf598d8c229fe42b',
       '00017f22595b4f3a9ad38570d84565c0',
       '0005748202924e7ab74ea337c738dc27', ...,
       'fff9f95e83bd4ef5929f9fb2e073dabf',
       'fffb8874765c4e2a8ded950640209564',
       'ffff5d4c654541dbb2d9aa8643f14480'], dtype=object)

In [167]:
items

array(['\t Whisper Choic Ultra Xl (6 pads)', ' GOWARDHAN PANEER1 PC',
       ' Glucon D Energy Drink Pure Glucose Tangy Orange (100 g)', ...,
       'vegge', 'vim bar 4x120g=480g', 'vim gel(155 ml)'], dtype=object)

In [168]:
features

array(['APP EXCLUSIVE DEALS', 'All Purpose Cleaners', 'All products',
       'Antiseptics & Bandages', 'Appliances', 'Appliances & Electricals',
       'Atta, Flours & Sooji', 'Baby Care', 'Baby Food & Formula',
       'Bakery Snacks', 'Bakery, Cakes & Dairy', 'Bath',
       'Bath & Hand Wash', 'Bath & Shower', 'Beauty & Hygiene',
       'Beverages', 'Bins & Bathroom Ware', 'Biscuits & Cookies', 'Body',
       'Breads & Buns', 'Breakfast Cereals', 'Burger Pizza and Pasta',
       'Cakes & Pastries', 'Car & Shoe Care', 'Cereals & Breakfast',
       "Children's Day Special", 'Chinese', 'Chips & Corn Snacks',
       'Chocolates', 'Chocolates & Biscuits', 'Chocolates & Candies',
       'Cleaning & Household', 'Coffee', 'Combos and Meals',
       'Cookies, Rusk & Khari', 'Cooking & Baking Needs',
       'Cookware & Non Stick', 'Cuts & Sprouts', 'Dairy',
       'Dairy & Cheese', 'Dals & Pulses', 'Dental Care',
       'Detergents & Dishwash', 'Disposables, Garbage Bag',
       'Diwali Special

### 1. Customers/users mapping 
### user_id, product_name, product_count
### user to product rating (user_id with the count of the products baught)

In [89]:
user_to_product_rating = maret_basket_analysis.pivot_table(index=['customer_id','product_name'], aggfunc='size').sort_values(ascending= False)
#Converting from series to dataframe
user_to_product_rating = user_to_product_rating.to_frame()
#NAming third column
user_to_product_rating = user_to_product_rating.rename(columns = {0: "product_count"})
#Setting the Index
user_to_product_rating.reset_index(inplace= True)
user_to_product_rating = user_to_product_rating.rename(columns = {'customer_id':"user_id"})
user_to_product_rating.head()


Unnamed: 0,user_id,product_name,product_count
0,552fac652e2e4a469e48df9f86e845bb,Potato,43
1,552fac652e2e4a469e48df9f86e845bb,Tomato,41
2,552fac652e2e4a469e48df9f86e845bb,Lady Finger,34
3,552fac652e2e4a469e48df9f86e845bb,Onion,34
4,43a1871f2529414eb16cc4b9fd456edd,Colgate Toothbrush Zig Zag Black (1 pc),32


### Splitting the data set into train and test data

In [91]:
from sklearn.model_selection import train_test_split
# create training and testing vars
user_to_product_rating_train, user_to_product_rating_test = train_test_split(user_to_product_rating, test_size=0.2)

###  2.  product category name(fearture) mapping 

In [92]:
product_to_feature = maret_basket_analysis.pivot_table(index=['product_name','product_category_name'], aggfunc='size').sort_values(ascending= False)
#Converting from series to dataframe
product_to_feature = product_to_feature.to_frame()
#NAming third column
product_to_feature = product_to_feature.rename(columns = {0: "feature_count",'product_category_name':"feature"})
#Setting the Index
product_to_feature.reset_index(inplace= True)
product_to_feature = product_to_feature.rename(columns = {'product_category_name':"feature"})
product_to_feature.head()


Unnamed: 0,product_name,feature,feature_count
0,Sugar (1 kg),"Salt, Sugar & Jaggery",2168
1,Tata Salt (1kg),"Salt, Sugar & Jaggery",633
2,Toor Dal (1 kg),Dals & Pulses,620
3,Onion,Fresh Vegetables,577
4,Red Potato,Fresh Vegetables,456


## Helper functions

###  To generate mapping, LightFM library can't read other than (integer) index hence converting them to integers

In [93]:
def id_mappings(user_list, item_list, feature_list):
    """
    
    Create id mappings to convert user_id, item_id, and feature_id
    
    """
    user_to_index_mapping = {}
    index_to_user_mapping = {}
    for user_index, user_id in enumerate(user_list):
        user_to_index_mapping[user_id] = user_index
        index_to_user_mapping[user_index] = user_id
        
    item_to_index_mapping = {}
    index_to_item_mapping = {}
    for item_index, item_id in enumerate(item_list):
        item_to_index_mapping[item_id] = item_index
        index_to_item_mapping[item_index] = item_id
        
    feature_to_index_mapping = {}
    index_to_feature_mapping = {}
    for feature_index, feature_id in enumerate(feature_list):
        feature_to_index_mapping[feature_id] = feature_index
        index_to_feature_mapping[feature_index] = feature_id
        
        
    return user_to_index_mapping, index_to_user_mapping, \
           item_to_index_mapping, index_to_item_mapping, \
           feature_to_index_mapping, index_to_feature_mapping


### To calculate interaction matrix

In [103]:
def get_interaction_matrix(df, df_column_as_row, df_column_as_col, df_column_as_value, row_indexing_map, 
                          col_indexing_map):
    
    row = df[df_column_as_row].apply(lambda x: row_indexing_map[x]).values
    col = df[df_column_as_col].apply(lambda x: col_indexing_map[x]).values
    value = df[df_column_as_value].values
    
    return coo_matrix((value, (row, col)), shape = (len(row_indexing_map), len(col_indexing_map)))

In [104]:
# generate user_item_interaction_matrix for train data
user_to_product_interaction_train = get_interaction_matrix(user_to_product_rating_train, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# generate user_item_interaction_matrix for test data
user_to_product_interaction_test = get_interaction_matrix(user_to_product_rating_test, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# generate item_to_feature interaction
product_to_feature_interaction = get_interaction_matrix(product_to_feature, "product_name", "feature",  "feature_count", 
                                                        item_to_index_mapping, feature_to_index_mapping)

## Applying LightFM Cross Validation

using pure collaborative filtering, not adding some item features as consideration



In [None]:
# initialising model with warp loss function
model_without_features = LightFM(loss = "warp")

In [105]:
# fitting into user to product interaction matrix only / pure collaborative filtering factor
start = time.time()
#===================

model_without_features.fit(user_to_product_interaction_train,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

# auc metric score (ranging from 0 to 1)

start = time.time()
#===================

auc_without_features = auc_score(model = model_without_features, 
                        test_interactions = user_to_product_interaction_test,
                        num_threads = 4, check_intersections = False)
#===================
end = time.time()

print("time taken = {0:.{1}f} seconds".format(end - start, 2))
print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_without_features.mean(), 2))


time taken = 0.28 seconds
time taken = 10.76 seconds
average AUC without adding item-feature interaction = 0.75


### The hybrid collaborative — content based by adding products/items and features interactions with the code below

In [None]:
# initialising model with warp loss function
model_with_features = LightFM(loss = "warp")

In [130]:
# fitting the model with hybrid collaborative filtering + content based (product + features)
start = time.time()
#===================


model_with_features.fit(user_to_product_interaction_train,
          user_features=None, 
          item_features=product_to_feature_interaction, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

start = time.time()
#===================
auc_with_features = auc_score(model = model_with_features, 
                        test_interactions = user_to_product_interaction_test,
                        train_interactions = user_to_product_interaction_train, 
                        item_features = product_to_feature_interaction,
                        num_threads = 4, check_intersections=False)
#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_with_features.mean(), 2))


time taken = 0.18 seconds
time taken = 14.00 seconds
average AUC without adding item-feature interaction = 0.64


### Requesting Products/Items Recommendation

### We need to combine the train and the test dataset into one by combining through function below

In [131]:
def combined_train_test(train, test):
    """
    
    test set is the more recent rating/number_of_order of users.
    train set is the previous rating/number_of_order of users.
    non-zero value in the test set will replace the elements in 
    the train set matrices
    """
    # initialising train dict
    train_dict = {}
    for train_row, train_col, train_data in zip(train.row, train.col, train.data):
        train_dict[(train_row, train_col)] = train_data
        
    # replacing with the test set
    
    for test_row, test_col, test_data in zip(test.row, test.col, test.data):
        train_dict[(test_row, test_col)] = max(test_data, train_dict.get((test_row, test_col), 0))
        
    
    # converting to the row
    row_element = []
    col_element = []
    data_element = []
    for row, col in train_dict:
        row_element.append(row)
        col_element.append(col)
        data_element.append(train_dict[(row, col)])
        
    # converting to np array
    
    row_element = np.array(row_element)
    col_element = np.array(col_element)
    data_element = np.array(data_element)
    
    return coo_matrix((data_element, (row_element, col_element)), shape = (train.shape[0], train.shape[1]))


### Creating a user to product interaction matrix


In [132]:
user_to_product_interaction = combined_train_test(user_to_product_interaction_train, 
                                                 user_to_product_interaction_test)

In [133]:
user_to_product_interaction

<23281x15666 sparse matrix of type '<class 'numpy.float64'>'
	with 76180 stored elements in COOrdinate format>

###  Retraining the final model with combined dataset(combining train and test dataset)
### This time using pure collaborative filtering only cause its giving better accuracy than hybrid collaborative filtering


In [142]:
# retraining the final model with combined dataset

final_model = LightFM(loss = "warp")

# fitting to combined dataset with pure collaborative filtering result

start = time.time()
#===================

final_model.fit(user_to_product_interaction,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 0.35 seconds


### Class to ask about recommendation


In [163]:
class recommendation_sampling:
    
    def __init__(self, model, items = items, user_to_product_interaction_matrix = user_to_product_interaction, 
                user2index_map = user_to_index_mapping):
        
        self.user_to_product_interaction_matrix = user_to_product_interaction_matrix
        self.model = model
        self.items = items
        self.user2index_map = user2index_map
    
    def recommendation_for_user(self, user):
        
        # getting the userindex
        
        userindex = self.user2index_map.get(user, None)
        
        if userindex == None:
            return None
        
        users = [userindex]
        
        # products already bought
        
        known_positives = self.items[self.user_to_product_interaction_matrix.tocsr()[userindex].indices]
        
        # scores from model prediction
        scores = self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1]))
        
        # top items
        
        top_items = self.items[np.argsort(-scores)]
        
        # printing out the result
        print("User %s" % user)
        print("     Known positives:")
        
        for x in known_positives[:3]:
            print("                  %s" % x)
            
            
        print("     Recommended:")
        
        for x in top_items[:3]:
            print("                  %s" % x)
            


### Calling the recommendation for a random user from the dataset using the final model

In [164]:
recom = recommendation_sampling(model = final_model)
recom.recommendation_for_user('742b67e06d144247a2b34befcc716e66')

User 742b67e06d144247a2b34befcc716e66
     Known positives:
                  BADAM 250GM
                  Nivia Black&White Deo 150Ml
     Recommended:
                  Sugar (1 kg)
                  Tata Salt (1kg)
                  Toor Dal (1 kg)
