In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Define Classes: Recommender and Data

In [62]:
class Recommender:
    def __init__(self, data, user_col, item_cols, cf_method='item', similarity='pearson'):
        '''init Recommender class'''
        self.data = data
        self.user_col = user_col
        self.item_cols = item_cols
        self.cf_method = cf_method
        self.similarity = similarity
        self.similarity_matrix = []
        self.user_scores = []
        self.recs = []

    def create_similarity_matrix(self):
        '''creates correlation/similarity matrix for all items and stores results in self.similarity_matrix'''
        print('creating matrix...')
        self.similarity_matrix = self._create_empty_df(self.cf_method)
        self._fill_similarity_matrix(self.similarity_matrix, self.similarity)

    def score_users(self, users=None):
        '''generates item ratings for each item for each user and stores result as self.user_scores'''
        'generate user scores for entirely new dataframe. (batch creation)'
        if not users:
            # grab all users in data by default
            users = self.data.loc[:,self.user_col]
        cols = [self.user_col] + list(self.item_cols)
        user_data = self.data.loc[:,cols].set_index(self.user_col)
        self.user_scores = pd.DataFrame(index=users, columns=self.item_cols)
        self.user_scores = self.data[self.item_cols].dot(self.similarity_matrix)
        self.user_scores[self.user_col]=users
        self.user_scores= self.user_scores.set_index(self.user_col)
        print(self.user_scores)


    def score_new_users(self, users, user_data):
        '''generates item ratings for users passed in from external data set and stores result as self.user_scores'''
        #adding new users to an existing dataframe which we built in 'score_users'
        cols = [self.user_col] + list(self.item_cols)
        self.user_scores = pd.DataFrame(index=user_data.index, columns=self.item_cols)
        self.user_scores = user_data.loc[self.item_cols].dot(self.similarity_matrix)      
                
    def generate_recs(self, users=None, num_recs=3):
        '''generates top num_rec recommendations for users and stores result as self.recs'''
        print('We hope you will find these recommended items to your satisfaction should you purchase in the future!\nThank you for shopping with us')
        if not users:
            # grab all users in data by default
            users = self.data.loc[:,self.user_col]
        cols = ['Rec ' + str(x) for x in range(1,num_recs+1)] + ['Score ' + str(x) for x in range(1,num_recs+1)]
        self.recs = pd.DataFrame(index=users, columns=cols)
        #create dataframe that will list the recommendations by each row, labeled by Member_number
        progress_bar = tqdm(total = len(users), mininterval=10)
        for user in users:
            progress_bar.update()
            sorted_items = self.user_scores.sort_values(by=user, ascending=False, axis=1).loc[user,:].index
            for i in range(num_recs):
                item = sorted_items[i]
                item_col = cols[i]
                score_col = cols[i+num_recs]
                self.recs.loc[user, item_col] = item
                self.recs.loc[user, score_col] = self.user_scores.loc[user, item]
        self.recs.reset_index(inplace=True, drop=False)
       
        
    def print_recs(self):
        print(self.recs)
        
    def save_recs(self, filename='recommendations', format='excel'):
        '''saves self.recs to filename in specified format'''
        if format == 'excel':
            extension ='.xlsx'
            self.recs.to_excel(filename + extension, index=False)
        elif format == 'csv':
            extension += '.csv'
            self.recs.to_csv(filename + extension, index=False)
        else:
            raise ValueError('Invalid file format.  Please specify "excel" or "csv".')
  
    def _create_empty_df(self, cf_type):
        '''creates and returns empty df with users or items as rows and columns'''
        if cf_type == 'item':
            labels = self.item_cols
        elif cf_type == 'user':
            labels = self.data[user_col]
        else:
            raise ValueError('Invalid collaborative filtering type.  Please specify "item" or "user".')
        return pd.DataFrame(index=labels, columns=labels)

    def _fill_similarity_matrix(self, similarity_matrix, similarity):
        '''calculates correlation between items using specified similarity and saves results in NxN similarity_matrix
           3 similarity types: jaccard, pearson, cosine. Jaccard will be used in this case project.'''
        progress_bar = tqdm(total = similarity_matrix.shape[0], mininterval=5)
        item_df = self.data[self.item_cols]   
        for i in range(similarity_matrix.shape[0]):
            progress_bar.update()
            similarity_matrix.iloc[i,i] = 1.0
            #this reduces the computation time in get_similarity
            x = item_df.iloc[:,i]
            for j in range(i,similarity_matrix.shape[1]):
                y = item_df.iloc[:,j]
                similarity_matrix.iloc[i,j] = self._get_similarity(x, y, similarity)
                similarity_matrix.iloc[j,i] = similarity_matrix.iloc[i, j]
                '''each x column represents an item column. so when we perform jaccard similarity, the union and intersection calculated
        each time is with each pairing of columns, or pairing of vectors. Skip same item pairings with correlation of 1'''

                
    def _get_similarity(self, x, y, similarity):
        '''find similarity metric and determine correlation between two vectors and return result'''
        if similarity == 'pearson':
            return self._pearson_similarity(x, y)
        elif similarity == 'jaccard':
            return self._jaccard_similarity(x, y)
        elif similarity == 'cosine':
            return self._cosine_similarity(x, y)
        else:
            raise ValueError('Invalid similarity type.  Please specify "cosine", "pearson", or "jaccard".')
        
    def _pearson_similarity(self, x, y):
        #effective if data can be transformed to normal distribution 
        pearson_sim= np.corrcoef(x,y)
        return pearson_sim

    def _jaccard_similarity(self, x, y):
        '''when we calculate using 'len', we see how often each pairing of items come up in each basket transaction 
        (same row same column).
        This means the 2 items are complimentary goods, and will have higher similarities. 
        Ideal for binary data, e.g. buy vs non-buy '''        
        intersect_len=len(set(list(x)).intersection(list(y)))
        union_len= len(list(x)) + len(list(y))- intersect_len
        if union_len== 0:
            return 0
        else:
            return float(intersect_len)/union_len

    def _cosine_similarity(self, x, y):
        #returns cosine of angles between x and y
        cos_sim= dot(x,y)/(norm(x)*norm(y))
        return cos_sim

In [58]:
class Data:
    def __init__(self): 
        '''init Data class'''
        self.data= None
    
    def load_data(self, filename, format='txt'):
        '''loads data from excel, csv, tsv, or txt file'''
        if format == 'excel':
            self.data = pd.read_excel(filename)
        elif format == 'csv':
            self.data = pd.read_csv(filename)
        elif format == 'tsv':
            self.data = pd.read_csv(filename, sep='\t')
        elif format == 'txt':
            self.data = pd.read_table(filename)
        else:
            raise ValueError('Invalid file format.  Please specify "excel", "csv", "tsv" or "txt".')
        
 
    def drop_small_orders(self, order_col='Member_number', min_order_size=None):
        '''drop orders from self.data that have less than min_order_size unique items in basket'''
        self.data=self.data[self.data.groupby(order_col)['itemDescription'].transform('count')>=min_order_size]
        
    def expand_columns(self, columns=[]):
        #computes one-hot encoding on specified columns and appends them to self.data
        dfs=[]
        dfs.append(self.data)
        for col in columns:
            dfs.append(pd.get_dummies(self.data[col], prefix=None, sparse=False))
        self.data = pd.concat(dfs, axis=1)
        
    def drop_columns(self, columns=[]):
        #drops columns from self.data
        self.data.drop(columns, axis=1, inplace=True)
        return self.data
        
    def consolidate_orders(self, order_col='Member_number'):
        #consolidates each order in self.data into single record.order number is maintained and all other columns summed.'''
        self.data = self.data.groupby(order_col).sum().reset_index()
        return self.data
    
    def get_columns(self):
        data_columns= list(self.data.columns)
        return data_columns
    
    def check_duplicates(self):
        #check at the end after data consolidation
        unique_rows= self.data.nunique()[0]
        generic_rows= self.data.count()[0]
        if unique_rows==generic_rows:
            print('Data is good to go! No duplicates in unique identifier column')
        else:
            if generics_rows>unique_rows:
                print('Duplicates found: {}'.format(generic_rows-unique_rows))
            else:
                print('Invalid data. Unique rows are greater than total rows. Please check data again')
       
        

# Define Variables

In [59]:
run_rec_engine=True
user_col='Member_number'
item_cols=data.get_columns()
item_cols.remove(user_col)

# Prepare Data

In [60]:
#Drop orders with few items, one-hot encode itemDescription category information, drop unnecessary columns, 
# and consolidate unique orders into single records
data = Data()
data.load_data('groceries_dataset.csv', format='csv')
data.drop_small_orders(order_col='Member_number', min_order_size=1)
data.expand_columns(['itemDescription'])  
data.drop_columns(['Date'])
data.consolidate_orders(order_col='Member_number')
data.check_duplicates()


Data is good to go! No duplicates in unique identifier column


# Run Recommender Methods

In [63]:
if run_rec_engine:
    rec_engine = Recommender(data.data, user_col=user_col, item_cols=item_cols, cf_method='item', similarity='jaccard')
    rec_engine.create_similarity_matrix()
    rec_engine.score_users()
    rec_engine.generate_recs()
    rec_engine.print_recs()
    rec_engine.save_recs()

  0%|                                                                                          | 0/167 [00:00<?, ?it/s]

creating matrix...


100%|████████████████████████████████████████████████████████████████████████████████| 167/167 [00:18<00:00,  8.97it/s]
  0%|                                                                                         | 0/3898 [00:00<?, ?it/s]

              Instant food products  UHT-milk abrasive cleaner  \
Member_number                                                    
1000                       0.003336  0.006288         0.003336   
1001                       0.003079  0.006032         0.003079   
1002                       0.002053  0.003722         0.002053   
1003                       0.002053  0.003593         0.002053   
1004                       0.005389  0.009625         0.005389   
...                             ...       ...              ...   
4996                       0.002566   0.00462         0.002566   
4997                        0.00154  0.002823          0.00154   
4998                       0.000513  0.001027         0.000513   
4999                       0.004106   0.00693         0.004106   
5000                       0.001796  0.003337         0.001796   

              artif. sweetener baby cosmetics      bags baking powder  \
Member_number                                                       

100%|██████████████████████████████████████████████████████████████████████████████| 3898/3898 [01:21<00:00, 47.94it/s]

      Member_number             Rec 1             Rec 2           Rec 3  \
0              1000        whole milk       brown bread            soda   
1              1001        whole milk       brown bread            soda   
2              1002        whole milk  other vegetables     brown bread   
3              1003       brown bread              soda  tropical fruit   
4              1004        whole milk  other vegetables     brown bread   
...             ...               ...               ...             ...   
3893           4996       brown bread              soda  tropical fruit   
3894           4997        whole milk       brown bread            soda   
3895           4998       brown bread              soda  tropical fruit   
3896           4999  other vegetables       brown bread            soda   
3897           5000  other vegetables       brown bread            soda   

       Score 1   Score 2   Score 3  
0     0.007187   0.00693   0.00693  
1     0.007059  0.006802 


