In [0]:
!pip install mlxtend

In [0]:
import pandas as pd
import numpy as np
import time
import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity
import os
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [0]:
# Location of the raw event log. Set the EVENTS_CSV environment variable to run
# the notebook on another machine; the default is the original local path.
DATA_PATH = os.environ.get('EVENTS_CSV', '/Users/Kate/Downloads/2020-Jan.csv')
df = pd.read_csv(DATA_PATH)
df.head()

In [0]:
# Show the column labels of the frame loaded from the CSV.
df.columns

In [0]:
# Keep only the rows whose event_type is 'purchase', then preview the result.
is_purchase = df['event_type'].eq('purchase')
purchase_df = df.loc[is_purchase]
purchase_df.head()

# COSINE SIMILARITY

In [0]:
# One row per purchased (session, product) line, each counted as a single unit.
# .assign() returns a new frame, so the quantity column is not written into a
# slice of purchase_df (the chained assignment that triggers pandas'
# SettingWithCopyWarning).
orders = purchase_df[['user_session', 'product_id']].assign(product_quantity=1)
orders.head()

In [0]:
def recommendations(df,filename,rows='user_session',cols='product_id',quantity='product_quantity',top=11):
    """Build a table of the most similar products for every product.

    Products are compared through the cosine similarity of their columns in a
    sparse (rows x cols) matrix whose cells hold the summed quantity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``rows``, ``cols`` and ``quantity``.
        Records whose ``rows`` or ``cols`` value is missing are ignored.
    filename : str
        Kept for backward compatibility. It is currently unused because the
        CSV export at the end of the function is commented out.
    rows, cols, quantity : str
        Column names for the basket id, the product id and the quantity.
    top : int
        One more than the number of recommendations per product
        (``top=11`` gives ``Rec 1`` .. ``Rec 10``).

    Returns
    -------
    pandas.DataFrame
        Indexed by product id (index name ``'product_id'``), with columns
        ``Rec 1..Rec top-1`` followed by ``Score 1..Score top-1``.
        Recommendations are ordered by decreasing similarity; ties keep
        ascending product order. Slots with no candidate, or whose similarity
        is exactly 0, are NaN.
    """
    rec_cols = [f'Rec {i}' for i in range(1, top)]
    score_cols = [f'Score {i}' for i in range(1, top)]

    # Records without a basket id or product id cannot be placed in the matrix.
    interactions = df.dropna(subset=[rows, cols])
    if interactions.empty:
        # Nothing to compare: return an empty table with the expected layout
        # instead of dividing by zero in the sparsity computation below.
        df_empty = pd.DataFrame(columns=rec_cols + score_cols)
        df_empty.index.names = ['product_id']
        print('Process completed')
        return df_empty

    #Creating the sparse matrix to see the frequency products occur together
    start_time = time.time()
    row_codes, order_ids = pd.factorize(interactions[rows], sort=True)
    col_codes, product_ids = pd.factorize(interactions[cols], sort=True)
    products = list(product_ids)
    n_orders, n_products = len(order_ids), len(products)

    # Duplicate (row, col) pairs are summed when the matrix is built.
    sparse_matrix = sparse.csr_matrix(
        (interactions[quantity].to_numpy(), (row_codes, col_codes)),
        shape=(n_orders, n_products),
    )

    matrix_size = n_orders * n_products # Number of possible interactions in the matrix
    num_purchases = sparse_matrix.count_nonzero() # Number of items interacted with
    sparsity = round(100*(1 - (float(num_purchases)/matrix_size)),2) #Calculating sparsity

    print('Sparse matrix creation time: {} seconds.\nSparsity: {}'.format(round(time.time() - start_time,2), sparsity))

    #Calculating the cosine similarity of all products with each other
    start_time = time.time()
    similarities = cosine_similarity(sparse_matrix.T)
    end_time = time.time()
    print("Cosine calculation time: {} seconds.".format(round(end_time - start_time,2)))

    #Creating the table of top recommendations for every product
    start_time = time.time()

    # Row i of `sim` is column i of the similarity matrix: product i against
    # every product. Work on a copy so the diagonal can be overwritten.
    sim = np.array(np.asarray(similarities, dtype=float).T, order='C')
    np.fill_diagonal(sim, -np.inf)  # a product is never recommended for itself

    # At most n_products - 1 other products exist, and at most top - 1 slots.
    n_recs = max(min(top - 1, n_products - 1), 0)

    # Stable sort on the negated scores: descending similarity, ties in
    # ascending product order, self (-inf) always last.
    ranked = np.argsort(-sim, axis=1, kind='stable')[:, :n_recs]
    ranked_scores = sim[np.arange(n_products)[:, None], ranked]

    # Recommendations with a similarity of exactly 0 are dropped.
    keep = ranked_scores != 0
    product_lookup = np.array(products, dtype=object)

    rec_table = np.full((n_products, len(rec_cols)), np.nan, dtype=object)
    score_table = np.full((n_products, len(score_cols)), np.nan, dtype=float)
    rec_table[:, :n_recs] = np.where(keep, product_lookup[ranked], np.nan)
    score_table[:, :n_recs] = np.where(keep, ranked_scores, np.nan)

    df_match = pd.DataFrame(rec_table, index=products, columns=rec_cols)
    df_score = pd.DataFrame(score_table, index=products, columns=score_cols)

    df_new = pd.concat([df_match, df_score], axis=1)
    df_new.index.names = ['product_id']
    print('Table creation time: {} seconds'.format(round(time.time() - start_time,2)))

    #df_new.to_csv(filename)

    print('Process completed')
    return df_new

In [0]:
# Build the product-to-product recommendation table from the purchase rows.
# The filename argument is not used at the moment: the to_csv call inside
# recommendations() is commented out.
prod_recs = recommendations(orders, 'product_recs_cosine_similarity.csv')
prod_recs.head()

# APRIORI

In [0]:
# Reuses the orders dataframe created above.
# Total quantity for every (session, product) pair.
session_product_qty = orders.groupby(['user_session', 'product_id'])['product_quantity'].sum()

# Pivot products into columns, filling 0 where a session did not buy a product.
# After the final set_index, user_session is the row index (not a column).
basket = (session_product_qty
          .unstack()
          .reset_index()
          .fillna(0)
          .set_index('user_session'))

basket.head()

In [0]:
def encode_units(x):
    """Return 1 when at least one unit was bought, otherwise 0.

    Every value below 1, including fractions between 0 and 1 and NaN,
    maps to 0, so the function always returns an int and never falls
    through to an implicit None.
    """
    return 1 if x >= 1 else 0
# True where a session bought at least one unit of the product. The vectorised
# comparison gives the same booleans as mapping encode_units over every cell
# and casting to bool, without the element-wise DataFrame.applymap call
# (deprecated since pandas 2.1).
basket_sets = basket >= 1

In [0]:
# Generate frequent itemsets: product sets present in at least 0.1% of sessions
frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True)

# Generate association rules whose lift is at least 1.
# mlxtend documents num_itemsets as the number of transactions in the original
# input data, i.e. the rows of basket_sets, not the number of frequent itemsets.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets=len(basket_sets))
rules

In [0]:
# Record how many products each frequent itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Keep only the itemsets made of two or more products
has_multiple_items = frequent_itemsets['length'].ge(2)
frequent_itemsets = frequent_itemsets.loc[has_multiple_items]

# Show the remaining itemsets ordered from highest to lowest support
frequent_itemsets.sort_values('support', ascending=False)