In [1]:
import pandas as pd
import psycopg2
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import sys
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import pickle
import csv

'''这一块代码是数据处理'''
# (The note above reads: "This block of code does the data processing".)
# Load the raw interaction log; the path is relative to the notebook's working directory.
retail_data = pd.read_csv('train.csv')

retail_data['user_id'] = retail_data.user_id.astype(int)
data = retail_data[['user_id', 'item_id', 'Occurrence']] # Keep only the columns needed for the matrix

# One row per distinct (item_id, cat_id) pair, with item_id cast to int so it
# compares cleanly against the integer item ids used for the matrix columns.
item_lookup = (
    retail_data[['item_id', 'cat_id']]
    .drop_duplicates()
    .assign(item_id=lambda frame: frame.item_id.astype(int))
)

# Sum Occurrence per (user, item) pair.
grouped_cleaned = data.groupby(['user_id', 'item_id']).sum().reset_index()
# Pairs whose Occurrence sums to exactly 0 are kept with a value of 1.
# A single .loc[mask, column] assignment is used because the chained form
# (frame.col.loc[mask] = ...) may write to a temporary copy and be lost.
grouped_cleaned.loc[grouped_cleaned.Occurrence == 0, 'Occurrence'] = 1
# Drop pairs whose summed Occurrence is negative.
grouped_purchased = grouped_cleaned.query('Occurrence > 0')

customers = list(np.sort(grouped_purchased.user_id.unique())) # Unique customers, sorted; defines row order
products = list(grouped_purchased.item_id.unique()) # Unique items in first-seen order; defines column order
quantity = list(grouped_purchased.Occurrence)

# Map each id to its position in `customers` / `products`.
# `Series.astype('category', categories=...)` is not accepted by current pandas,
# so the codes are built with pd.Categorical instead.
rows = pd.Categorical(grouped_purchased.user_id, categories=customers).codes # Row index of every interaction
cols = pd.Categorical(grouped_purchased.item_id, categories=products).codes # Column index of every interaction
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of (user, item) cells that are non-zero
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity  # percentage of empty cells; the recorded run shows about 99.98%

  return f(*args, **kwds)
  return f(*args, **kwds)
  """)


99.97700623073791

In [2]:
# Array versions of the id lists: position i in customers_arr / products_arr is
# the id behind row i / column i of purchases_sparse.
customers_arr, products_arr = np.array(customers), np.array(products)
customers_arr.shape[0]

166190

In [3]:
# Load the evaluation records from a pickle file.
# Each record is a list of three integers (see the preview below); downstream
# code reads element 0 as the customer id and element 1 as the item id.
# NOTE(review): the third element is presumably the item's cat_id -- confirm.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open files that come from a trusted source.
with open ('test.txt', 'rb') as fp:
    test_set = pickle.load(fp) # In the "explicit with cmf" notebook this variable is named product_users_altered
test_set[:5]

[[144751, 294076, 1573],
 [35363, 403762, 7577],
 [35363, 159310, 7577],
 [392690, 981776, 4073],
 [392690, 1086021, 4073]]

In [5]:
alpha = 2
'''
当alpha=2时命中率最高，为27.9%
由于数据量很大，所以建议放在服务器上跑
'''
# Translation of the note above: the hit rate is highest (27.9%) when alpha = 2;
# because the data set is large, running this on a server is recommended.

# Hyper-parameters for the factorization, gathered in one place.
als_params = dict(factors=40, regularization=0.1, iterations=40)

# Factorize the alpha-weighted interaction matrix (cast to double precision).
# The recorded run logs that this helper is deprecated in favour of the
# AlternatingLeastSquares class.
user_vecs, item_vecs = implicit.alternating_least_squares(
    (purchases_sparse*alpha).astype('double'),
    **als_params
)
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    '''
    Return the top recommended items for one customer.

    Items are ranked by the dot product of the customer's vector with every
    item vector. Items the customer already interacted with in mf_train are
    removed from the candidates before the top num_items are taken, so they
    can never be returned, even when num_items exceeds the number of
    remaining candidates (in that case fewer rows are returned).

    parameters:

    customer_id - The id of the customer to get recommendations for

    mf_train - The sparse training matrix used for matrix factorization fitting
               (rows follow customer_list, columns follow item_list)

    user_vecs - the user vectors from the fitted matrix factorization

    item_vecs - the item vectors from the fitted matrix factorization

    customer_list - array-like of customer ids that make up the rows of the matrix
                    (in order of matrix)

    item_list - array-like of item ids that make up the columns of the matrix
                    (in order of matrix)

    item_lookup - pandas dataframe with 'item_id' and 'cat_id' columns; when an
                  item_id appears more than once, its first row is used

    num_items - The number of items to recommend, best first. Default is 10.

    returns:

    - A dataframe with columns ['item_id', 'cat_id'], ordered from best to
      worst recommendation, containing only items not yet interacted with

    raises:

    - IndexError if customer_id is not in customer_list, or if a recommended
      item has no row in item_lookup
    '''
    customer_ids = np.asarray(customer_list)
    item_ids = np.asarray(item_list)

    matches = np.flatnonzero(customer_ids == customer_id)
    if matches.size == 0:
        raise IndexError('customer_id %r is not present in customer_list' % (customer_id,))
    cust_ind = matches[0] # Row of this customer in the matrix

    # True for every item the customer already has a positive entry for
    purchased = mf_train[cust_ind, :].toarray().reshape(-1) > 0
    # Predicted preference for every item
    scores = np.asarray(user_vecs[cust_ind, :].dot(item_vecs.T)).reshape(-1)

    # Rank all items best-first (stable, so ties resolve deterministically),
    # then drop the ones already purchased. Filtering with a mask instead of
    # multiplying scores by zero keeps purchased items from tying with the
    # lowest-scoring candidate and leaking into the result.
    ranked = np.argsort(-scores, kind='stable')
    top_idx = ranked[~purchased[ranked]][:num_items]
    codes = item_ids[top_idx]

    # Build the item_id -> cat_id mapping once instead of scanning item_lookup
    # for every recommended item.
    cat_by_item = item_lookup.drop_duplicates(subset='item_id').set_index('item_id')['cat_id']
    known = pd.Index(codes).isin(cat_by_item.index)
    if not known.all():
        raise IndexError('no item_lookup row for item ids: %r' % (list(codes[~known]),))
    categories = cat_by_item.loc[codes].to_numpy()

    final_frame = pd.DataFrame({'item_id': codes, 'cat_id': categories})
    return final_frame[['item_id', 'cat_id']]

# total_rec = rec_items(, product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,num_items = 100)

def accuracy(item, _list):
    '''Return 1 when item is contained in _list, otherwise 0.'''
    return int(item in _list)

def acc_topN_with_cat(altered_pair, num, num_candidates = 30000):
    '''
    Hit rate of the per-category top-N recommendations over a test set.

    For every record, the customer's best num_candidates recommendations are
    computed with rec_items, the first num items of each cat_id are kept, and
    the record counts as a hit when its item id is among them.

    parameters:

    altered_pair - sequence of records; element 0 of a record is the customer
                   id and element 1 is the item id to look for

    num - how many items to keep per category

    num_candidates - how many recommendations to request from rec_items before
                     grouping by category. Default is 30000.

    returns:

    - hits / number of records as a float; 0.0 when altered_pair is empty

    Uses the notebook-level purchases_sparse, user_vecs, item_vecs,
    customers_arr, products_arr and item_lookup.
    '''
    total = len(altered_pair)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # Recommendations depend only on the customer, so compute them once per
    # customer even when the test set lists that customer several times.
    top_by_customer = {}
    count = 0
    for record in altered_pair:
        customer_id, target_item = record[0], record[1]
        if customer_id not in top_by_customer:
            total_rec = rec_items(customer_id, purchases_sparse, user_vecs, item_vecs, customers_arr, products_arr, item_lookup, num_items = num_candidates)
            # A set gives constant-time membership checks
            top_by_customer[customer_id] = set(total_rec.groupby('cat_id').head(num).item_id)
        count = count + accuracy(target_item, top_by_customer[customer_id])
    return count/total

# Evaluate on the loaded test set, keeping the five best items per category.
topN_acc = acc_topN_with_cat(test_set, num=5)
print('the accuracy is:', topN_acc)

This method is deprecated. Please use the AlternatingLeastSquares class instead
100%|██████████| 40.0/40 [00:43<00:00,  1.23s/it]


KeyboardInterrupt: 