<a href="https://colab.research.google.com/github/tpmarsha/ML2AmazonKaggle/blob/master/Purchase_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install mxnet
#!pip install d2l
import pandas as pd
import scipy.sparse as sparse
import numpy

In [None]:
# read user and items into a dataframe which is then converted into csv
# this part takes a while and is only done ONCE
# after creating csv, we can upload that into a dataframe directly

#def read_file(f):
 #   for l in open(f):
 #       yield eval(l)
#df = pd.DataFrame()

#for l in read_file("train.json"):
 #   reviewerID,itemID = l['reviewerID'],l['itemID']
 #   df = df.append({'reviewerID': reviewerID, 'itemID': itemID}, ignore_index = True)
#df.to_csv("train.csv")

In [None]:
# import data straight from csv that was created above

In [None]:
# Load the user/item purchase pairs from the CSV written by the one-off
# conversion step into a dataframe.
path = "/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/"
data = pd.read_csv(path + "train.csv")
# The first column is an unnamed leftover column; discard it.
data = data.drop(columns=data.columns[0])
# Count fully duplicated (user, item) rows. The value is only computed here;
# it is not displayed because it is not the last expression of the cell.
int(data.duplicated().sum())
# Every row of the file is an observed purchase, so flag all of them with 1.
data = data.assign(Purchased=1)
data.head()

In [None]:
# Number of (user, item) purchase rows that were loaded.
print(data.shape[0])

In [None]:
# Values that will fill the sparse matrix: one entry per purchase row.
quantity = list(data['Purchased'])
# Distinct purchased products, in order of first appearance.
products = list(data['itemID'].unique())
# Distinct customers, sorted by reviewer id.
customers = list(numpy.sort(data['reviewerID'].unique()))

len(customers)

In [None]:
# Build the user x item purchase matrix in CSR form.
# Category codes give every reviewer / item a dense integer position that is
# used as the row / column coordinate of the corresponding purchase.
cols = data['itemID'].astype('category').cat.codes
rows = data['reviewerID'].astype('category').cat.codes
purchases_sparse = sparse.csr_matrix(
    (quantity, (rows, cols)),
    shape=(len(customers), len(products)),
)
purchases_sparse

In [None]:
# Store the category codes of items and reviewers next to their ids so that
# ids can later be translated into matrix positions.
data = data.assign(
    item_indices=data['itemID'].astype('category').cat.codes,
    reviewer_indices=data['reviewerID'].astype('category').cat.codes,
)
data.head()

In [None]:
# Measure how sparse the purchase matrix is.
# Total number of user-item cells in the matrix.
matrix_size = purchases_sparse.shape[0] * purchases_sparse.shape[1]
# Number of cells that hold an actual interaction.
num_purchases = purchases_sparse.nonzero()[0].size
# Percentage of cells without any interaction (extremely sparse).
sparsity_original = 100 * (1 - (num_purchases / matrix_size))
sparsity_original

# from d2l textbook: A viable solution is to use additional side information such as user/item features to alleviate the sparsity

# Two independent copies of the full matrix: the training copy gets altered
# later, the test copy stays as the untouched reference.
training_set, test_set = purchases_sparse.copy(), purchases_sparse.copy()

In [None]:
# Collect every (row, column) coordinate of the training matrix that holds an
# interaction, as a list of pairs.
nonzero_inds = training_set.nonzero()
nonzero_pairs = list(zip(*nonzero_inds))
len(nonzero_pairs)

In [None]:
# Hold out a random 20% of the observed user-item interactions: they are
# removed from the training matrix, while the untouched test matrix still
# contains them and therefore serves as ground truth.
import random
num_samples = int(numpy.ceil(0.20*len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples) # interactions chosen for the hold-out
user_inds = [index[0] for index in samples] # row (user) index of every held-out pair
item_inds = [index[1] for index in samples] # column (item) index of every held-out pair
training_set[user_inds, item_inds] = 0 # blank out the held-out interactions
# Assigning 0 into a CSR matrix keeps the entries as stored explicit zeros.
# Drop them so that the held-out pairs are truly absent from the structure
# that is later scaled and handed to the model.
training_set.eliminate_zeros()

# Users that lost at least one interaction; evaluation iterates over them.
users_altered = list(set(user_inds))

### Training

In [None]:
# Matrix factorization with alternating least squares: learns one feature
# vector per user and per item, which are later combined to score items.
# The hyperparameters below were tuned by hand against the training loss:
# the loss could not be extracted from the progress widget, so instead of a
# grid search the cell was simply re-run several times.

#!pip install implicit
import implicit

# Multiplier applied to the purchase matrix before fitting.
alpha_val = 50

model = implicit.als.AlternatingLeastSquares(
    factors=5,
    regularization=150,
    iterations=50,
    num_threads=1,
    calculate_training_loss=True,
)

# Scale the masked training matrix and cast it to double precision.
data_conf = (training_set * alpha_val).astype('double')
model.fit(data_conf)



In [None]:
# Pull the learned factor matrices out of the model. The dot product of a user
# row with the item matrix gives one score per item; a higher score means the
# pair is more strongly predicted to interact.
# NOTE(review): the factors are read crossed on purpose or by accident --
# model.item_factors is used as the user matrix and model.user_factors as the
# item matrix. Presumably this matches an implicit release whose fit() expects
# an item-user matrix; confirm against the installed implicit version.
user_vecs = model.item_factors
item_vecs = sparse.csr_matrix(model.user_factors.T)
# predictions[0]: sparse user factors, predictions[1]: sparse transposed item factors.
predictions = [sparse.csr_matrix(user_vecs), item_vecs]

### Testing

In [None]:
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [None]:
# Score every user that had interactions masked out of the training matrix and
# report mean recall / precision / accuracy of the top-k recommendation list
# (k = top_x_items) against the unmasked test matrix.


top_x_items = int(len(products)*0.20)  # play around with this number

recall = []
precision = []
accuracy = []

for user in users_altered: # every user that had at least one item masked
    training_row = training_set[user,:].toarray().reshape(-1) # dense training row of this user
    # Only items without a training interaction are candidates for recommendation.
    zero_inds = numpy.flatnonzero(training_row == 0)
    # Predicted scores for the candidate items (user factors . item factors).
    user_vec = predictions[0][user,:]
    pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
    # Ground truth for the same candidates: 1 where a masked purchase exists.
    actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1)

    # Flag the k candidates with the highest predicted score as recommended.
    top_indices = numpy.argsort(pred)[::-1][:top_x_items]
    pred2 = numpy.zeros(actual.shape)
    pred2[top_indices] = 1
    # The metrics are computed over ALL candidates. Cutting both vectors down
    # to their first k positions (as was done before) would keep an arbitrary
    # prefix ordered by item index, not the k best-ranked items.

    recall.append(recall_score(actual,pred2))
    precision.append(precision_score(actual,pred2))
    accuracy.append(accuracy_score(actual,pred2))

print('mean recall at k: %f, mean precision at k: %f, mean accuracy at k: %f'% (numpy.mean(recall),
                                                                                numpy.mean(precision),
                                                                                numpy.mean(accuracy)))
      

In [None]:
# Item indices of the 3000 most purchased items: sum the purchase flag per
# item, keep the 3000 largest totals and return their indices as a list.
pop_items = (
    data.groupby('item_indices', as_index=False)['Purchased']
    .sum()
    .nlargest(3000, 'Purchased')['item_indices']
    .to_list()
)

### Predictions

In [None]:
# Load the pairs to predict and split the combined "reviewerID-itemID" key
# into its two parts.
test_data = pd.read_csv("/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/pairs_Purchase.txt")
test_data = test_data['reviewerID-itemID'].str.split("-", expand=True)
test_data.columns = ['reviewerID', 'itemID']

# Lookup tables built from the training dataframe:
# data2 maps reviewerID -> reviewer_indices, data3 maps itemID -> item_indices.
data2 = data.drop(columns=['itemID', 'Purchased', 'item_indices']).drop_duplicates()
data3 = data.drop(columns=['reviewerID', 'Purchased', 'reviewer_indices']).drop_duplicates()

# Attach both indices to every test pair; the left joins keep pairs whose
# reviewer or item has no match in the lookup tables.
test_data = (
    test_data[['reviewerID', 'itemID']]
    .merge(data2[['reviewerID', 'reviewer_indices']], on='reviewerID', how='left')
    .merge(data3[['itemID', 'item_indices']], on='itemID', how='left')
)

# A reviewer or item that never occurs in the training data has no index (NaN)
# after the joins; mark those cases with the placeholder value 123456.
test_data = test_data.fillna({"reviewer_indices": 123456, "item_indices": 123456})

test_data.head()

In [None]:
# Produce one prediction per test pair.
# Two strategies were tried for reviewers that have no index (placeholder
# 123456): 1) always predict 0, or 2) predict 1 when the item is in pop_items.
# Strategy 1 scored better and is the one implemented here.

test_pred_boolean = []  # final 0/1 prediction per pair
test_pred = []          # raw interaction score per pair (user vector . item vector)
test_user_item_indices = list(zip(test_data['reviewer_indices'], test_data['item_indices']))

for user, item in test_user_item_indices:
    # Pairs whose reviewer or item carries the placeholder index cannot be
    # scored by the model: predict "no purchase" with a score of 0.
    if int(user) == 123456 or int(item) == 123456:
        test_pred_boolean.append(0)
        test_pred.append(0)
        continue

    # Scores of this reviewer against every item.
    user_vec = predictions[0][int(user), :]
    pred = user_vec.dot(item_vecs).toarray().reshape(-1)
    test_pred.append(pred[int(item)])

    # The pair is predicted as a purchase when the item ranks among the
    # top_x_items highest scoring items of this reviewer.
    top_indices = numpy.argsort(pred)[::-1][:top_x_items]
    pred2 = numpy.zeros(pred.shape)
    pred2[top_indices] = 1
    test_pred_boolean.append(pred2[int(item)])

In [None]:
# Sanity check: number of interaction scores collected by the prediction loop.
len(test_pred)

In [None]:
# Sanity check: number of 0/1 predictions collected by the prediction loop.
len(test_pred_boolean)

In [None]:
# Attach the results to the test dataframe: the 0/1 prediction (cast to int)
# and the raw interaction score of every pair.
test_data = test_data.assign(
    prediction=numpy.array(test_pred_boolean, dtype=int),
    interaction_score=numpy.array(test_pred),
)

test_data.head(10)

In [None]:
# potentially set a cut-off value to determine prediction, in addition to top x%
# if interaction score is >= avg interaction score, set prediction to 1
# avg_pred = sum(test_pred) / len(test_pred)
# mask = test_data['interaction_score'] >= avg_pred
# test_data['prediction'][mask] = 1

In [None]:
# Shape the results like the kaggle submission file: rebuild the combined
# "reviewerID-itemID" key and keep only the key and the prediction.
test_data['reviewerID-itemID'] = test_data['reviewerID'].str.cat(test_data['itemID'], sep="-")
submission_columns = ['reviewerID-itemID', 'prediction']
predictions_upload = test_data[submission_columns]
predictions_upload.head()

In [None]:
# Number of submission rows per predicted value.
predictions_upload.groupby('prediction').count()

In [None]:
# Write the submission file: comma separated, with a header row, without the
# dataframe index, and with "\n" line endings.

path = "/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/"

output_file = path + "predictions_test.txt"
csv_options = {"index": False, "header": True, "sep": ","}

# pandas 1.5 renamed the `line_terminator` keyword to `lineterminator` and
# pandas 2.0 removed the old spelling. Try the current name first and fall
# back to the old one, so the export works on either side of the rename.
try:
    predictions_upload.to_csv(output_file, lineterminator="\n", **csv_options)
except TypeError:
    predictions_upload.to_csv(output_file, line_terminator="\n", **csv_options)