<a href="https://colab.research.google.com/github/tpmarsha/ML2AmazonKaggle/blob/master/Purchase_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [113]:
#!pip install mxnet
#!pip install d2l
import pandas as pd
from mxnet import gluon, np, npx, autograd
from mxnet.gluon import nn
import d2l
import mxnet as mx
import scipy.sparse as sparse
import numpy
npx.set_np()

In [114]:
# read user and items into a dataframe which is then converted into csv
# this part takes a while and is only done ONCE
# after creating csv, we can upload that into a dataframe directly

#def read_file(f):
 #   for l in open(f):
 #       yield eval(l)
#df = pd.DataFrame()

#for l in read_file("train.json"):
 #   reviewerID,itemID = l['reviewerID'],l['itemID']
 #   df = df.append({'reviewerID': reviewerID, 'itemID': itemID}, ignore_index = True)
#df.to_csv("train.csv")

In [115]:
#from google.colab import drive
#drive.mount('/content/drive')

In [116]:
# now we can upload csv straight into dataframe
#root_path = '/content/drive/My Drive/Team 3 Kaggle Competition ML2/'
#data = pd.read_csv(root_path+"train_PurchasePrediction.csv")
data = pd.read_csv("train.csv")
data = data.drop(data.columns[0], axis=1)  # drop the unnamed column
# check to see if there are any duplicate users + items
len(data[data.duplicated()])
# add a column to indicate item was purchased
data['Purchased'] = 1
data.head(5)

Unnamed: 0,itemID,reviewerID,Purchased
0,I402344648,U490934656,1
1,I697650540,U714157797,1
2,I464613034,U507366950,1
3,I559560885,U307862152,1
4,I476005312,U742726598,1


In [117]:
# to speed things up, working with a subset of rows
#data = data.drop(data.index[5000:])
print(len(data))

200000


In [118]:
customers = list(numpy.sort(data.reviewerID.unique())) # Get our unique customers
products = list(data.itemID.unique()) # Get our unique products that were purchased
quantity = list(data.Purchased) # All of our purchases

In [119]:
# create a sparse matrix
rows = data.reviewerID.astype('category').cat.codes 
cols = data.itemID.astype('category').cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))
purchases_sparse

<39239x19914 sparse matrix of type '<class 'numpy.longlong'>'
	with 200000 stored elements in Compressed Sparse Row format>

In [120]:
# check the sparsity
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity_original = 100*(1 - (num_purchases/matrix_size))
sparsity_original  # extremely sparse

# from d2l textbook: A viable solution is to use additional side information such as user/item features to alleviate the sparsity.

99.97440509264123

In [121]:
training_set = purchases_sparse.copy() # Make a copy of the original data we can alter as our training set
test_set = purchases_sparse.copy() # Make a copy of the original set to be the test set

In [122]:
# identify all the interactions within the sparse matrix and zip them together
nonzero_inds = training_set.nonzero()
nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) 
len(nonzero_pairs)

200000

In [123]:
#split for train-test with 20% as test
import random
num_samples = int(np.ceil(0.20*len(nonzero_pairs))) # round the number of samples needed to the nearest integer
samples = random.sample(nonzero_pairs, num_samples) # randomly select 20% to be in the test set
len(samples)

40000

In [124]:
# for the 20% of users in the test, "mask" them in the training set (change all their purchase history to 0's)
user_inds = [index[0] for index in samples] # Get the user row indices
item_inds = [index[1] for index in samples] # Get the item column indices
training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero

In [125]:
# to check this happened, compare sparsity between training_set and test_set
# sparsity of training set
matrix_size = training_set.shape[0]*training_set.shape[1] # Number of possible interactions in the matrix
num_purchases = len(training_set.nonzero()[0]) # Number of items interacted with
sparsity_train = 100*(1 - (num_purchases/matrix_size))
# sparsity of test set
matrix_size = test_set.shape[0]*test_set.shape[1] # Number of possible interactions in the matrix
num_purchases = len(test_set.nonzero()[0]) # Number of items interacted with
sparsity_test = 100*(1 - (num_purchases/matrix_size))

# print sparsity of original dataset, train, and test sets
print(sparsity_original, sparsity_train, sparsity_test)

99.97440509264123 99.97952407411299 99.97440509264123


In [126]:
# store users who were masked from training in a list
users_altered = list(set(user_inds))

In [127]:
#!pip install implicit
import implicit

In [128]:
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((training_set*alpha).astype('double'), 
                                                           factors=20, 
                                                           regularization = 0.1, 
                                                           iterations = 50)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [129]:
#implicit.lmf.LogisticMatrixFactorization(factors=20, regularization=0.1, iterations=20)
#alpha_val = 15
#data_conf = (training_set * alpha_val).astype('double')
#model.fit(data_conf)

In [130]:
predictions = [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)]

predictions

[<39239x20 sparse matrix of type '<class 'numpy.float32'>'
 	with 784780 stored elements in Compressed Sparse Row format>,
 <20x19914 sparse matrix of type '<class 'numpy.float32'>'
 	with 398280 stored elements in Compressed Sparse Row format>]

In [131]:
from sklearn import metrics

def auc_score(predictions, test):
    '''
    This simple function will output the area under the curve using sklearn's metrics. 
    
    parameters:
    
    - predictions: your prediction output
    
    - test: the actual target result you are comparing to
    
    returns:
    
    - AUC (area under the Receiver Operating Characterisic curve)
    '''
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)   

In [132]:
item_vecs = predictions[1]

In [133]:
store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
popularity_auc = [] # To store popular AUC scores
pop_items = numpy.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular

for user in users_altered: # Iterate through each user that had an item altered
    training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
    zero_inds = numpy.where(training_row == 0) # Find where the interaction had not yet occurred
    # Get the predicted values based on our user/item vectors
    user_vec = predictions[0][user,:]
    pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
    # Select all ratings from the MF prediction for this user that originally had no iteraction
    actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
    store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
    # popular items
    pop = pop_items[zero_inds] # Get the item popularity for our chosen items
    popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score

In [134]:
numpy.mean(store_auc), numpy.mean(popularity_auc)

(0.6833485756455943, 0.7063826850071151)