<a href="https://colab.research.google.com/github/tpmarsha/ML2AmazonKaggle/blob/master/Purchase_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install mxnet
#!pip install d2l
#!pip install implicit
import pandas as pd
import scipy.sparse as sparse
import numpy
import random
import implicit
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_curve

In [2]:
# read user and items into a dataframe which is then converted into csv
# this part takes a while and is only done ONCE
# after creating csv, we can upload that into a dataframe directly

#def read_file(f):
 #   for l in open(f):
 #       yield eval(l)
#df = pd.DataFrame()

#for l in read_file("train.json"):
 #   reviewerID,itemID = l['reviewerID'],l['itemID']
 #   df = df.append({'reviewerID': reviewerID, 'itemID': itemID}, ignore_index = True)
#df.to_csv("train.csv")

In [3]:
# import data straight from csv that was created above

In [4]:
# now we can upload csv straight into dataframe
path = "/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/"
data = pd.read_csv(path+"train.csv")
data = data.drop(data.columns[0], axis=1)  # drop the unnamed column
# check to see if there are any duplicate users + items
len(data[data.duplicated()])
# add a column to indicate item was purchased
data['Purchased'] = 1
data.head(5)

Unnamed: 0,itemID,reviewerID,Purchased
0,I402344648,U490934656,1
1,I697650540,U714157797,1
2,I464613034,U507366950,1
3,I559560885,U307862152,1
4,I476005312,U742726598,1


In [5]:
print(len(data))

200000


In [6]:
users = list(numpy.sort(data.reviewerID.unique())) # list of unique users
items = list(data.itemID.unique()) # list of unique items
quantity = list(data.Purchased) # purchases

In [7]:
# create a sparse matrix
rows = data.reviewerID.astype('category').cat.codes 
cols = data.itemID.astype('category').cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(items)))
purchases_sparse

<39239x19914 sparse matrix of type '<class 'numpy.longlong'>'
	with 200000 stored elements in Compressed Sparse Row format>

In [8]:
# assign indices to item and reviewer and add them to dataframe
data['item_indices'] = data.itemID.astype('category').cat.codes
data['reviewer_indices'] = data.reviewerID.astype('category').cat.codes
data.head(5)

Unnamed: 0,itemID,reviewerID,Purchased,item_indices,reviewer_indices
0,I402344648,U490934656,1,7898,19395
1,I697650540,U714157797,1,13783,28196
2,I464613034,U507366950,1,9177,20029
3,I559560885,U307862152,1,11096,12131
4,I476005312,U742726598,1,9409,29299


In [9]:
# check the sparsity
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1]
num_purchases = len(purchases_sparse.nonzero()[0])  
100*(1 - (num_purchases/matrix_size)) # calculate sparsity

99.97440509264123

In [10]:
training_set = purchases_sparse.copy() # copy of original sparse matrix as training set
test_set = purchases_sparse.copy() # copy of original sparse matrix as testing set

In [11]:
# identify all non-zero (purchased) user-item pairs from training_set and zip them together
nonzero_indices = training_set.nonzero()
nonzero_pairs = list(zip(nonzero_indices[0], nonzero_indices[1])) # equal to 200,000

In [12]:
# randomly select 20% of users-item pairs with a purchase, mask their item purchases (turn label to 0's) in training data, save them for testing
testing_sample = int(numpy.ceil(0.20*len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, testing_sample) # randomly select 20% of user-item pairs
user_indices = [index[0] for index in samples] # Get the user row indices
item_indices = [index[1] for index in samples] # Get the item column indices
training_set[user_indices, item_indices] = 0 # Assign all of the randomly chosen user-item pairs to zero
training_set.eliminate_zeros()

In [13]:
# store unique list of users saved for testing
testing_users = list(set(user_indices))
# find unique list of users that were not saved for testing (to be used during training)
training_users = list(set(nonzero_indices[0]) - set(testing_users))

In [14]:
# identify top items by item index
pop_items = data[['item_indices', 'Purchased']]
pop_items = pop_items.groupby('item_indices').sum().reset_index()
pop_items = pop_items.nlargest(3000, 'Purchased')   # most popular items
pop_items = pop_items['item_indices'].to_list()

### Training

In [15]:
# apply matrix factorization using alternating least squares

model = implicit.als.AlternatingLeastSquares(factors=5, regularization=150, iterations=50,num_threads=1,calculate_training_loss=True)
alpha_val = 50
data_conf = (training_set * alpha_val).astype('double')
model.fit(data_conf)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [16]:
# generate predictions for each user-item
user_vectors = model.item_factors
item_vectors = model.user_factors
predictions = [sparse.csr_matrix(user_vectors), sparse.csr_matrix(item_vectors.T)]  # take dot product to get predictions
item_vectors = predictions[1]
user_vectors = predictions[0]

In [17]:
# calculate average prediction score across all user-item pairs
dot_prod = predictions[0].dot(item_vectors).toarray().reshape(-1)
avg_pred = dot_prod.sum() / dot_prod.size

In [18]:
# training loop
# evaluate by mean AUC, mean precision, mean recall

top_x_items = int(len(items)*0.20)  # threshold value by which to convert predictions to binary values

training_recall = []
training_precision = []
training_auc = []
pred = []
actual = []

for user in training_users:  # iterate through users who were not saved for training
    training_row = training_set[user,:].toarray().reshape(-1)
    user_vec = predictions[0][user,:]
    pred = user_vec.dot(item_vectors).toarray()[0,:].reshape(-1)
    actual = test_set[user,:].toarray()[0,:].reshape(-1)   
    fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
    training_auc.append(metrics.auc(fpr, tpr))
    
    pred2 = []
    for i in pred:
        if i >= avg_pred:
            pred2.append(1)
        else:
            pred2.append(0)
    numpy.array(pred2)

    # get top x items with highest prediction scores
    top_indices = numpy.argsort(pred)[::-1][:top_x_items]   
    pred2 = numpy.zeros(actual.shape)
    for i in top_indices:    # make top x items 1's and the rest of items as 0's
        pred2[i] = 1
    pred2 = pred2[:top_x_items]
    actual = actual[:top_x_items] 
    training_recall.append(recall_score(actual,pred2,zero_division=1))
    training_precision.append(precision_score(actual,pred2,zero_division=1))

print('mean auc: %f'% numpy.mean(training_auc))
print('mean recall: %f'% numpy.mean(training_recall))    
print('mean precision: %f'% numpy.mean(training_precision))

mean auc: 0.839772
mean recall: 0.833254
mean precision: 0.000737


### Testing

In [19]:
# testing loop
# evaluate by mean AUC, mean precision, mean recall

testing_recall = []
testing_precision = []
testing_auc = []
pred = []
actual = []

for user in testing_users:  # iterate through users saved for testing
    training_row = training_set[user,:].toarray().reshape(-1)
    zero_indices = numpy.where(training_row == 0)  # find corresponding items that were masked
    user_vec = predictions[0][user,:]
    pred = user_vec.dot(item_vectors).toarray()[0,zero_indices].reshape(-1)
    actual = test_set[user,:].toarray()[0,zero_indices].reshape(-1) # get actual values from test set
    fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
    testing_auc.append(metrics.auc(fpr, tpr)) 

    pred2 = []
    for i in pred:
        if i >= avg_pred:
            pred2.append(1)
        else:
            pred2.append(0)
    numpy.array(pred2)
    
    # get top x items with highest prediction scores
    top_indices = numpy.argsort(pred)[::-1][:top_x_items]   
    pred2 = numpy.zeros(actual.shape)
    for i in top_indices:    # make top x items 1's and the rest of items as 0's
        pred2[i] = 1 
    pred2 = pred2[:top_x_items]
    actual = actual[:top_x_items]
    testing_recall.append(recall_score(actual,pred2,zero_division=1))
    testing_precision.append(precision_score(actual,pred2,zero_division=1))

print('mean auc: %f'% numpy.mean(testing_auc))
print('mean recall: %f'% numpy.mean(testing_recall))    
print('mean precision: %f'% numpy.mean(testing_precision))

mean auc: 0.723454
mean recall: 0.869631
mean precision: 0.013576


### Predictions

In [20]:
# upload the test data
test_data = pd.read_csv("/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/pairs_Purchase.txt")
test_data = pd.DataFrame(test_data['reviewerID-itemID'].str.split("-",expand=True))
test_data.columns = 'reviewerID', 'itemID'

# find corresponding item and reviewer indices which were determined earlier
data2 = data.drop(columns = ['itemID', 'Purchased', 'item_indices'])
data2 = data2.drop_duplicates()
data3 = data.drop(columns = ['reviewerID', 'Purchased', 'reviewer_indices'])
data3 = data3.drop_duplicates()

# one dataframe of test set with reviewerid, itemid, and indices
test_data = test_data[['reviewerID', 'itemID']].merge(data2[['reviewerID', 'reviewer_indices']], on='reviewerID', how='left')
test_data = test_data[['reviewerID', 'itemID', 'reviewer_indices']].merge(data3[['itemID', 'item_indices']], on='itemID', how='left')

# if NaN (reviewer or item did not exist in the test set), assign them to a random value: '123456'
test_data["reviewer_indices"] = test_data["reviewer_indices"].fillna(123456)
test_data["item_indices"] = test_data["item_indices"].fillna(123456)

test_data.head(5)

Unnamed: 0,reviewerID,itemID,reviewer_indices,item_indices
0,U938994110,I529819131,36963.0,10536.0
1,U181459539,I863471064,7182.0,17192.0
2,U941668816,I684585522,37055.0,13523.0
3,U768449391,I782253949,30321.0,15526.0
4,U640450168,I232683472,25359.0,4545.0


In [22]:
# generate predictions
# 2 methods: 1) for users who do not exist in test set, set all their predictions to 0 OR 2) set their prediction to 1 if item is among most popular
# method 1 yielded better results, so method 2 is commented out below

test_pred_boolean = []  # prediction values (0 or 1)
test_pred = []          # interaction score (dot product between user vec and item vec)
test_user_item_indices = list(zip(test_data['reviewer_indices'],test_data['item_indices']))

for user, item in test_user_item_indices:
    if int(user) == 123456:  # for users who did not exist in test set (101 users), set prediction to 1 if item is among most popular
        if int(item) in pop_items:
            test_pred_boolean.append(1)
            test_pred.append(0)
        else:
            test_pred_boolean.append(0)
            test_pred.append(0)
    elif int(item) == 123456:   # if item did not exist in the test set, set prediction to 0 (there's only 3 such cases)
        test_pred_boolean.append(0)
        test_pred.append(0)
    else:                      # if user or item did exist in test set...
        user_vec = predictions[0][int(user),:]
        pred = user_vec.dot(item_vectors).toarray().reshape(-1)
        test_pred.append(pred[int(item)])  # generate interaction score
        
        top_indices = numpy.argsort(pred)[::-1][:top_x_items]   # get top x% items with highest interaction scores
        pred2 = numpy.zeros(pred.shape)
        for i in top_indices:
            pred2[i] = 1
        test_pred_boolean.append(pred2[int(item)])  # generate 1's and 0's

In [23]:
len(test_pred)

28000

In [24]:
len(test_pred_boolean)

28000

In [25]:
# insert predictions and interaction scores into dataframe
test_data['prediction'] = numpy.array(test_pred_boolean,dtype=int)
test_data['interaction_score'] = numpy.array(test_pred)

test_data.head(10)

Unnamed: 0,reviewerID,itemID,reviewer_indices,item_indices,prediction,interaction_score
0,U938994110,I529819131,36963.0,10536.0,1,0.006823
1,U181459539,I863471064,7182.0,17192.0,1,0.016707
2,U941668816,I684585522,37055.0,13523.0,0,0.000161
3,U768449391,I782253949,30321.0,15526.0,0,0.01355
4,U640450168,I232683472,25359.0,4545.0,1,0.085687
5,U087574132,I014281144,3461.0,266.0,0,0.000731
6,U885457860,I600866492,34834.0,11915.0,1,0.002472
7,U319023404,I050733439,12557.0,975.0,1,0.031027
8,U535965656,I929867818,21135.0,18522.0,0,0.002509
9,U883645154,I158444048,34761.0,3106.0,0,0.010048


In [26]:
# if interaction score is >= avg interaction score, set prediction to 1
avg_pred = sum(test_pred) / len(test_pred)
mask = test_data['interaction_score'] >= avg_pred
test_data['prediction'][mask] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [27]:
# prepare dataframe to match format for kaggle submission
test_data['reviewerID-itemID'] = test_data['reviewerID'] + "-" + test_data['itemID']
predictions_upload = test_data[['reviewerID-itemID', 'prediction']]
predictions_upload.head(5)

Unnamed: 0,reviewerID-itemID,prediction
0,U938994110-I529819131,1
1,U181459539-I863471064,1
2,U941668816-I684585522,0
3,U768449391-I782253949,0
4,U640450168-I232683472,1


In [28]:
predictions_upload.groupby('prediction').count()

Unnamed: 0_level_0,reviewerID-itemID
prediction,Unnamed: 1_level_1
0,17467
1,10533


In [29]:
# export prediction file for submission

path = "/Users/dipali/Desktop/CMU MSBA/machine_learning_2/final_project/"

predictions_upload.to_csv(path+"predictions_test.txt", index=False, header=True, line_terminator="\n", sep=",")