# SVD implementation



In [1]:
%store -r df

no stored variable or alias df


In [24]:
import surprise
from surprise import SVD, Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, GridSearchCV, KFold
from collections import defaultdict

import pandas as pd
import numpy as np
import random

In [3]:
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

### SIMPLE MODEL

In [4]:
df = pd.DataFrame()
df[["userID", "itemID", "rating"]] = pd.read_json("../data/AMAZON_FASHION_5.71.268.json")[["userID", "itemID", "rating"]]
df.drop_duplicates(subset=["userID", "itemID"], inplace=True)
df.head()

Unnamed: 0,userID,itemID,rating
10,A3HX4X3TIABWOV,B000KPIHQ4,2
17,A3HX4X3TIABWOV,B000V0IBDM,2
24,A3QY3THQ42WSCQ,B000YFSR5G,1
25,AGZ5OOZVDO194,B000YFSR5G,5
26,A3GJ3DJU1RXOHN,B000YFSR4W,4


In [5]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df, reader)
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

In [6]:
min_r = df['rating'].min()
max_r = df['rating'].max()
print('Ratings min is {0}, max {1}'.format(min_r,max_r))

Ratings min is 1, max 5


In [7]:
#loading the data to surprise
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df, reader)

In [8]:
svd = SVD()
trainset = data.build_full_trainset()
data_fit = svd.fit(trainset)

In [9]:
pred = svd.predict(uid='A3HX4X3TIABWOV', iid = 'B000KPIHQ4')
score = pred.est
score

3.538328310695088

In [10]:
itemIDs = df['itemID'].unique()
itemIDs_A3HX4X3TIABWOV = df.loc[df['userID'] == 'A3HX4X3TIABWOV', 'itemID']
itemIDs_to_pred = np.setdiff1d(itemIDs, itemIDs_A3HX4X3TIABWOV)
itemIDs_to_pred

array(['B000YFSR4W', 'B000YFSR5G', 'B0014F7B98', 'B0014F8TIU',
       'B0017LD0BM', 'B0017LGD34', 'B001IKJOLW', 'B001LNSY2Q',
       'B003M6060S', 'B0058YEJ5K', 'B005AGO4LU', 'B0092UF54A',
       'B009MA34NY', 'B00GKF5BAS', 'B00I0VHS10', 'B00LKWYX2I',
       'B00MLYE8PQ', 'B00ND9047Y', 'B00ZUA6AJK', 'B010RRWKT4',
       'B014IBJKNO', 'B01595OS62', 'B016XAJLVO', 'B01H7KY678'],
      dtype=object)

In [11]:
testset = [['A3HX4X3TIABWOV', iid , 5] for iid in itemIDs_to_pred]
predictions = svd.test(testset)

In [12]:
pred_ratings = np.array([pred.est for pred in predictions])
i_max = pred_ratings.argmax()
iid = itemIDs_to_pred[i_max]
print('Top item for user A3HX4X3TIABWOV has iid {0} with predicted rating {1}'.format(iid,pred_ratings[i_max]))

Top item for user A3HX4X3TIABWOV has iid B01595OS62 with predicted rating 4.12726001983114


### ---------------------------------------------------------------------------
### NEW MODEL

In [15]:
df = pd.DataFrame()
df[["userID", "itemID", "rating"]] = pd.read_json("../data/AMAZON_FASHION_5.71.268.json")[["userID", "itemID", "rating"]]
df.drop_duplicates(subset=["userID", "itemID"], inplace=True)
df.head()

Unnamed: 0,userID,itemID,rating
10,A3HX4X3TIABWOV,B000KPIHQ4,2
17,A3HX4X3TIABWOV,B000V0IBDM,2
24,A3QY3THQ42WSCQ,B000YFSR5G,1
25,AGZ5OOZVDO194,B000YFSR5G,5
26,A3GJ3DJU1RXOHN,B000YFSR4W,4


In [16]:
reader = Reader(rating_scale=(3,5))
data = Dataset.load_from_df(df, reader)
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

In [17]:
#A = 80%, B=20%
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

In [18]:
data.raw_ratings = A_raw_ratings

In [19]:
param_grid = {'n_epochs': [20,30,40,50], 'lr_all': [0.005,0.05],
              'reg_all': [0.1, 0.4]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.4369139151841323
{'n_epochs': 40, 'lr_all': 0.05, 'reg_all': 0.1}


In [20]:
svd = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb7ce7c7f50>

In [21]:
predictions = svd.test(trainset.build_testset())
print('Biased accuracy on A,', end= '  ')
accuracy.rmse(predictions)

testset = data.construct_testset(B_raw_ratings)
predictions = svd.test(testset)
print('Unbiased accuracy on B,', end= ' ')
accuracy.rmse(predictions)

Biased accuracy on A,  RMSE: 0.4125
Unbiased accuracy on B, RMSE: 0.4431


0.4430694110314506

### Precision and recall

In [25]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

"""
kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):
    svd.fit(trainset)
    predictions = svd.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))
    """

'\nkf = KFold(n_splits=5)\n\nfor trainset, testset in kf.split(data):\n    svd.fit(trainset)\n    predictions = svd.test(testset)\n    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)\n\n    # Precision and recall can then be averaged over all users\n    print(sum(prec for prec in precisions.values()) / len(precisions))\n    print(sum(rec for rec in recalls.values()) / len(recalls))\n    '

In [26]:
precision_recall_at_k(predictions)

({'A1VXFXHDY4SRI5': 1.0,
  'A2JX7ESVMGXMJY': 0,
  'A2Q6OH5LUQXPFV': 1.0,
  'A12LDKOWDBPQTC': 1.0,
  'A2R0RL0TF6E9VC': 1.0,
  'A34LYR4FWZ77CA': 1.0,
  'AC5WLG32YTU5K': 1.0,
  'A2ZE58RF22658T': 1.0,
  'A2JD3RN8JCO62Z': 0,
  'A6CXK8NXD50R2': 0,
  'A2GOOFFNUI20XI': 1.0,
  'A2PYQN4T4LV0NP': 1.0,
  'AB29N6EMOFEVW': 1.0,
  'AIP4C9MG13COB': 1.0,
  'ADD8OMIBXZ89G': 0,
  'A1GTC5EVSJNCQ8': 1.0,
  'A2PWOO618R3IKJ': 1.0,
  'A3A2DRPMDCRFBC': 1.0,
  'AD7ZF3D5JH9HL': 1.0,
  'AOFQAZVA6Q6E7': 1.0,
  'A2KGKIJ528667I': 0,
  'A37K0VP2H8FKOU': 1.0,
  'A3PZ1TJ71WPLXV': 1.0,
  'AZYHPRWLMSY9O': 1.0,
  'A2QPBIA0HKEL96': 1.0,
  'AG8MPE8CQAPM7': 1.0,
  'A1PFKS00R93MRZ': 1.0,
  'A2O3GSYI4B5Z9H': 1.0,
  'A2P87PVYP2KLT6': 1.0,
  'A1OWUKMLTHKYG': 1.0,
  'A3D71DQPAAI6O4': 1.0,
  'A2AXI9WHORKU6X': 1.0,
  'ASOSP4VAUDB7I': 1.0,
  'A3BN0MRGRDKM0J': 1.0,
  'A36UF843501J4Y': 1.0,
  'A1P2JWDZ4SFW78': 1.0,
  'ABRX0RQNRXX85': 1.0,
  'AV2MXVWMTERBK': 1.0,
  'A2EGVPBTNJI3HB': 1.0,
  'A3CXL2P49AQQRH': 1.0,
  'A13JGYKUU10QKH': 1.0