In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

# Importing and Cleaning

In [2]:
jsonFile = 'Digital_Music_5.json'
df = pd.read_json(jsonFile, lines=True)
# df.to_csv('DigitalMusicReviews.csv', sep=',', index=False)

In [3]:
print("Dataframe shape : ", df.shape)
print("Nan values per column",df.isna().sum())

Dataframe shape :  (169781, 12)
Nan values per column overall                0
vote              162170
verified               0
reviewTime             0
reviewerID             0
asin                   0
style              11792
reviewerName           5
reviewText           158
summary               36
unixReviewTime         0
image             169599
dtype: int64


In [4]:
# Dropping columns - style image and vote
df.drop(['style','image','vote'],axis = 1, inplace=True)
df.isna().sum()

overall             0
verified            0
reviewTime          0
reviewerID          0
asin                0
reviewerName        5
reviewText        158
summary            36
unixReviewTime      0
dtype: int64

In [5]:
# Dropping rows wil null values
df.dropna(axis=0,inplace=True)
print("Dataframe shape after dropping nans : ", df.shape)
print("Nan values per column after dropping nans : ",df.isna().sum())

Dataframe shape after dropping nans :  (169606, 9)
Nan values per column after dropping nans :  overall           0
verified          0
reviewTime        0
reviewerID        0
asin              0
reviewerName      0
reviewText        0
summary           0
unixReviewTime    0
dtype: int64


In [6]:
df.rename(columns={"overall":"rating", "asin":"productID" }, inplace=True)

In [7]:
df.head()

Unnamed: 0,rating,verified,reviewTime,reviewerID,productID,reviewerName,reviewText,summary,unixReviewTime
0,5,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600
1,5,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,Ad,bien,Five Stars,1412985600
2,5,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800
3,4,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400
4,5,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600


In [8]:
truncated_df = df[['reviewerID','productID','rating','unixReviewTime']]
truncated_df.head()

Unnamed: 0,reviewerID,productID,rating,unixReviewTime
0,A2TYZ821XXK2YZ,3426958910,5,1370217600
1,A3OFSREZADFUDY,3426958910,5,1412985600
2,A2VAMODP8M77NG,3426958910,5,1392076800
3,AAKSLZ9IDTEH0,3426958910,4,1386374400
4,A3OH43OZJLKI09,5557706259,5,1465689600


In [9]:
truncated_df.dtypes

reviewerID        object
productID         object
rating             int64
unixReviewTime     int64
dtype: object

In [10]:
n_users = truncated_df['reviewerID'].unique().shape[0]
n_items = truncated_df['productID'].unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')


16560 users
11797 items


In [11]:
codes, uniques = pd.factorize(truncated_df['productID'])
truncated_df['productCode'] = codes

codes, uniques = pd.factorize(truncated_df['reviewerID'])
truncated_df['reviewerCode'] = codes

truncated_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  truncated_df['productCode'] = codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  truncated_df['reviewerCode'] = codes


Unnamed: 0,reviewerID,productID,rating,unixReviewTime,productCode,reviewerCode
0,A2TYZ821XXK2YZ,3426958910,5,1370217600,0,0
1,A3OFSREZADFUDY,3426958910,5,1412985600,0,1
2,A2VAMODP8M77NG,3426958910,5,1392076800,0,2
3,AAKSLZ9IDTEH0,3426958910,4,1386374400,0,3
4,A3OH43OZJLKI09,5557706259,5,1465689600,1,4


In [12]:
ratings = np.zeros((n_users, n_items))
for row in truncated_df.itertuples():
    ratings[row[6], row[5]] = row[3]
ratings


array([[5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))


Sparsity: 0.07%


In [14]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=10)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test


In [15]:
train, test = train_test_split(ratings)


In [16]:
def get_similarity(ratings, sType='user', epsilon=1e-9):
    if sType == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif sType == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [17]:
user_similarity = get_similarity(train, sType='user')
item_similarity = get_similarity(train, sType='item')
print(item_similarity[:4, :4])


[[1.00000000e+00 2.30940108e-11 2.46182982e-11 1.05851225e-11]
 [2.30940108e-11 1.00000000e+00 1.42133811e-11 6.11132331e-12]
 [2.46182982e-11 1.42133811e-11 1.00000000e+00 6.51469254e-12]
 [1.05851225e-11 6.11132331e-12 6.51469254e-12 1.00000000e+00]]


In [18]:
def predict(ratings, similarity, sType='user'):
    if sType == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif sType == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual, squared=False)

def get_mae(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_absolute_error(pred, actual)


In [20]:
item_prediction = predict(train, item_similarity, sType='item')
user_prediction = predict(train, user_similarity, sType='user')



In [21]:
print('User-based CF RMSE: ' + str(get_rmse(user_prediction, test)))
print('Item-based CF RMSE: ' + str(get_rmse(item_prediction, test)))

User-based CF RMSE: 4.727863651274411
Item-based CF RMSE: 4.7456039232984235


In [22]:
print('User-based CF MAE: ' + str(get_mae(user_prediction, test)))
print('Item-based CF MAE: ' + str(get_mae(item_prediction, test)))

User-based CF MAE: 4.673628039552508
Item-based CF MAE: 4.692815256461635


**Recommendation Metrics**

In [30]:
def get_top_n(prediction, actual, k=10):
    top_n = {}
    orig = {}
    for user_num in range(len(prediction)):
        
        userID = truncated_df.loc[truncated_df['reviewerCode'] == user_num, 'reviewerID'].iloc[0]
        
        # CONSTRUCTION OF TOP_N
        
        user = prediction[user_num]
        
        # We only want the predictions of those items which have not been rated in training data
        # i.e, they have a zero entry in train
        l = []
        for i in range(len(user)):
            if (train[user_num][i] == 0):
                # this item/prediction can be a potential recommendation
                l.append((i, user[i]))
        
        # Description of l: [(item index, predicted rating), .......] for those items not rated in train data
        
        # sort l in descending order of prediction, and only consider first k elements for recommendation
        l.sort(key=lambda x: x[1], reverse=True)
        l = l[:k]
        
        # replace the item index with the item id
        l2 = [(truncated_df.loc[truncated_df['productCode'] == x, 'productID'].iloc[0], y) for (x, y) in l]
        
        top_n[userID] = l2
        
        # CONSTRUCTION OF ORIG
        
        test_ratings = actual[user_num]
        
        m = []
        
        # We only want to store those items/ratings that have been rated in test data
        # i.e, whose entry in actual array is 0
        for i in range(len(test_ratings)):
            if(test_ratings[i] != 0):
                m.append((i,test_ratings[i]))
        
        # Description of m: [(item index, true test rating), .......] for non-zero ratings in test data
        
        # replace the item index with the item id
        m2 = [(truncated_df.loc[truncated_df['productCode'] == x, 'productID'].iloc[0], y) for (x, y) in m]
        
        orig[userID] = m2
        
    return top_n, orig

def get_pr(top_n, orig, k=10):
    
    # precision of each user = (# of non-zero rated items in test data that appear in top_n)/k
    # recall of each user = (# of non-zero rated items in test data that appear in top_n)/ (# of non-zero test ratings)
    num_users = len(top_n.keys())
    precisions = {}
    recalls = {}
    for uid in top_n.keys():
        rec = top_n[uid]
        rated = orig[uid]
        items_rated = [x for (x, y) in rated]
        num_rated = len(rated)
        
        num_rated_and_recommended = 0
        
        for iid, pred in rec:
            if(iid in items_rated):
                num_rated_and_recommended += 1
        
        precisions[uid] = num_rated_and_recommended/k
        recalls[uid] = num_rated_and_recommended/num_rated
        
    precision = sum(precisions.values())/num_users
    recall = sum(recalls.values())/num_users
    
    return precision, recall

def dcg_at_k(scores):
    return scores[0] + sum(sc/math.log(ind, 2) for sc, ind in zip(scores[1:], range(2, len(scores) + 1)))

def ndcg_at_k(scores):
    idcg = dcg_at_k(sorted(scores, reverse=True))
    return (dcg_at_k(scores)/idcg) if idcg > 0.0 else 0.0

def get_ndcg(top_n, orig):
    ndcg_scores = dict()
    for uid, user_ratings in top_n.items():
        scores = []
        for iid, est_r in user_ratings:
            iid_found = False
            org_user_ratings = orig[uid]
            for i, r in org_user_ratings:
                if(iid == i):
                    scores.append(r)
                    iid_found = True
                    break
            if not iid_found:
                scores.append(0)
        ndcg_scores[uid] = ndcg_at_k(scores)
    
    ndcg_score = sum(ndcg for ndcg in ndcg_scores.values())/len(ndcg_scores)
    return ndcg_score


def get_f_measure(precision, recall):
    try:
        print("F1 Score: ", str((2*precision*recall)/(precision + recall)))
    except:
        print("values of precision and recall are too low to calculate f_measure")
            
#Prints:
# 1. precision (avg of precision for all users)
# 2. recall (avg of recall for all users)
# 3. F measure
# 4. final ndgc score
# returns: complete top_n recommendation dict {uid: [(item id, predicted rating), ...]}
def get_all_metrics(prediction, actual, k=10):
    top_n, org = get_top_n(prediction, actual, k)
    precision, recall = get_pr(top_n, org, k)
    print("Precision : ", precision)
    print("Recall : ", recall)
    get_f_measure(precision, recall)
    ndcg = get_ndcg(top_n, org)
    print("NGCG : ", ndcg)
    return top_n

In [31]:
# User-User metrics

topN1 = get_all_metrics(user_prediction, test, 10)

Precision :  0.02522342995169135
Recall :  0.04400369507706472
F1 Score:  0.032066162502946034
NGCG :  0.13058131170116788


In [32]:
# Item-Item metrics
topN2 = get_all_metrics(item_prediction, test, 10)

Precision :  0.008266908212560289
Recall :  0.01470398646576176
F1 Score:  0.01058352390478685
NGCG :  0.03583180922751944


In [36]:
list(topN1.keys())[0]

'A2TYZ821XXK2YZ'

In [37]:
list(topN1.values())[0]

[('B00YIBA8J0', 0.45809318681323985),
 ('B00UMI98C6', 0.44795339701095693),
 ('B000W11K5G', 0.39066369337252066),
 ('B001ESSLIU', 0.3624773421866795),
 ('B00P0V7GTM', 0.3014035830398362),
 ('B01F4EPB0K', 0.30140358302414677),
 ('B00P290EFU', 0.28015945388243246),
 ('B003FRN28C', 0.27891623455741277),
 ('B000W1T2TM', 0.27202328012649074),
 ('B00124F2A6', 0.27202328004669923)]

In [38]:
list(topN2.keys())[0]

'A2TYZ821XXK2YZ'

In [39]:
list(topN2.values())[0]

[('B0014JD6KW', 0.6011294970784609),
 ('B01FT5U3OI', 0.45574800026305107),
 ('B00956UPSW', 0.4037119774039456),
 ('B012KYGZ7K', 0.371591760571503),
 ('B01G662874', 0.3686566911670817),
 ('B005D5MA3W', 0.35046555863856826),
 ('B001ESSLIU', 0.31419840597010934),
 ('B004QQWO2Q', 0.3050056218403255),
 ('B01F4EPB0K', 0.2954821769256404),
 ('B00YIBA8J0', 0.2700155379369341)]

In [40]:
df.loc[df['productID'] == 'B0014JD6KW']

Unnamed: 0,rating,verified,reviewTime,reviewerID,productID,reviewerName,reviewText,summary,unixReviewTime
140904,5,False,"08 26, 2014",A1NF9WS7RR82MX,B0014JD6KW,J. Hill,"After appearing on Bloodbath's debut EP, ""Bree...",A Bloody Good EP,1409011200


In [41]:
df.loc[df['productID'] == 'B00UMI98C6']

Unnamed: 0,rating,verified,reviewTime,reviewerID,productID,reviewerName,reviewText,summary,unixReviewTime
110391,5,True,"10 6, 2015",A2HJ421P5A1P6N,B00UMI98C6,marvin feagans,one of the best bands that does not disappoint,Five Stars,1444089600
110392,5,True,"10 1, 2015",A2TZX7JGYO2BED,B00UMI98C6,mike norton,Great,Five Stars,1443657600
110393,5,True,"09 17, 2015",A1SF7FZE2M9KM9,B00UMI98C6,Crimson,The download works great!,The download works great!,1442448000
110394,5,True,"09 14, 2015",A1NQOJEXEAX0PS,B00UMI98C6,woody,Waited a long time for this band's new music. ...,True to style.,1442188800
110395,5,True,"08 28, 2015",A7F2WKMTGENTX,B00UMI98C6,Geff,Good.,Five Stars,1440720000
110396,5,True,"08 23, 2015",A32CPS8Z53G1RM,B00UMI98C6,Christina C.,Awesome first cut from this long awaited album.,Worth the wait!,1440288000
110397,5,True,"08 8, 2015",ACJO6RRTJJY2H,B00UMI98C6,Mrs. Rogers,Breaking Benjamin has always been a favorite o...,Five Stars,1438992000
110398,5,True,"06 21, 2015",A2TMZI8FD935WW,B00UMI98C6,rockdan101,love it,Five Stars,1434844800
110399,5,True,"06 18, 2015",A3EHHUDXBDKLFX,B00UMI98C6,Twitchy,ty,ty,1434585600
110400,5,True,"05 31, 2015",A1MNYP1E1TR6L0,B00UMI98C6,KC,great song!,best band ever,1433030400


In [42]:
df.loc[df['reviewerID'] == 'A2TYZ821XXK2YZ']

Unnamed: 0,rating,verified,reviewTime,reviewerID,productID,reviewerName,reviewText,summary,unixReviewTime
0,5,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600
102914,4,True,"07 31, 2015",A2TYZ821XXK2YZ,B00MYPUDMY,Garrett,Still better than ever. One of my all time fav...,Maggot4Life,1438300800
112370,5,True,"09 17, 2017",A2TYZ821XXK2YZ,B00YI4B79W,Garrett,This band had to grow on me and I'm glad I hav...,Nameless Ghoul,1505606400
112416,5,True,"02 16, 2016",A2TYZ821XXK2YZ,B00YIB0R9Q,Garrett,"Even though I enjoyed ""Atlas"" I think ""Ire"" ve...",BRUTAL,1455580800
113656,5,True,"02 16, 2016",A2TYZ821XXK2YZ,B012BPYS7M,Garrett,No Surprise. Sevendust made yet again another ...,Support 7D,1455580800
115369,5,True,"01 23, 2016",A2TYZ821XXK2YZ,B015U2LLIW,Garrett,Let's just say I'm a huge Chris Adler fan so I...,Still thrashing,1453507200
117256,5,True,"03 25, 2016",A2TYZ821XXK2YZ,B01AUA1MC4,Garrett,I am not even halfway through this album yet b...,SET SAIL ON METAL!,1458864000
118878,5,True,"06 3, 2016",A2TYZ821XXK2YZ,B01DQ6ON70,Garrett,I've listened to this album probably 5 times s...,Volbeat has done it again,1464912000
118880,5,True,"05 1, 2016",A2TYZ821XXK2YZ,B01DQ6ONVG,Garrett,Volbeat always outdo themselves!!,Perfextion,1462060800
119109,5,True,"03 27, 2017",A2TYZ821XXK2YZ,B01EDA9EPE,Garrett,This album is great. If I had to pick a favori...,Perfection,1490572800
