## From demolib.py

In [1]:
import os
import math
import numpy as np
import pandas as pd
import time
from statistics import mean
from math import sqrt
from random import sample 
from tqdm.notebook import tqdm

# convert the transaction data (long data) into a ratings matrix (wide data)
# assume the first two columns are user and item names (which may be strings or integers and may not be contiguous)
# and assume the third column is the rating
# also generate two lookup tables to map user and item names into integer indexes, these are useful for accessing the ratings matrix
def makeratingsmatrix(trans):
    trans.columns = ['user','item','rating'] 
    # create the mappings between user and item names (as in raw data) and the matrix row and column indexes
    unames  = np.sort(trans['user'].unique())
    inames  = np.sort(trans['item'].unique())
    umap = dict(zip(unames,[i for i in range(len(unames))]))
    imap = dict(zip(inames,[i for i in range(len(inames))]))
    # create the ratings matrix, use aggfunc = mean in case multiple ratings exist for same (user,item)
    users = pd.pivot_table(trans, index=['user'], columns=['item'], values=['rating'],aggfunc=[mean]).values
    return [users, umap, imap]

# show percentage of cells in a rating matrix that are empty
def sparsity(arr):
    return float(np.isnan(arr).sum()*100)/np.prod(arr.shape)
    #return (1.0 - ( count_nonzero(arr) / float(arr.size) )) # alternative, gives same result

def wtavg(vals, weights):
    xy = vals * weights
    weights = weights[np.isnan(xy) == False] 
    #if len(weights) == 0 : return np.nan
    if sum(weights) == 0 : return np.nan
    vals = vals[np.isnan(xy)==False]
    return sum(vals * weights)/sum(weights)
        
def pearsonsim(x,y):
    xy = x*y
    x = x[np.isnan(xy)==False]
    y = y[np.isnan(xy)==False]
    if(len(x)==0): return np.nan
    mx=mean(x)
    my=mean(y)
    rt = sqrt(sum((x-mx)**2)*sum((y-my)**2))
    if (rt == 0): return np.nan  #math.isnan(rt)==True or 
    return sum((x-mx)*(y-my))/rt
               
def cosinesim(x,y):
    xy = x*y
    x = x[np.isnan(xy)==False]
    y = y[np.isnan(xy)==False]
    if(len(x)==0): return np.nan
    rt = sqrt(sum(x**2)*sum(y**2))
    if (rt == 0): return np.nan 
    return sum(x*y)/rt

def euclidsim(x,y):
    xy = x*y
    x = x[np.isnan(xy)==False]
    y = y[np.isnan(xy)==False]
    z=(y-x)**2
    sz=sqrt(sum(z))
    return 1/(1+sz)

def euclidsimF(x,y):
    xy = x*y
    x = x[np.isnan(xy)==False]
    y = y[np.isnan(xy)==False]
    z=(y-x)**2
    return 1/(1+sum(z))

def getitemsimsmatrix(ratsmatrix,simfun):
    r,c = ratsmatrix.shape
    matrx = list([])
    for col1 in tqdm(range(0,c)):
        simrow = [0]*col1
        for col2 in range(col1,c):
            simrow.append(simfun(ratsmatrix[:,col1],ratsmatrix[:,col2]))
        matrx.append(simrow)
    matrx = np.array(matrx)
    matrx = matrx + matrx.T - np.diag(np.diag(matrx))
    return matrx

# predict ratings for a given target user on a specified item 
def predictrating_UU(targetrats, ratsmatrix, targetitemindx, simfun):
    return predictratings_UU(targetrats, ratsmatrix, doitems=[targetitemindx], simfun=simfun)[0]

def predictrating_II(targetrats, itemsims, targetitemid):
    return predictratings_II(targetrats, itemsims, doitems=[targetitemid])[0]

# predict ratings for a given target user on a set of items (specified in doitems) 
def predictratings_UU(targetrats, ratsmatrix, doitems, simfun=pearsonsim):
    sims = list([])
    for row in ratsmatrix: sims.append(simfun(row,targetrats))
    sims = np.array(sims)
    with np.errstate(invalid='ignore'): sims[sims < 0] = np.nan
    rats = list([])
    for col in doitems: rats.append(wtavg(ratsmatrix[:,col],sims)) # assumes target rating is NA (if target in usersA)
    return np.array(rats)

def predictratings_II(targetrats,itemsims,doitems):
    seenitems = np.isnan(targetrats)==False
    rats = list([])
    for row in doitems:
        rats.append(wtavg(targetrats[seenitems],itemsims[row,seenitems])) 
    return np.array(rats)

# get recommendations for a given target user
def getRecommendations_UU(targetrats, ratsmatrix, imap, simfun=pearsonsim,topN=5):
    itemnames=list(imap.keys())
    unseenitemids = np.where(np.isnan(targetrats)==True)[0]
    ratsA = predictratings_UU(targetrats, ratsmatrix, doitems=unseenitemids, simfun=simfun)
    rats = pd.DataFrame(ratsA,index=[itemnames[i] for i in unseenitemids],columns=['predrating'])
    rats = rats.sort_values(ascending = False, by=['predrating'])
    return rats[0:min(topN,len(rats))]
    
def getRecommendations_II(targetrats, itemsims, imap, topN=5):
    itemnames=list(imap.keys()) 
    unseenitemids = np.where(np.isnan(targetrats)==True)[0]
    ratsA = predictratings_II(targetrats,itemsims,doitems=unseenitemids)
    rats = pd.DataFrame(ratsA,index=[itemnames[i] for i in unseenitemids],columns=['predrating'])
    rats = rats.sort_values(ascending = False, by=['predrating'])
    return rats[0:min(topN,len(rats))]

# compute prediction errors (predicted rating - actual rating) for the test events (events ~ 'user,item,rating')
def computeErrs_UU(testevents, ratsmatrix, uids, iids, simfun=cosinesim):
    res = list([])
    for testevent in tqdm(testevents):
        # print('.', end = '')
        testuserindx = uids[testevent[0]]
        testitemindx = iids[testevent[1]]
        pred = predictrating_UU(ratsmatrix[testuserindx,],ratsmatrix,testitemindx,simfun=simfun)
        res.append(pred-testevent[2])
    return np.array(res)

def computeErrs_II(testevents, ratsmatrix, uids, iids, itemsims):
    res = list([])
    for testevent in tqdm(testevents):
        #print('.', end = '')
        testuserindx = uids[testevent[0]]
        testitemindx = iids[testevent[1]]
        pred = predictrating_II(ratsmatrix[testuserindx,],itemsims,testitemindx)
        res.append(pred-testevent[2])
    return np.array(res)

# returns the percentage ranking for each test event
# if itemsims is supplied then do item-based CF, else do user-based CF
# note: testevents contain user and item names (as in datafile) not ratmatrix indexes
def computePercentageRanking(testevents, ratsmatrix, uids, iids, itemsims=False, simfun=cosinesim):
    res = list([])
    for testevent in tqdm(testevents):
        testuserindx = uids[testevent[0]]
        testitemname = testevent[1]
        if (type(itemsims) == bool):
            # print('.', end = '')
            recs = getRecommendations_UU(ratsmatrix[testuserindx,], ratsmatrix, iids, simfun=simfun, topN=1000000)
        else:
            recs = getRecommendations_II(ratsmatrix[testuserindx,], itemsims, iids, topN=1000000)
        # recs is a dataframe, the row names are the itemnames (as in the datafile)
        # .index() gets the row names, .get_loc returns a row number (starting at 0)
        rkpc = ((recs.index.get_loc(testitemname) + 1)*100)/len(recs) 
        res.append(rkpc)
    return np.array(res)

# compute hits and lift for the test events
def computeLiftOverRandom(testevents, ratsmatrix, uids, iids, alg="uu", itemsims=False, simfun=cosinesim, topN=10):
    tothits = randhits = totrecs = 0
    # each testevent is (username, itemname, rating)
    for testevent in tqdm(testevents):
        testuserindx = uids[testevent[0]]
        if (alg == "uu"):
            # print('.', end = '')
            recs = getRecommendations_UU(ratsmatrix[testuserindx,], ratsmatrix, iids, simfun=simfun, topN=topN)
        elif alg == "ii":
            recs = getRecommendations_II(ratsmatrix[testuserindx,], itemsims, iids, topN=topN)
        if testevent[1] in recs.index: tothits = tothits + 1
        # do random recommendations
        unseenitemids = list(np.where(np.isnan(ratsmatrix[testuserindx,])==True)[0])
        recs = sample(unseenitemids,min(topN,len(recs)))  # only generate same # recs as CF above
        if iids[testevent[1]] in recs: randhits =  randhits + 1
        totrecs = totrecs + len(recs)
    return tothits, randhits, totrecs

# pretty show head of matrix
def head(arr,r=10,c=10):
    nr, nc = arr.shape
    with np.printoptions(threshold=np.inf):
        if type(arr) == np.ndarray:
            print(arr[0:min(r,nr),0:min(c,nc)])
        else:
            print(arr.iloc[0:min(r,nr),0:min(c,nc)])

### Movielens100K

In [2]:
# load the data
trans = pd.read_csv(r'C:\Users\xBaka\Notebooks\School\Day 7\workshop2\simplemovies-transactions.csv')
trans

Unnamed: 0,user,movie,rating
0,Rose,LadyinWater,2.5
1,Rose,SnakesOnPlane,3.5
2,Rose,JustMyLuck,3.0
3,Rose,Superman,3.5
4,Rose,Dupree,2.5
5,Rose,NightListener,3.0
6,Seymour,LadyinWater,3.0
7,Seymour,SnakesOnPlane,3.5
8,Seymour,JustMyLuck,1.5
9,Seymour,Superman,5.0


In [3]:
# convert from ratings events into a ratings matrix
# Note: uids and iids map the user and item names to matrix indexes (starting at 0)
ratmatrix, uids, iids = makeratingsmatrix(trans) 
print('ratmatrix.shape:', ratmatrix.shape)
print('ratmatrix:\n', ratmatrix)
print('uids:', uids)

ratmatrix.shape: (8, 6)
ratmatrix:
 [[nan 2.  3.  1.  nan nan]
 [2.  2.  3.  3.  4.  3. ]
 [3.5 nan 3.  3.  4.  5. ]
 [nan nan 2.5 4.  3.  3.5]
 [2.5 3.  nan 4.5 3.5 4. ]
 [2.5 3.  2.5 3.  3.5 3.5]
 [3.5 1.5 3.  3.  3.5 5. ]
 [1.  nan nan nan 4.5 4. ]]
uids: {'Barry': 0, 'LaSalle': 1, 'Matthews': 2, 'Philips': 3, 'Puig': 4, 'Rose': 5, 'Seymour': 6, 'Toby': 7}


In [4]:
# make recommendations for a specific user
targetname = "Toby" 
targetrats = ratmatrix[uids[targetname],] # note: index into the ratings matrix is by index, e.g. user "10" ~ index 9
targetrats

array([1. , nan, nan, nan, 4.5, 4. ])

In [5]:
# When using pearson the recommendations for Toby should be: 3.35 (night), 2.83 (lady), 2.53 (luck)
getRecommendations_UU(targetrats, ratmatrix, iids, simfun=pearsonsim, topN = 10)

Unnamed: 0,predrating
NightListener,3.34779
LadyinWater,2.83255
JustMyLuck,2.530981


In [6]:
# compute item-item similarity matrix (to use the timer, select all 3 lines and then run) 
tic = time.perf_counter()
itemsims = getitemsimsmatrix(ratmatrix, simfun=euclidsimF) # use euclidsimF to agree with book/slide calcs
print(f"time {time.perf_counter() - tic:0.4f} seconds")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))


time 0.0291 seconds


In [7]:
head(itemsims) # view the similarity matrix

[[1.         0.18181818 0.4        0.14814815 0.05128205 0.05333333]
 [0.18181818 1.         0.18181818 0.13333333 0.10526316 0.06451613]
 [0.4        0.18181818 1.         0.13333333 0.22222222 0.09090909]
 [0.14814815 0.13333333 0.13333333 1.         0.18181818 0.1025641 ]
 [0.05128205 0.10526316 0.22222222 0.18181818 1.         0.16666667]
 [0.05333333 0.06451613 0.09090909 0.1025641  0.16666667 1.        ]]


In [8]:
# II recommendations for Toby with euclideanF shd be: 3.183 (night), 2.598 (luck), 2.473 (lady)
getRecommendations_II(targetrats, itemsims, iids, topN = 10)

Unnamed: 0,predrating
NightListener,3.182635
JustMyLuck,2.598332
LadyinWater,2.473088


In [9]:
# lets try pre-normalising the data
rowmeans = np.nanmean(ratmatrix,axis=1); rowmeans
normratmatrix = ratmatrix.copy()
for i in range(ratmatrix.shape[0]):  # iterate over rows
    normratmatrix[i] = normratmatrix[i] - rowmeans[i]
head(normratmatrix)

[[        nan  0.          1.         -1.                 nan         nan]
 [-0.83333333 -0.83333333  0.16666667  0.16666667  1.16666667  0.16666667]
 [-0.2                nan -0.7        -0.7         0.3         1.3       ]
 [        nan         nan -0.75        0.75       -0.25        0.25      ]
 [-1.         -0.5                nan  1.          0.          0.5       ]
 [-0.5         0.         -0.5         0.          0.5         0.5       ]
 [ 0.25       -1.75       -0.25       -0.25        0.25        1.75      ]
 [-2.16666667         nan         nan         nan  1.33333333  0.83333333]]


In [10]:
# redo the UU recommendations
targetrats = normratmatrix[uids[targetname],] 
recs = getRecommendations_UU(targetrats, normratmatrix, iids, simfun=pearsonsim, topN = 10); recs # the normalised rating predictions
recs + rowmeans[uids[targetname]] # the unnormalised rating predictions, should be similar but not idential to the unnormalised predictions

Unnamed: 0,predrating
NightListener,3.29336
LadyinWater,2.862307
JustMyLuck,2.576052


## Testing

In [11]:
# load a bigger data so that we can split into training and test sets
trans = pd.read_csv('simplemovies-transactions-moreusers.csv')
trans

Unnamed: 0,user,movie,rating
0,Rose,LadyinWater,2.5
1,Rose,SnakesOnPlane,3.5
2,Rose,JustMyLuck,3.0
3,Rose,Superman,3.5
4,Rose,Dupree,2.5
...,...,...,...
62,Tully,Superman,4.0
63,Tully,NightListener,2.0
64,Billy,SnakesOnPlane,4.5
65,Billy,JustMyLuck,3.0


In [12]:
ratmatrix, uids, iids = makeratingsmatrix(trans)
print('ratmatrix.shape:', ratmatrix.shape)
print('ratmatrix:\n', ratmatrix)
print('uids:', uids)

ratmatrix.shape: (16, 6)
ratmatrix:
 [[nan 2.  3.  1.  nan nan]
 [nan 3.  nan nan 4.5 3. ]
 [nan nan nan 1.5 4.  1. ]
 [nan 1.5 nan nan 1.  nan]
 [2.  2.  3.  3.  4.  3. ]
 [5.  nan 2.  nan 2.5 4.5]
 [3.5 nan 3.  3.  4.  5. ]
 [nan nan nan 4.5 4.5 4.5]
 [nan nan 2.5 4.  3.  3.5]
 [2.5 3.  nan 4.5 3.5 4. ]
 [2.5 3.  2.5 3.  3.5 3.5]
 [1.5 nan 3.  1.  3.5 nan]
 [3.5 1.5 3.  3.  3.5 5. ]
 [1.  nan nan nan 4.5 4. ]
 [nan 4.5 3.5 2.  3.5 4. ]
 [4.  2.  nan 5.  1.  3. ]]
uids: {'Barry': 0, 'Billy': 1, 'Garry': 2, 'Harry': 3, 'LaSalle': 4, 'Larry': 5, 'Matthews': 6, 'Molly': 7, 'Philips': 8, 'Puig': 9, 'Rose': 10, 'Sally': 11, 'Seymour': 12, 'Toby': 13, 'Tully': 14, 'Wally': 15}


In [13]:
# extract a testset from the rating events by random sampling
testsize = 10
testevents = trans.sample(n=testsize).values.tolist()
testevents

[['Seymour', 'SnakesOnPlane', 3.5],
 ['Barry', 'NightListener', 1.0],
 ['LaSalle', 'LadyinWater', 3.0],
 ['Seymour', 'LadyinWater', 3.0],
 ['Tully', 'LadyinWater', 3.5],
 ['Rose', 'Superman', 3.5],
 ['Seymour', 'NightListener', 3.0],
 ['LaSalle', 'JustMyLuck', 2.0],
 ['Sally', 'SnakesOnPlane', 3.5],
 ['Matthews', 'Dupree', 3.5]]

In [14]:
# blank out the testset ratings in the ratings matrix - this now becomes our training data
for (uname,iname,rating) in testevents: ratmatrix[uids[uname],iids[iname]] = np.nan 

# try using each of these in turn 
simfun = pearsonsim 
# simfun = cosinesim
# simfun = euclidsim

# execute the test function in demolib to make ratings predictions for the test events and obtain the prediction errors
# note: to desplay progress, this function prints a "." for every testevent processed
errs = computeErrs_UU(testevents, ratmatrix, uids, iids, simfun=simfun)
errs
np.nanmean(abs(errs))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




0.5327816835567722

In [15]:
# calc the item similarity matrix
# try using each of these in turn
simfun = euclidsim
# simfun = cosinesim

tic = time.perf_counter()
itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
print(f"time {time.perf_counter() - tic:0.4f} seconds")

errs = computeErrs_II(testevents, ratmatrix, uids, iids, itemsims)
np.nanmean(abs(errs))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))


time 0.0188 seconds


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




0.6069953298299642

In [16]:
# compute percentage rank for each test event 
# (% position of test event in the list of unseenitems ranked descending by their predicted rating, a low % position is good)
prs = computePercentageRanking(testevents, ratmatrix, uids, iids, simfun=cosinesim) #  user-based
np.nanmean(abs(prs))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




71.66666666666666

In [17]:
prs = computePercentageRanking(testevents, ratmatrix, uids, iids, itemsims=itemsims) # item-based (since item sim.matrix is supplied)
np.nanmean(abs(prs))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




60.833333333333336

In [18]:
# compute lift over random, where lift = (#hits using CF)/(#hits using random)
# Note: small values of topN may not yield any hits , hence below we try a range of values for topN
# Also note that for bigger values of topN the lift over random is expected to be smaller

# lift for user-based CF
for k in [5,10,50]:
    cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "uu", simfun=simfun, topN=k)
    print("\nlift=", cfhits/randhits if randhits > 0 else "-",
          "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)
    
# lift for item-based CF
for k in [5,10,50]:
    cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "ii", itemsims=itemsims, topN=k)
    print("\nlift=", cfhits/randhits if randhits > 0 else "-",
          "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



lift= 1.0 cfhits= 10 randhits= 10 totitemsrecommended= 25


## MovieLens

In [19]:
trans = pd.read_csv(r'C:\Users\xBaka\Notebooks\School\Day 7\workshop2\u_data.csv') # movielens 100K file (user and itemids start at 1)
trans.drop('datetime',axis=1,inplace=True)
ratmatrix, uids, iids = makeratingsmatrix(trans)
print('ratmatrix.shape:', ratmatrix.shape)
print('sparsity(ratmatrix):', sparsity(ratmatrix))
print('ratmatrix:\n', ratmatrix)

ratmatrix.shape: (943, 1682)
sparsity(ratmatrix): 93.69533063577546
ratmatrix:
 [[ 5.  3.  4. ... nan nan nan]
 [ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan  5. nan ... nan nan nan]]


In [20]:
# lets try pre-normalising the data
rowmeans = np.nanmean(ratmatrix,axis=1); rowmeans
normratmatrix = ratmatrix.copy()
for i in range(ratmatrix.shape[0]):  # iterate over rows
    normratmatrix[i] = normratmatrix[i] - rowmeans[i]
head(normratmatrix)

[[ 1.38970588 -0.61029412  0.38970588 -0.61029412 -0.61029412  1.38970588
   0.38970588 -2.61029412  1.38970588 -0.61029412]
 [ 0.29032258         nan         nan         nan         nan         nan
          nan         nan         nan -1.70967742]
 [        nan         nan         nan         nan         nan         nan
          nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan
          nan         nan         nan         nan]
 [ 1.12571429  0.12571429         nan         nan         nan         nan
          nan         nan         nan         nan]
 [ 0.36492891         nan         nan         nan         nan         nan
  -1.63507109  0.36492891  0.36492891         nan]
 [        nan         nan         nan  1.03473945         nan         nan
   1.03473945  1.03473945  1.03473945  0.03473945]
 [        nan         nan         nan         nan         nan         nan
  -0.79661017         nan         nan         nan]


In [21]:
# select any user at random to make recommendations to, e.g.:
targetname = 10 # a movielens user
targetrats = ratmatrix[uids[targetname],] 
uurecs = getRecommendations_UU(targetrats, ratmatrix, iids, simfun=pearsonsim, topN = 20)
uurecs.head()

Unnamed: 0,predrating
1467,5.0
1201,5.0
814,5.0
1358,5.0
1367,5.0


In [22]:
# to view the recommended movie names
titles = pd.read_csv(r'C:\Users\xBaka\Notebooks\School\Day 7\workshop2\u_item.csv') # movielens 100K file (user and itemids start at 1)
for i in uurecs.index: 
    print("rat=%2.2f, movie=%d, %s" % (uurecs['predrating'][i], i, titles['movie name'][i]))

rat=5.00, movie=1467, Cure, The (1995)
rat=5.00, movie=1201, Maybe, Maybe Not (Bewegte Mann, Der) (1994)
rat=5.00, movie=814, One Fine Day (1996)
rat=5.00, movie=1358, Boys in Venice (1996)
rat=5.00, movie=1367, Mina Tannenbaum (1994)
rat=5.00, movie=1599, Guantanamera (1994)
rat=5.00, movie=1500, Prisoner of the Mountains (Kavkazsky Plennik) (1996)
rat=5.00, movie=1189, That Old Feeling (1997)
rat=5.00, movie=1122, Last Time I Saw Paris, The (1954)
rat=5.00, movie=1656, Target (1995)
rat=5.00, movie=1536, Cosi (1996)
rat=5.00, movie=1293, Ayn Rand: A Sense of Life (1997)
rat=4.72, movie=1026, Shooter, The (1995)
rat=4.63, movie=1463, Stars Fell on Henrietta, The (1995)
rat=4.61, movie=1449, Golden Earrings (1947)
rat=4.58, movie=114, Haunted World of Edward D. Wood Jr., The (1995)
rat=4.56, movie=884, Phantoms (1998)
rat=4.53, movie=408, Jack (1996)
rat=4.52, movie=318, Everyone Says I Love You (1996)
rat=4.49, movie=1398, Stranger in the House (1997)


In [23]:
# to make recommendations using item-based CF
itemsims = getitemsimsmatrix(ratmatrix, simfun=euclidsim) # takes ~ 20-30secs
iirecs = getRecommendations_II(targetrats, itemsims, iids, topN = 20)
iirecs

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1682.0), HTML(value='')))




Unnamed: 0,predrating
1493,4.281941
1486,4.281941
1494,4.281941
1500,4.258175
1095,4.257486
397,4.253947
146,4.252602
1250,4.248899
1491,4.248078
1151,4.247126


In [24]:
#----------------------
# An interesting aside:
# how many of the topN recommendations from user-based CF are also in the topN from item-based CF? 
# To compute this we convert the recommended items into sets and compute the intersection, e.g.
uuset = set(uurecs.index)
iiset = set(iirecs.index)
uuset.intersection(iiset) # this shows the set of users (userindexes) who are in the topN list for both UU and II recommendations
# Note1: a bigger value for top N is more likely to result in a bigger intersection.
# Note2: an item does not have to be in both the UU and II sets to be a good recommendation - any of the items in the union
# of the UU and II recommendations are potentially good recommendations.
#---------------------

# Now proceed as before to create train/test sets and to compute MAE using user-based & item-based CF
# and to explore performance of the various similarity measures.
# Normally we would make the testset ~ 10%-30% of the total data, e.g. testsize = int(0.8*len(trans))
# but since demolib is slow we keep the #testevents small, e.g. 200
# If your PC is fast then you can make this larger for more accurate testing (or if your PC is very slow then make it smaller)
testsize = int(0.01*len(trans))
testevents = trans.sample(n=testsize).values.tolist()

# Note: for item-based CF, remember to compute the item similarity matrix ONLY AFTER the testevents have been blanked out (deleted) in the ratings matrix

In [25]:
# # blank out the testset ratings in the ratings matrix - this now becomes our training data
# for (uname,iname,rating) in testevents: 
#     ratmatrix[uids[uname],iids[iname]] = np.nan

# # execute the test function in demolib to make ratings predictions for the test events and obtain the prediction errors
# # note: to desplay progress, this function prints a "." for every testevent processed
# for simfun in [pearsonsim, cosinesim, euclidsim]:
#     errs = computeErrs_UU(testevents, ratmatrix, uids, iids, simfun=simfun)
#     print(np.nanmean(abs(errs)))

In [26]:
# # calc the item similarity matrix
# # try using each of these in turn
# simfun = euclidsim
# # simfun = cosinesim

# tic = time.perf_counter()
# itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
# print(f"time {time.perf_counter() - tic:0.4f} seconds")

# errs = computeErrs_II(testevents, ratmatrix, uids, iids, itemsims)
# np.nanmean(abs(errs))

In [27]:
# # compute percentage rank for each test event 
# # (% position of test event in the list of unseen items ranked descending by their predicted rating, a low % position is good)
# prs = computePercentageRanking(testevents, ratmatrix, uids, iids, simfun=cosinesim) #  user-based
# np.nanmean(abs(prs))

In [28]:
# prs = computePercentageRanking(testevents, ratmatrix, uids, iids, itemsims=itemsims) # item-based (since item sim.matrix is supplied)
# np.nanmean(abs(prs))

In [29]:
# # compute lift over random, where lift = (#hits using CF)/(#hits using random)
# # Note: small values of topN may not yield any hits , hence below we try a range of values for topN
# # Also note that for bigger values of topN the lift over random is expected to be smaller

# # lift for user-based CF
# for k in [5,10,50]:
#     cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "uu", simfun=simfun, topN=k)
#     print("\nlift=", cfhits/randhits if randhits > 0 else "-",
#           "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)
    
# # lift for item-based CF
# for k in [5,10,50]:
#     cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "ii", itemsims=itemsims, topN=k)
#     print("\nlift=", cfhits/randhits if randhits > 0 else "-",
#           "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)

## Jester

In [53]:
# dataset2
trans = pd.read_csv(r"C:\Users\xBaka\Notebooks\School\Day 7\workshop2\jester_ratings.dat", sep='\s+',header=0)
ratmatrix, uids, iids = makeratingsmatrix(trans)
print(ratmatrix.shape)
print(sparsity(ratmatrix)) # show % that is empty
print(head(ratmatrix))

# testsize = 100 # for UU keep small, but can increase if you have a fast PC
# testsize = 10000 # for II it can be much larger

# Proceed as above to create train/test sets and to compute MAE using user-based & item-based CF
# and to explore performance of the various similarity measures.
# Note: pearsonsim may be too slow to test for this dataset but cosinesim and euclidsim should be ok
# What MAE is acceptable given that the ratings range -10->10 is much larger than movielens 1->5 ?

#########################################################
# book crossings dataset (optional: if time permits)
#
# This dataset is too big to fit into memory in a uncompressed (non-sparse) matrix format
# Instead we proceed by data sampling: selecting only the most popular books and the most active users
#########################################################

(59132, 140)
78.72268822295881
[[   nan -9.281 -9.281 -6.781  0.875 -9.656 -9.031 -7.469 -8.719 -9.156]
 [-9.688  9.938  9.531  9.938  0.406  3.719  9.656 -2.688 -9.562 -9.125]
 [-9.844 -9.844 -7.219 -2.031 -9.938 -9.969 -9.875 -9.812 -9.781 -6.844]
 [-5.812 -4.5   -4.906    nan    nan    nan    nan    nan    nan    nan]
 [ 6.906  4.75  -5.906 -0.406 -4.031  3.875  6.219  5.656  6.094  5.406]
 [-0.031 -9.094 -0.406  7.5   -7.219 -9.438  0.125 -9.156  3.656 -9.438]
 [-2.906 -2.344 -0.5   -0.969  2.25   0.406  0.875  0.406  1.438 -0.031]
 [ 6.219 -7.438 -0.812 -3.438  0.531  0.531  2.344 -2.219 -4.531  2.281]
 [ 8.25   9.     8.875  9.75   9.375  9.219  9.031  8.844 -9.531 -5.938]
 [-5.75   0.281  0.781  8.281  3.594 -3.844  0.844 -6.312  3.375  6.594]]
None


In [54]:
#----------------------
# An interesting aside:
# how many of the topN recommendations from user-based CF are also in the topN from item-based CF? 
# To compute this we convert the recommended items into sets and compute the intersection, e.g.
uuset = set(uurecs.index)
iiset = set(iirecs.index)
uuset.intersection(iiset) # this shows the set of users (userindexes) who are in the topN list for both UU and II recommendations
# Note1: a bigger value for top N is more likely to result in a bigger intersection.
# Note2: an item does not have to be in both the UU and II sets to be a good recommendation - any of the items in the union
# of the UU and II recommendations are potentially good recommendations.
#---------------------

# Now proceed as before to create train/test sets and to compute MAE using user-based & item-based CF
# and to explore performance of the various similarity measures.
# Normally we would make the testset ~ 10%-30% of the total data, e.g. testsize = int(0.8*len(trans))
# but since demolib is slow we keep the #testevents small, e.g. 200
# If your PC is fast then you can make this larger for more accurate testing (or if your PC is very slow then make it smaller)
testsize = 10000
testevents = trans.sample(n=testsize).values.tolist()
# Note: for item-based CF, remember to compute the item similarity matrix ONLY AFTER the testevents have been blanked out (deleted) in the ratings matrix

In [55]:
# blank out the testset ratings in the ratings matrix - this now becomes our training data
for (uname,iname,rating) in testevents: 
    ratmatrix[uids[uname],iids[iname]] = np.nan

# execute the test function in demolib to make ratings predictions for the test events and obtain the prediction errors
# note: to desplay progress, this function prints a "." for every testevent processed
# for simfun in [pearsonsim, cosinesim, euclidsim]:
#     errs = computeErrs_UU(testevents, ratmatrix, uids, iids, simfun=simfun)
#     print(np.nanmean(abs(errs)))

In [56]:
# calc the item similarity matrix
simfun = cosinesim
# simfun = euclidsim

tic = time.perf_counter()
itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
print(f"time {time.perf_counter() - tic:0.4f} seconds")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


time 38.0568 seconds


In [47]:
errs = computeErrs_II(testevents, ratmatrix, uids, iids, itemsims)
print(np.nanmean(abs(errs)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


time 38.6244 seconds


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


time 15.7537 seconds


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [52]:
# compute percentage rank for each test event 
# (% position of test event in the list of unseenitems ranked descending by their predicted rating, a low % position is good)
for i, simfun in zip(["cosinesim", "euclidsim"], [cosinesim, euclidsim]):
    print(i)
    prs = computePercentageRanking(testevents, ratmatrix, uids, iids, simfun=simfun) #  user-based
    print("Percentage ranking (User-based):", np.nanmean(abs(prs)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


40.342025995514206


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


39.52860192009609


In [64]:
for i, simfun in zip(["cosinesim", "euclidsim"], [cosinesim, euclidsim]):
    print(i)
    itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
    prs = computePercentageRanking(testevents, ratmatrix, uids, iids, itemsims=itemsims) # item-based (since item sim.matrix is supplied)
    print("Percentage ranking (Item-based):", np.nanmean(abs(prs)))

cosinesim


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Percentage ranking (Item-based): 41.02973244642595
euclidsim


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Percentage ranking (Item-based): 43.28834646048366


In [63]:
# compute lift over random, where lift = (#hits using CF)/(#hits using random)
# Note: small values of topN may not yield any hits , hence below we try a range of values for topN
# Also note that for bigger values of topN the lift over random is expected to be smaller

for i, simfun in zip(["cosinesim", "euclidsim"], [cosinesim, euclidsim]):
    print("simfun:", i)
    itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
    
    # # lift for user-based CF
    # for k in [5,10,50]:
    #     cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "uu", simfun=simfun, topN=k)
    #     print("\nlift=", cfhits/randhits if randhits > 0 else "-",
    #         "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)
        
    # lift for item-based CF
    for k in [5,10,50]:
        cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "ii", itemsims=itemsims, topN=k)
        print("\nlift=", cfhits/randhits if randhits > 0 else "-",
            "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


cosinesim


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 2.131259484066768 cfhits= 2809 randhits= 1318 totitemsrecommended= 49976


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 1.7652240520873228 cfhits= 4609 randhits= 2611 totitemsrecommended= 99926


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 1.1181286549707603 cfhits= 7648 randhits= 6840 totitemsrecommended= 406757


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=140.0), HTML(value='')))


euclidsim


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 1.245311327831958 cfhits= 1660 randhits= 1333 totitemsrecommended= 49976


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 1.1171417415072238 cfhits= 2861 randhits= 2561 totitemsrecommended= 99926


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))



lift= 1.1809679334916865 cfhits= 7955 randhits= 6736 totitemsrecommended= 406757


### Book Crossing

In [87]:
trans = pd.read_csv(r"C:\Users\xBaka\Notebooks\School\Day 7\workshop2\BX-Book-Ratings.csv", sep=';', error_bad_lines=False, encoding="latin-1")
trans.columns = ['user','item','rating']
trans.shape

(1149780, 3)

In [88]:
# remove implicit ratings
trans = trans[trans.rating != 0]
trans.shape

(433671, 3)

In [89]:
trans

Unnamed: 0,user,item,rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [68]:
# this fails since expanded ratings matrix is too big to fit into memory
# ratmatrix, uids, iids = makeratingsmatrix(trans)

In [90]:
# reduce dataset size by sampling
min_item_ratings = 10 # book popularity threshold 
popular_items = trans['item'].value_counts() >= min_item_ratings
popular_items = popular_items[popular_items].index.tolist(); len(popular_items)  # get list of popular items

5645

In [91]:
min_user_ratings = 10 # user activity threshold
active_users = trans['user'].value_counts() >= min_user_ratings
active_users = active_users[active_users].index.tolist(); len(active_users) # get list of active users

7334

In [92]:
print('original data: ',trans.shape)
trans = trans[(trans['item'].isin(popular_items)) & (trans['user'].isin(active_users))] # apply the filter
print('data after filtering: ', trans.shape)

original data:  (433671, 3)
data after filtering:  (90557, 3)


In [93]:
# converting to a matrix now succeeds
ratmatrix, uids, iids = makeratingsmatrix(trans)
print(ratmatrix.shape)
print(sparsity(ratmatrix)) # show % that is empty
print(head(ratmatrix))

(6837, 5640)
-11.616882689828085
[[nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan]]
None
  return float(np.isnan(arr).sum()*100)/np.prod(arr.shape)


In [94]:
#----------------------
# An interesting aside:
# how many of the topN recommendations from user-based CF are also in the topN from item-based CF? 
# To compute this we convert the recommended items into sets and compute the intersection, e.g.
uuset = set(uurecs.index)
iiset = set(iirecs.index)
uuset.intersection(iiset) # this shows the set of users (userindexes) who are in the topN list for both UU and II recommendations
# Note1: a bigger value for top N is more likely to result in a bigger intersection.
# Note2: an item does not have to be in both the UU and II sets to be a good recommendation - any of the items in the union
# of the UU and II recommendations are potentially good recommendations.
#---------------------

# Now proceed as before to create train/test sets and to compute MAE using user-based & item-based CF
# and to explore performance of the various similarity measures.
# Normally we would make the testset ~ 10%-30% of the total data, e.g. testsize = int(0.8*len(trans))
# but since demolib is slow we keep the #testevents small, e.g. 200
# If your PC is fast then you can make this larger for more accurate testing (or if your PC is very slow then make it smaller)
testsize = 100
testevents = trans.sample(n=testsize).values.tolist()

# Note: for item-based CF, remember to compute the item similarity matrix ONLY AFTER the testevents have been blanked out (deleted) in the ratings matrix

In [95]:
# blank out the testset ratings in the ratings matrix - this now becomes our training data
for (uname,iname,rating) in testevents: 
    ratmatrix[uids[uname],iids[iname]] = np.nan

# execute the test function in demolib to make ratings predictions for the test events and obtain the prediction errors
# note: to desplay progress, this function prints a "." for every testevent processed
for simfun in [pearsonsim, cosinesim, euclidsim]:
    errs = computeErrs_UU(testevents, ratmatrix, uids, iids, simfun=simfun)
    print(np.nanmean(abs(errs)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


1.7941408343025025


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


1.548881788206876


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


1.3727899477807353


In [97]:
# calc the item similarity matrix
# try using each of these in turn
# simfun = euclidsim
# simfun = cosinesim

# tic = time.perf_counter()
# itemsims = getitemsimsmatrix(ratmatrix, simfun = simfun)
# print(f"time {time.perf_counter() - tic:0.4f} seconds")

# errs = computeErrs_II(testevents, ratmatrix, uids, iids, itemsims)
# np.nanmean(abs(errs))

In [None]:
# compute percentage rank for each test event 
# (% position of test event in the list of unseenitems ranked descending by their predicted rating, a low % position is good)
prs = computePercentageRanking(testevents, ratmatrix, uids, iids, simfun=cosinesim) #  user-based
np.nanmean(abs(prs))

In [None]:
prs = computePercentageRanking(testevents, ratmatrix, uids, iids, itemsims=itemsims) # item-based (since item sim.matrix is supplied)
np.nanmean(abs(prs))

In [None]:
# compute lift over random, where lift = (#hits using CF)/(#hits using random)
# Note: small values of topN may not yield any hits , hence below we try a range of values for topN
# Also note that for bigger values of topN the lift over random is expected to be smaller

# lift for user-based CF
for k in [5,10,50]:
    cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "uu", simfun=simfun, topN=k)
    print("\nlift=", cfhits/randhits if randhits > 0 else "-",
          "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)
    
# lift for item-based CF
for k in [5,10,50]:
    cfhits, randhits, recsmade = computeLiftOverRandom(testevents, ratmatrix, uids, iids, alg = "ii", itemsims=itemsims, topN=k)
    print("\nlift=", cfhits/randhits if randhits > 0 else "-",
          "cfhits=", cfhits, "randhits=", randhits, "totitemsrecommended=",recsmade)