In [None]:
from load_dataset_module import UserPreference
from math import sqrt
import time

class SimilarityUsers:
    
    def __init__(self,similarityMetric='euclidean',user={'BookID':1}):
        ''' Initialise the object '''
        self.similarityMetric = similarityMetric
        self.user1 = user
        self.user2 = user
        
    def setUser1(self,user):
        ''' Sets dictionary containing books(ISNB) and corresponding rating for user1 '''
        self.user1 = user
        
    def getUser1(self):
        ''' Returns dictionary containing books(ISBN) and correspondin rating for user1 '''
        return self.user1
    
    def setUser2(self,user):
        ''' Sets dictionary containing books(ISNB) and corresponding rating for user2 '''
        self.user2 = user
        
    def getUser2(self):
        ''' Returns dictionary containing books(ISBN) and correspondin rating for user2 '''
        return self.user2 
    
    def setSimilarityMetric(self,similarityMetric):
        ''' Sets similarity metric '''
        self.similarityMetric = similarityMetric
        
    def getSimilarityMetric(self):
        ''' Returns similarity metric '''
        return self.similarityMetric
    
    def checkRated(self):
        ''' Checks whether the reader gave the book a review (1-10) '''
        d1 = self.user1
        d2 = self.user2
        for users in d1.copy():
            if d1[users] == 0 or d2[users] == 0:
                del d1[users]
                del d2[users]
            else:
                continue
        self.setUser1(d1)
        self.setUser2(d2)
        
    def rankUsers(self,dictionary):
        ''' Returns a dictionary with values replaced by their rank '''
        rank = 1
        for user in dictionary:
            dictionary[user] = rank
            rank += 1
        return dictionary
        
    def commonality(self):
        ''' Removes any uncommon books from user dictionaries '''
        bookAndRatings1 = self.user1
        bookAndRatings2 = self.user2
        s1 = set(self.user1)
        s2 = set(self.user2)
        sDiff = s1 ^ s2 # Symmetric difference
        
        for key in sDiff: # Removes any non-common books 
            if key in bookAndRatings1:
                del bookAndRatings1[key]
            if key in bookAndRatings2:
                del bookAndRatings2[key]
        # Maybe don't reset the values here? instead return new lists   
        self.user1 = bookAndRatings1
        self.user2 = bookAndRatings2
        
    def sortedValues(self,user):
        ''' Items are sorted by key then put into a list containing their values in the same order '''
        return dict(sorted(user.items())).values()
                  
    def getEuclideanMetric(self): 
        ''' Returns the inverse of the sum of euclidean distance'''
        # euclidean = Sqrt of Sum of the difference in ratings squared. 
        # return 1 / (1 + (euclidean))
        # The smaller the number, the more similar
        
        self.commonality() # Removes any non-common ratings
        self.checkRated() # Checks that all books have been given a rating
        
        if len(self.user1) == 0:
            return 0 # No similarity
        else:
            sort1 = self.sortedValues(self.user1) # Sorts book IDs in alphabetical order
            sort2 = self.sortedValues(self.user2)

            subtractPower = lambda a,b : (a - b)**2 # Applies the euclidean distance formula to the values

            euclidean = sqrt(sum(map(subtractPower,sort1,sort2)))
            similarityMetric = 1 / (1+(euclidean)) # Calculates the inverse of the similarity metric
            return similarityMetric

    
    def getCosineMetric(self):
        ''' Returns cosine similarity '''
        # Ranges from -1 (Opposite) to 1 (the same) with zero suggesting no correlation
        self.commonality() # Removes any non-common ratings
        self.checkRated() # Checks that all books have been given a rating

        sort1 = self.sortedValues(self.user1)# Sorts book IDs in alphabetical order
        sort2 = self.sortedValues(self.user2)
        
        multiply = lambda a,b : a * b # Returns the numerator for our cosine calculation
        multiplications = sum(list(map(multiply,sort1,sort2)))

        square = lambda num : num ** 2 # return the denominator for our cosine calcuation
        sumSquares1 = sum(map(square,sort1))
        sumSquares2 = sum(map(square,sort2))
    
        try:
            return multiplications / (sqrt(sumSquares1)*sqrt(sumSquares2)) 
        except:
            return 0 # No correlation

        
    def getPearson(self):
        ''' Returns the Pearson Coefficient '''
        self.commonality() # Removes any non-common ratings
        self.checkRated() # Checks that all books have been given a rating

        '''THIS PART KEEPS GETTING REPEATED THUS SHOULD MAKE NEW FUNCTION'''
        sort1 = self.sortedValues(self.user1)
        sort2 = self.sortedValues(self.user2)
        
        mean = lambda x : sum(x)/len(x) # Returns the mean 
        meanUser1 = mean(sort1)
        meanUser2 = mean(sort2)
        
        subMean = lambda x,y : (x - meanUser1) * (y - meanUser2)
        numerator = sum(list(map(subMean,sort1,sort2))) # Returns numerator
        
        diffSqr1 = lambda x : (x - meanUser1)**2
        diffSqr2 = lambda y : (y - meanUser2)**2
        user1Diff = sum(list(map(diffSqr1,sort1)))
        user2Diff = sum(list(map(diffSqr2,sort2)))

        denominator = sqrt(user1Diff*user2Diff) # Returns denominator
        try:
            return numerator/denominator
        except:
            return 0 # No similarity
    
    def getSpearmanCorr(self):
        ''' Returns the Spearman Correlation Coefficient '''
        self.commonality()
        self.checkRated() # Checks that all books have been given a rating

        # Get rankings 
        rankedUser1 = dict(sorted(self.user1.items(), key = lambda x:x[1], reverse = True)) # Sorts dict by value in descending order
        rankedUser2 = dict(sorted(self.user2.items(), key = lambda x:x[1], reverse = True))
        # Replace the value with the ranking'
        rankedUser1 = self.rankUsers(rankedUser1)
        rankedUser2 = self.rankUsers(rankedUser2)
        # Order values by their key as opposed to rank
        rankedUser1 = self.sortedValues(rankedUser1)
        rankedUser2 = self.sortedValues(rankedUser2)
        # Define anon function for subtracting difference squared
        sqrDiff = lambda a,b : (a-b)**2
        # Calculate the numerator of the equation
        numerator = 6 * sum(list(map(sqrDiff,rankedUser1,rankedUser2)))
        # Calculate the denominator of the equation
        denominator = len(rankedUser1)**3 + len(rankedUser1)
        try:
            return 1-(numerator/denominator)
        except:
            return 0 # No similarity 
        
    def getManhattan(self):
        ''' Returns the inverse of Manhattan distance '''
        self.commonality() # Removes any non-common ratings
        self.checkRated() # Checks that all books have been given a rating

        if len(self.user1) == 0: # Checks if any common ratings exist
            return 0 # No similarity
        else:
            rankedUser1 = self.sortedValues(self.user1)
            rankedUser2 = self.sortedValues(self.user2)
            # Define function for calculating the difference of two ratings
            diff = lambda a,b : a-b
            manhattan = sum(list(map(diff,rankedUser1,rankedUser2)))
            return 1/(1+manhattan) # Closer to one = more similar
        
        
class SimilarityBooks:
    
    def __init__(self,bookID='1'):
        ''' Initiates similarity '''
        self.book1 = bookID
        self.book2 = bookID
            
    def setBook1(self,bookID):
        ''' Sets book1 '''
        self.book1 = bookID
    def setBook2(self,bookID):
        ''' Sets book2 '''
        self.book2 = bookID
    
    def getBook1(self):
        ''' Returns book1 dictionary '''
        return self.book1 
    def getBook2(self):
        ''' Returns book2 dictionary '''
        return self.book2
    
    def getSimilarity(self,usersPreference):
        ''' Returns Jaccard similarity metric '''
        s1 = [] # Initialise empty lists, as cannot initialise empty sets
        s2 = []
        
        for user in usersPreference:
            if self.book1 in usersPreference[user]:
                s1.append(user)
            if self.book2 in usersPreference[user]:
                s2.append(user)
                
        # Converts lists to sets for intersection and union functions
        s1 = set(s1)
        s2 = set(s2)
        
        # Calculate the intersect and uion
        intersect = len(s1.intersection(s2))
        union = len(s1.union(s2))
        
        # Jaccard = intersect/union
        try:
            return intersect/union # Closer to one = more similar
        except:
            return 0 # No similarity
        
class NSimilarItems(SimilarityBooks,SimilarityUsers):
    
    def __init__(self,bookID=None,userID=None):
        super(SimilarityUsers,self).__init__()
        super(SimilarityBooks,self).__init__()
        # Both are initialised as null to allow for Boolean operation later
        self.__bookID = bookID
        self.__userID = bookID
    
    def setUserID(self,user):
        self.__userID = user
        
    def setBookID(self,book):
        self.__bookID = book
    
    def getUserID(self):
        return self.__userID
    def getBookID(self):
        return self.__bookID
    
    def reduceDataset(self,userPreference):
        ''' Reduces dataset to users that contain comparable data '''
        # Remove any zeroes as they are not comparable
        # Similar to super().checkRated() however this only modifies one user not two
        print(len(userPreference))
        d1 = userPreference[self.__userID]
        for books in d1.copy():
            if '0' in d1[books]:
                del d1[books]
            else:
                continue
    
                   
        for user in userPreference.copy():
            b = 0 # Initialise b for books 
            for book in d1:
                if book in userPreference[user]:
                    b+=1 # Do not delete if at least one book exists 
                else:
                    continue
            if b == 0: # If no common books we delete
                del userPreference[user]
        print(len(userPreference))
        return userPreference
                        
    def setUserDict(self,users):
        ''' Convert dictionary item to the correct format {'ISBN':Rating} '''
        userDict = {}
        for user in list(users):
            for book in list(users.values()):
                userDict[user] = int(book[-1])
        return userDict
    
    def getNSimilarItems(self,similarityMetric,userPreference):
        
        if self.__userID != None:
            
            rankings = {} # Defines users dictionary with opposing similarity rating
            i = 1
            for user in userPreference:
                print('%d/%d'%(i,len(userPreference)), end="\r")
                i+=1
                super().setUser1(self.setUserDict(userPreference[self.__userID])) # This must be reset each time due to it being removed in commonality...
                super().setUser2(self.setUserDict(userPreference[user]))
                if super().getUser2 == super().getUser1(): # Don't compare to itself
                    next
                else:
                    if similarityMetric == 'Euclidean':
                        similarity = super().getEuclideanMetric()
                    elif similarityMetric == 'Cosine':
                        similarity = super().getCosineMetric()
                    elif similarityMetric == 'Spearman':
                        similarity = super().getSpearmanCorr()
                    elif similarityMetric == 'Pearson':
                        similarity = super().getPearson()
                    else:
                        similarity = super().getManhattan()
                rankings[user] = similarity
            
            self.__userID = None # Reset in case called again without being instantiated as a new object
            return sorted(rankings.items(), key = lambda x:x[1], reverse = True) # Returns list, sorted largest -> smallest
        elif self.__bookID != None:
            ''' THIS DATASET IS V. LARGE, INSTEAD ONLY TAKE ONES FROM THE USER PREFERENCE'''
            '''   up = UserPreference() # Create user pref object s.t. loadCSV can be called to load book dataset
            up.setFilename('Books.csv')
            books = list(up.loadCSV())
            rankings = {}
            super().setBook1(self.__bookID) # Assign book to compare to
            i = 1
            for book in books:
                print('%d/271379'%i, end="\r")
                i+=1
                if book == self.__bookID:
                    next # Don't compare to iteslf
                else:
                    super().setBook2(book)
                    # THIS WILL TAKE FOREVER WITH 271379 Books, lets find out...
                    similarity = super().getSimilarity(userPreference)
                    rankings[book] = similarity
                    
            self.__bookID = None
            return sorted(rankings.items(), key = lambda x:x[1], reverse = True)'''
            books = [] # Take all books that have been read and place them in a list
            i = 1
            userlength = len(userPreference)
            for user in userPreference:
                print('%d/%d'%(i,userlength), end="\r")
                i+=1
                for book in userPreference[user]:
                    if book in books:
                        continue
                    else:
                        books.append(books)
            return books
        else:
            print('Book or User not specified')
    

up = UserPreference()
dictionary = up.userPreference()
sim = SimilarityUsers()

def setUserDict(users):
    userDict = {}
    for user in list(users):
        for book in list(users.values()):
            userDict[user] = int(book[-1])
    return userDict

def testUserSim(dictionary,sim):

    start_time = time.time()

    user1 = dictionary['276772']
    user1Dict = {}
    user2 = dictionary['11676'] 
    user2Dict = {}
    user3 = dictionary['558']
    user3Dict = {}
    user4 = dictionary['183']
    user4Dict = {}
    
    #print(list(user1.values())[0][-1])
    user1Dict = setUserDict(user1)
    user2Dict = setUserDict(user2)
    user3Dict = setUserDict(user3)
    '''    for user in list(user1):
        for book in list(user1.values()):
            user1Dict[user] = int(book[-1]) # ID : rating
    
    for user in list(user2):
        for book in list(user2.values()):
            user2Dict[user] = int(book[-1]) # ID : rating    '''
            
    sim.setUser1(user1Dict)
    sim.setUser2(user2Dict)
    print(sim.getCosineMetric())

    print(time.time() - start_time) # TAKES circa 35 SECONDS!
    
#testUserSim(dictionary,sim)

def testBooks(dictionary):
    start_time = time.time()
    sim = SimilarityBooks()
    sim.setBook1('0345339711')
    sim.setBook2('059035342X')
    print(sim.getSimilarity(dictionary))
    print(time.time() - start_time)
#testBooks(dictionary)

def testNSim(dictionary):
    start_time = time.time()
    sim = NSimilarItems()
    #sim.setUserID('11676')
    #print(sim.getNSimilarItems('Euclidean',dictionary))
    sim.setBookID('3499230933')
    #sim.getNSimilarItems('Euclidean',dictionary) # Shouldn't need to specify a sim metric here
    #sim.setUserID('276925')
    #dictionary = sim.reduceDataset(dictionary) # Greatly reduces the dataset to iterate over
    print(sim.getNSimilarItems('Euclidean',dictionary))
    print(time.time()-start_time)
testNSim(dictionary)            


79234/92107

In [None]:
lists = ['"276725"','"276725"','"276725"']
[s.strip('"') for s in lists]

In [69]:
BooksAndRatings1 = {'barry':100,'abc':2,'abb':3,'cba':7,'bbc':4}
BooksAndRatings2 = {'abc':5,'abg':3,'cbaa':7,'barry':100,'bbc':7}

s1=set(BooksAndRatings1)
s2=set(BooksAndRatings2)
symmetricDiff = s1 ^ s2 # All elements that only appear in set a or only appear in set b

for key in symmetricDiff:
    if key in BooksAndRatings1:
        del BooksAndRatings1[key]
    if key in BooksAndRatings2:
        del BooksAndRatings2[key]
    
print(BooksAndRatings1)
print(BooksAndRatings2)

# Euclidean distance

def summation(dic1,dic2):
    summation = 0
    for i in dic1:
        for j in dic2:
            if i == j:
                summation+= dic1[i]+dic[j]
            else:
                continue
    return summation
    


{'barry': 100, 'abc': 2, 'bbc': 4}
{'abc': 5, 'barry': 100, 'bbc': 7}
218


In [107]:
l1 = {'A':2,'B':3,'C':5}

mean = lambda ratings : sum(ratings.values())/len(ratings)
mean(l1)

3.3333333333333335

In [135]:
sqr = lambda num : num + num
l1 = (1,2,3,4,5)
m = (map(sqr,l1))
print(m)

<map object at 0x11e2568d0>


In [129]:
# Python program to demonstrate working
# of map.
  
# Return double of n
'''def addition(n):
    return n + n'''
lambda addition : addition + addition

# We double all numbers using map()
numbers = (1, 2, 3, 4)
result = map(addition, numbers)
print(list(result))

[2, 4, 6, 8]


In [131]:
add = lambda addition : addition + addition

numbers = (1,2,3,4)

result = map(add,numbers)
print(list(result))

[2, 4, 6, 8]


In [146]:
sqr = lambda num : num **2
l1 = {'A':1,'B':2,'C':3}
m = sum(map(sqr,l1.values()))
print(m)

14


In [179]:
l1 = {'A':23,'C':43,'B':5} 
l2 = {'B':23,'A':4,'C':22}
sort1 = dict(sorted(l1.items())).values()
sort2 = dict(sorted(l2.items())).values()
multiply = lambda a,b : a * b
multiplications = sum(list(map(multiply,sort1,sort2)))
print(multiplications)
dict(sort2)

1153


TypeError: cannot convert dictionary update sequence element #0 to a sequence

In [211]:
l1 = {'A':2,'B':2}
l2 = {'A':5,'B':5}
l3 = [123,4]
l4 = [123,4]
subtractPower = lambda a,b : (a-b)**2
sqrt(sum(map(subtractPower,l3,l4)))

0.0

In [237]:
l1 = [5,3,4,2,4]
l2 = [6,3,2,3,1]

mean1 = 43
mean2 = 41

calc = lambda a,mean : a - mean
list(map())

TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [84]:
class Test1:
    
    def __init__(self,*args):
        args = 1,2,3,4
        self.numbers = args
        
    def getNumbers(self):
        return self.numbers
    
    def setNumbers(self,*args):
        print('setting numbers')
        self.numbers = args
        
    def add(self):
        return sum(self.numbers)

class Test2(Test1):
    
    def __init__(self,*args):
        args=1,2,3,4
        self.numbers = args
    
    def setNumbers(self,*args):
        self.numbers = args
        
    def getNumbers(self):
        return self.numbers

'''t = Test1()
t.setNumbers(1,2,3)
t.add()
'''
t2 = Test2()
t2.setNumbers(1,2,3)
t2.add()


6

In [94]:
dictionary = {1:{1:('Harry Potter','J.k. Rowling',2002),2:('LotR','Tolkien',1952)},2:{1:('Barry Potter','J.k. Rowling',2002),2:('LoyR','Tolkien',1952)}}
for books in dictionary:
    if 1 in dictionary[books]:
        print('oui')
        

oui
oui


In [70]:
usersPref = {'user1':{52:'Book1'},'user2':{52:'Book1',65:'Book2'},'user3':{65:'Book2',42:'Book3'},'user4':{65:'Book2'}}
book1ID = 52
book2ID = 65

s1 = []
s2 = []


for user in usersPref:
    if book1ID in usersPref[user]:
        s1.append(user)
    if book2ID in usersPref[user]:
        s2.append(user)
        
s1 = set(s1)
s2 = set(s2)

intersect = len(s1.intersection(s2))
union = len(s1.union(s2))
jaccard = intersect/union

print(jaccard)

0.25


In [21]:
d1 = {'A':1,'B':0,'C':12}
d2 = {'A':2,'B':3,'C':4}

for users in d1.copy():
    if d1[users] == 0 or d2[users] == 0:
        del d1[users]
        del d2[users]
    else:
        continue
print(d1,d2)

{'A': 1, 'C': 12} {'A': 2, 'C': 4}


In [71]:
d1 = {1:{'A':4}}
del d1[1]['A']
print(d1)

{1: {}}


In [16]:
class Test():
    def __init__(self,bookID=None):
        self.bookID = bookID
    
    def setUserID(self,user):
        self.userID = user
    
    def getUserID(self):
        return self.userID
    
    def testFunc(self):
        self.setUserID(None)
    
t = Test()
t.setUserID('123')
t.testFunc()
print(t.getUserID())

None


In [21]:
import numpy as np
import time 
a1 = list(np.random.randint(10, size=1000))
a2 = list(np.random.randint(10,size=1000))
t1 = []
t2 = []

for x in range(1000):
    start = time.time()
    # Mapping
    euclidean = lambda a,b : (a-b)**2
    sum(list(map(euclidean,a1,a2)))
    t1.append(time.time()-start)

    start = time.time()
    # List comprehension
    sum([(a-b)**2 for a,b in zip(a1,a2)])
    t2.append(time.time()-start)
print('Average time for mapped function')
print(sum(t1)/len(t1))

print('Average time for list comprehension')
print(sum(t2)/len(t2))

Average time for mapped function
0.0006652603149414063
Average time for list comprehension
0.0006150102615356445
