# CHAPTER 2 Making Recommendation

## Collaborative Filtering

### Collecting Preferences

In [1]:
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
                       'Just My Luck': 3.0, 'Superman Returns': 3.5, 
                       'You, Me and Dupree': 2.5, 'The Night Listener': 3.0},
         'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
                          'Just My Luck': 1.5, 'Superman Returns': 5.0, 
                          'The Night Listener': 3.0, 'You, Me and Dupree': 3.5},
         'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
                              'Superman Returns': 3.5, 'The Night Listener': 4.0},
         'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
                          'The Night Listener': 4.5, 'Superman Returns': 4.0,
                          'You, Me and Dupree': 2.5},
         'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                          'Just My Luck': 2.0, 'Superman Returns': 3.0, 
                          'The Night Listener': 3.0, 'You, Me and Dupree': 2.0},
         'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                           'The Night Listener': 3.0, 'Superman Returns': 5.0, 
                           'You, Me and Dupree': 3.5},
         'Toby': {'Snakes on a Plane':4.5, 'You, Me and Dupree':1.0,
                  'Superman Returns':4.0}}

In [2]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
critics['Toby']['Snakes on a Plane']

4.5

In [4]:
critics.keys()

['Jack Matthews',
 'Mick LaSalle',
 'Claudia Puig',
 'Lisa Rose',
 'Toby',
 'Gene Seymour',
 'Michael Phillips']

In [5]:
critics['Toby'].keys()

['Snakes on a Plane', 'Superman Returns', 'You, Me and Dupree']

## #Fiding Similar Users

### Euclidean distance score

In [6]:
from math import sqrt

In [7]:
pow(2, 2)

4

In [8]:
sqrt(pow(5-4, 2) + pow(4-1, 2))

3.1622776601683795

In [9]:
1/(1 + sqrt(pow(5-4, 2) + pow(4-1, 2)))

0.2402530733520421

In [10]:
# Return a distance-based similarity score for person1 and person2
def sim_distance(prefs, person1, person2):
    # Get the list of shared items
    si = {}
    for item in prefs[person1]:
        for item in prefs[person2]:
            si[item] = 1
            
    # if they have no rating in common, return 0
    if len(si) == 0:
        return 0
    
    # Add up the squares of all the differences
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item], 2)
                          for item in prefs[person1] if item in prefs[person2]])
    
    return 1/(1 + sqrt(sum_of_squares))

In [11]:
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

### Pearson Correlation Score

In [58]:
# Return the pearson correlation score for person1 and person2
def sim_pearson(prefs, person1, person2):
    # Get the list of shared items
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1
    
    # print si
    
    # if they have no rating in common, return 0
    n = len(si)
    if n == 0:
        return 0
    
    # Add up the squares of all the differences
    sum1 = sum([prefs[person1][item] for item in si.keys()])
    sum2 = sum([prefs[person2][item] for item in si.keys()])
    
    # Sum up the squares
    sum1sq = sum([pow(prefs[person1][item], 2) for item in si.keys()])
    sum2sq = sum([pow(prefs[person2][item], 2) for item in si.keys()])
    
    # Sum up the products
    psum = sum([prefs[person1][item] * prefs[person2][item] for item in si.keys()])
    
    # Calculate pearson score
    num = psum - (sum1*sum2/n)
    den = sqrt((sum1sq - pow(sum1, 2)/n) * (sum2sq - pow(sum2, 2)/n))
    
    if den == 0:
        return 0
    
    r = num/den
    return r

In [54]:
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

###Step by Step

In [19]:
si = {}
for item in critics['Lisa Rose']:
    print item

Lady in the Water
Snakes on a Plane
Just My Luck
Superman Returns
The Night Listener
You, Me and Dupree


In [20]:
si = {}
for item in critics['Gene Seymour']:
    print item

Lady in the Water
Snakes on a Plane
Just My Luck
Superman Returns
You, Me and Dupree
The Night Listener


In [59]:
si = {}
for item in critics['Gene Seymour']:
    if item in critics['Lisa Rose']:
        si[item] = 1

In [60]:
print si

{'Lady in the Water': 1, 'Snakes on a Plane': 1, 'Just My Luck': 1, 'Superman Returns': 1, 'The Night Listener': 1, 'You, Me and Dupree': 1}


In [34]:
si.keys()

['Lady in the Water',
 'Snakes on a Plane',
 'Just My Luck',
 'Superman Returns',
 'You, Me and Dupree',
 'The Night Listener']

In [30]:
for key, value in si.items():
    print key, value

Lady in the Water 1
Snakes on a Plane 1
Just My Luck 1
Superman Returns 1
You, Me and Dupree 1
The Night Listener 1


In [40]:
sum_1 = sum([critics['Lisa Rose'][key] for key in si.keys()])
sum_2 = sum([critics['Gene Seymour'][key] for key in si.keys()])

In [41]:
print sum_1, sum_2

18.0 19.5


In [42]:
sum_1_sq = sum([pow(critics['Lisa Rose'][key], 2) for key in si.keys()])
sum_2_sq = sum([pow(critics['Gene Seymour'][key], 2) for key in si.keys()])

In [43]:
print sum_1_sq, sum_2_sq

55.0 69.75


In [44]:
product_sum = sum([critics['Lisa Rose'][key]*critics['Gene Seymour'][key] for key in si.keys()])

In [45]:
print product_sum

59.5


In [46]:
num = product_sum - (sum_1*sum_2/len(si))

In [48]:
print num

1.0


In [47]:
den = sqrt((sum_1_sq - pow(sum_1, 2)/2) * (sum_2_sq - pow(sum_2, 2)/2))

In [49]:
print den

113.49063838


In [50]:
r = num/den
print r

0.00881129945404


In [36]:
from __future__ import division
from math import sqrt

def sim_pearson(prefs, p1, p2):
    si = {}
    for item in prefs[p1].keys():
#        for item in prefs[p2].keys():
            if item in prefs[p2].keys():
                si[item] = 1
                
    # Find the number of elements
    n=float(len(si))


    # if they are no ratings in common, return 0
    if n==0:
        print 'n=0'
        return 0


    # Add up all the preferences
    sum1=float(sum([prefs[p1][it] for it in si.keys()]))
    sum2=float(sum([prefs[p2][it] for it in si.keys()]))
    print 'sum1=', sum1, 'sum2=', sum2
    # Sum up the squares
    sum1Sq=float(sum([pow(prefs[p1][it],2) for it in si.keys()]))
    sum2Sq=float(sum([pow(prefs[p2][it],2) for it in si.keys()]))
    print 'sum1s=', sum1Sq, 'sum2s=', sum2Sq
    # Sum up the products
    pSum=float(sum([prefs[p1][it]*prefs[p2][it] for it in si.keys()]))


    # Calculate Pearson score
    num=(pSum/n)-(1.0*sum1*sum2/pow(n,2))
    den=sqrt(((sum1Sq/n)-float(pow(sum1,2))/float(pow(n,2)))*((sum2Sq/n)-float(pow(sum2,2))/float(pow(n,2))))
    if den==0:
        print 'den=0'
        return 0

    r=num/den

    return r

In [33]:
sim_pearson(critics, critics.keys()[1], critics.keys()[2])

sum1= 14.0 sum2= 17.5
sum1s= 42.0 sum2s= 63.75


0.5669467095138385

##Which similarity metrics should you use

### Ranking the critics

In [61]:
# Returns the best matches for people from the prefs dictionay
# Number of results and similarity function are optional paras.
def top_match(prefs, person, n = 5, similarity = sim_pearson):
    score = [(similarity(prefs, person, other), other) for other in prefs if other != person]
    print score
    
    score.sort()
    score.reverse()
    
    return score[0: n]

In [62]:
top_match(critics, 'Toby', n = 3)

[(0.66284898035987, 'Jack Matthews'), (0.9244734516419049, 'Mick LaSalle'), (0.8934051474415647, 'Claudia Puig'), (0.9912407071619299, 'Lisa Rose'), (0.38124642583151164, 'Gene Seymour'), (-1.0, 'Michael Phillips')]


[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

## Recommending Items

In [79]:
def get_recommendation(prefs, person, similarity = sim_pearson):
    total = {}
    sim_sum = {}
    for other in prefs:
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        
        if sim <= 0:
            continue
    
        for item in prefs[other]:
            if item not in prefs[person] or prefs[person][item] == 0:
                total.setdefault(item, 0)
                total[item] += prefs[other][item] * sim
                sim_sum.setdefault(item, 0)
                sim_sum[item] += sim
                
    ranking = [(total / sim_sum[item], item) for item, total in total.items()]

    ranking.sort()
    ranking.reverse()

    return ranking

In [80]:
get_recommendation(critics, 'Toby')

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

###Step by Step

In [77]:
total = {}
sim_sum = {}
for other in critics:
    if other == 'Toby':
        continue
    sim = sim_pearson(critics, other, 'Toby')
    
    if sim <= 0:
        continue
    
    for item in critics[other]:
        if item not in critics['Toby'] or critics['Toby'][item] == 0:
            total.setdefault(item, 0)
            total[item] += critics[other][item] * sim
            sim_sum.setdefault(item, 0)
            sim_sum[item] += sim
            
print total, sim_sum

{'Lady in the Water': 8.383808341404684, 'Just My Luck': 8.074754105841562, 'The Night Listener': 12.89975185847269} {'Lady in the Water': 2.9598095649952167, 'Just My Luck': 3.1903657320769114, 'The Night Listener': 3.853214712436781}


In [78]:
total = {}
sim_sum = {}
for other in critics:
    if other == 'Toby':
        continue
    sim = sim_pearson(critics, other, 'Toby')
    
    if sim <= 0:
        continue
    
    for item in critics[other]:
        if item not in critics['Toby'] or critics['Toby'][item] == 0:
            total.setdefault(item, 0)
            total[item] += critics[other][item] * sim
            sim_sum.setdefault(item, 0)
            sim_sum[item] += sim
ranking = [(total / sim_sum[item], item) for item, total in total.items()]

ranking.sort()
ranking.reverse()

print ranking

[(3.3477895267131013, 'The Night Listener'), (2.8325499182641614, 'Lady in the Water'), (2.5309807037655645, 'Just My Luck')]


## Matching Products

## Building a del.icio.us Link Recommender

### The del.icio.us API

In [1]:
import pydelicious

In [3]:
pydelicious.get_popular(tag = 'programming')

[{'description': u'something went wrong',
  'dt': '',
  'extended': '',
  'tags': '',
  'url': '',
  'user': ''}]

###Building the dataset

In [4]:
from pydelicious import get_popular, get_urlposts, get_urlposts

In [5]:
def initialize_user_dict(tag, count = 5):
    user_dict = {}
    for p1 in get_popular(tag = tag)[0: count]:
        for p2 in get_urlposts(p1['href']):
            user = p2['user']
            user_dict[user] = {}
    return user_dict