## Movie Recommendations

In [12]:
# Sample data set: maps each critic's name to a {movie title: rating} dict.
# Not every critic has rated every movie.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}


from math import sqrt

# Euclidean-distance-based similarity between two people
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity for two people.

    prefs maps each person to a {item: rating} dict. Only items rated by
    both people contribute. The result is 1 / (1 + d), where d is the
    Euclidean distance between the two rating vectors over the shared
    items, so identical ratings give 1.0. Returns 0 when the two people
    have no item in common.
    """
    ratings_a = prefs[person1]
    ratings_b = prefs[person2]

    # Shared items, kept in ratings_a's iteration order.
    shared = [title for title in ratings_a if title in ratings_b]
    if not shared:
        return 0

    squared_gap = sum((ratings_a[title] - ratings_b[title]) ** 2 for title in shared)
    return 1 / (1.0 + sqrt(squared_gap))

    
# Example: distance-based similarity between two critics.
print(sim_distance(critics, "Gene Seymour", "Toby"))
    

0.258245699761


In [13]:
# Pearson correlation score
#https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

def sim_pearson_distance(prefs, person1, person2):
    """Return the Pearson correlation coefficient of two people's ratings.

    Despite the name, this is a similarity (higher means more alike), not
    a distance.

    Parameters:
        prefs: dict mapping each person to a {item: rating} dict.
        person1, person2: keys of prefs to compare.

    Returns:
        The correlation computed over the items both people rated. Returns
        0 when they share no items, or when the correlation is undefined
        because one side's shared ratings have no variance.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    # Use a float count so the divisions below are true divisions even when
    # the ratings are integers (Python 2's "/" floors int / int).
    n = float(len(shared))

    sum1 = sum([ratings1[item] for item in shared])
    sum2 = sum([ratings2[item] for item in shared])

    sq_sum1 = sum([pow(ratings1[item], 2) for item in shared])
    sq_sum2 = sum([pow(ratings2[item], 2) for item in shared])

    product_sum = sum([ratings1[item] * ratings2[item] for item in shared])

    num = product_sum - sum1 * sum2 / n
    radicand = (sq_sum1 - pow(sum1, 2) / n) * (sq_sum2 - pow(sum2, 2) / n)

    # Zero means one side has no variance. A slightly negative value can only
    # come from floating-point rounding and would make sqrt() raise.
    if radicand <= 0:
        return 0

    return num / sqrt(radicand)
  
    
# Example: Pearson correlation between two critics.
print(sim_pearson_distance(critics, "Gene Seymour", "Lisa Rose"))

0.396059017191


In [14]:
def top_matches(prefs, person, n=5, similarity=sim_pearson_distance):
    """Return the entries of prefs that are most similar to person.

    Every other key of prefs is scored with similarity(prefs, person, other).
    The result is a list of (score, other) tuples ordered from highest to
    lowest, cut to the first n entries.
    """
    ranked = sorted(
        (similarity(prefs, person, candidate), candidate)
        for candidate in prefs
        if candidate != person
    )
    # Ascending sort followed by a reversal puts the best match first.
    return ranked[::-1][0:n]

# Example: the three critics whose taste is closest to Toby's.
print(top_matches(critics, "Toby", n=3))


[(0.9912407071619299, 'Lisa Rose'), (0.9244734516419049, 'Mick LaSalle'), (0.8934051474415647, 'Claudia Puig')]


In [15]:
def get_recommendations(prefs, person, similarity=sim_pearson_distance):
    """Recommend items for person using everyone else's ratings.

    Each other person is weighted by similarity(prefs, person, other), and
    people whose similarity is <= 0 are ignored. For every item that person
    has not rated (or has rated 0), the predicted score is the
    similarity-weighted average of the other people's ratings.

    Returns a list of (predicted_score, item) tuples, highest score first.
    """
    weighted_totals = {}
    weight_sums = {}

    for other, other_ratings in prefs.items():
        if other == person:
            continue

        weight = similarity(prefs, person, other)
        if weight <= 0:  # quite different preference
            continue

        for item, rating in other_ratings.items():
            # Only score items the person has not rated yet (missing or 0).
            if prefs[person].get(item, 0) == 0:
                weighted_totals[item] = weighted_totals.get(item, 0) + rating * weight
                weight_sums[item] = weight_sums.get(item, 0) + weight

    # Normalise by the summed weights so items rated by many people do not
    # win just because more terms were added.
    ranked = sorted(
        (total / weight_sums[item], item)
        for item, total in weighted_totals.items()
    )
    return ranked[::-1]

# Example: movies Toby has not rated, ranked by predicted rating.
print(get_recommendations(critics, "Toby"))

                

[(3.3477895267131013, 'The Night Listener'), (2.8325499182641614, 'Lady in the Water'), (2.5309807037655645, 'Just My Luck')]


## Matching Products

In [16]:
def transform_prefs(prefs):
    """Invert a {person: {item: rating}} dict into {item: {person: rating}}.

    Ratings that are None are treated as missing and left out. An item
    whose ratings are all None does not appear in the result.
    """
    result = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Identity check: only a real None counts as "no rating".
            # "== None" would defer to the rating object's own __eq__.
            if rating is None:
                continue

            result.setdefault(item, {})[person] = rating

    return result

# Flip the data to movie -> {critic: rating}, so "recommending" for a movie
# yields critics who have not rated it yet.
movies = transform_prefs(critics)

print(get_recommendations(movies, "Just My Luck"))


[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]


In [17]:
def calculate_similar_items(prefs, n=10):
    """Build a table of the items most similar to each item.

    prefs is a {person: {item: rating}} dict. It is inverted with
    transform_prefs, then each item is compared against the others with
    top_matches using the Pearson score. Returns a dict mapping each item to
    its list of up to n (similarity, other_item) tuples, best first.

    Prints a "done / total" progress line after every 100 items.
    """
    item_prefs = transform_prefs(prefs)
    item_count = len(item_prefs)

    similar = {}
    for position, item in enumerate(item_prefs, 1):
        if position % 100 == 0:
            print("%d / %d" % (position, item_count))

        similar[item] = top_matches(item_prefs, item, n=n, similarity=sim_pearson_distance)

    return similar

# Example: item-to-item similarity table for the sample data.
print(calculate_similar_items(critics))

{'Lady in the Water': [(0.7637626158259785, 'Snakes on a Plane'), (0.4879500364742689, 'Superman Returns'), (0.3333333333333333, 'You, Me and Dupree'), (-0.6123724356957927, 'The Night Listener'), (-0.9449111825230676, 'Just My Luck')], 'Snakes on a Plane': [(0.7637626158259785, 'Lady in the Water'), (0.11180339887498941, 'Superman Returns'), (-0.3333333333333333, 'Just My Luck'), (-0.5663521139548527, 'The Night Listener'), (-0.6454972243679047, 'You, Me and Dupree')], 'Just My Luck': [(0.5555555555555556, 'The Night Listener'), (-0.3333333333333333, 'Snakes on a Plane'), (-0.42289003161103106, 'Superman Returns'), (-0.4856618642571827, 'You, Me and Dupree'), (-0.9449111825230676, 'Lady in the Water')], 'Superman Returns': [(0.6579516949597695, 'You, Me and Dupree'), (0.4879500364742689, 'Lady in the Water'), (0.11180339887498941, 'Snakes on a Plane'), (-0.1798471947990544, 'The Night Listener'), (-0.42289003161103106, 'Just My Luck')], 'You, Me and Dupree': [(0.6579516949597695, 'Sup

In [21]:
def get_recommended_items(prefs, item_match, user):
    """Recommend items for user from a precomputed item-similarity table.

    Parameters:
        prefs: dict mapping each user to a {item: rating} dict.
        item_match: dict mapping each item to a list of
            (similarity, other_item) tuples, e.g. the output of
            calculate_similar_items.
        user: key of prefs to recommend for.

    Returns:
        A list of (predicted_rating, item) tuples, highest first, for items
        the user has not rated. The prediction is the similarity-weighted
        average of the user's ratings of the related items. Items whose
        similarities sum to zero are omitted, because that average is
        undefined.
    """
    user_ratings = prefs[user]
    scores = {}
    total_sim = {}

    for (item, rating) in user_ratings.items():
        for (similarity, item2) in item_match[item]:
            # Never recommend something the user has already rated.
            if item2 in user_ratings:
                continue

            scores[item2] = scores.get(item2, 0) + similarity * rating
            total_sim[item2] = total_sim.get(item2, 0) + similarity

    # NOTE(review): negative similarities are included in the weights, so a
    # normalised score can fall outside the range of the input ratings.
    rankings = [
        (score / total_sim[item], item)
        for item, score in scores.items()
        if total_sim[item] != 0  # positive and negative weights cancelled out
    ]

    rankings.sort(reverse=True)
    return rankings

# Example: item-based recommendations for Toby on the sample data.
print(get_recommended_items(critics, calculate_similar_items(critics), "Toby"))

    

[(3.610031066802182, 'Lady in the Water'), (3.531395034185976, 'The Night Listener'), (2.9609998607242685, 'Just My Luck')]


In [23]:
def load_movie_lens(path):
    """Load a MovieLens-style data set from the directory path.

    Reads two "::"-separated files inside path:
        movies.dat  - the first two fields are movie id and title
        ratings.dat - the first three fields are user id, movie id and rating

    Returns:
        (movies, prefs), where movies maps movie id -> title and prefs maps
        user id -> {title: rating as float}. Ids are kept as strings.

    Raises:
        KeyError if ratings.dat refers to a movie id missing from movies.dat.
    """
    import os  # local import so this function does not depend on other cells

    movies = {}
    # os.path.join instead of a hard-coded backslash, so this works on any OS.
    # "with" closes each file even if a line fails to parse.
    with open(os.path.join(path, "movies.dat")) as movie_file:
        for line in movie_file:
            (movie_id, title) = line.split("::")[0:2]
            movies[movie_id] = title

    prefs = {}
    with open(os.path.join(path, "ratings.dat")) as rating_file:
        for line in rating_file:
            (user_id, movie_id, rating) = line.split("::")[0:3]
            prefs.setdefault(user_id, {})[movies[movie_id]] = float(rating)

    return movies, prefs
        
# Location of the MovieLens 1M files on the author's machine.
path = r"C:\Localdata\data\CollectiveIntelligence\ml-1m\\"

# Note: this rebinds the name `movies` used by an earlier cell.
movies, prefs = load_movie_lens(path)

# Build the item-similarity table for the whole data set (slow), then
# print item-based recommendations for user "20".
print("start analyzing...")
print(get_recommended_items(prefs, calculate_similar_items(prefs), "20"))
print("finished")

start analyzing...
100 / 3706
200 / 3706
300 / 3706
400 / 3706
500 / 3706
600 / 3706
700 / 3706
800 / 3706
900 / 3706
1000 / 3706
1100 / 3706
1200 / 3706
1300 / 3706
1400 / 3706
1500 / 3706
1600 / 3706
1700 / 3706
1800 / 3706
1900 / 3706
2000 / 3706
2100 / 3706
2200 / 3706
2300 / 3706
2400 / 3706
2500 / 3706
2600 / 3706
2700 / 3706
2800 / 3706
2900 / 3706
3000 / 3706
3100 / 3706
3200 / 3706
3300 / 3706
3400 / 3706
3500 / 3706
3600 / 3706
3700 / 3706
[(5.0, 'Window to Paris (1994)'), (5.0, 'Ugly, The (1997)'), (5.0, 'Trans (1998)'), (5.0, 'Train of Life (Train De Vie) (1998)'), (5.0, 'Time of the Gypsies (Dom za vesanje) (1989)'), (5.0, 'Theodore Rex (1995)'), (5.0, 'Taffin (1988)'), (5.0, 'Synthetic Pleasures (1995)'), (5.0, 'Stonewall (1995)'), (5.0, 'Seven Chances (1925)'), (5.0, 'Second Best (1994)'), (5.0, 'Roadside Prophets (1992)'), (5.0, "Pot O' Gold (1941)"), (5.0, 'Pandora and the Flying Dutchman (1951)'), (5.0, 'N\xe9nette et Boni (1996)'), (5.0, 'Nosferatu a Venezia (1986)')