In [1]:
from scipy.stats import beta
import operator
import numpy as np
from scipy.special import betaln

In [2]:
def load_1_m(path='../ml-1m/ratings.dat', min_ratings=10):
    """Load a '::'-separated ratings file and binarise ratings per user.

    Each usable line has at least four '::'-separated fields; the first three
    are parsed as integer user id, item id and rating. Lines with fewer than
    four fields (e.g. a trailing blank line) are skipped.

    For every user, each rating is replaced by 1 when it is greater than or
    equal to that user's mean rating, and by 0 otherwise. Users with fewer
    than ``min_ratings`` rated items are then dropped.

    Args:
        path: Location of the ratings file.
        min_ratings: Minimum number of rated items a user must have to be kept.

    Returns:
        dict mapping user id -> {item id -> 0 or 1}.
    """
    user_item_map = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(path, 'r') as ratings_file:
        for line in ratings_file:
            attrs = line.rstrip('\n').split('::')
            if len(attrs) < 4:
                continue
            user = int(attrs[0])
            item = int(attrs[1])
            rating = int(attrs[2])
            user_item_map.setdefault(user, {})[item] = rating

    for item_ratings in user_item_map.values():
        avg_rating_user = sum(item_ratings.values()) * 1.0 / len(item_ratings)
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for item in item_ratings:
            if item_ratings[item] >= avg_rating_user:
                item_ratings[item] = 1
            else:
                item_ratings[item] = 0

    # Collect the users to drop first: deleting from a dict while iterating
    # over it raises "dictionary changed size during iteration".
    sparse_users = [user for user in user_item_map
                    if len(user_item_map[user]) < min_ratings]
    for user in sparse_users:
        del user_item_map[user]
    return user_item_map

In [3]:
def form_graph(user_item_map):
    """Build an undirected bipartite adjacency map from user -> items data.

    User nodes are named 'u<id>' and item nodes 'i<id>'. Every user becomes a
    node (even with no items), and each (user, item) pair adds an edge in both
    directions.

    Returns:
        dict mapping node name -> set of neighbouring node names.
    """
    adjacency = {}
    for user_id, rated in user_item_map.items():
        user_node = 'u' + str(user_id)
        user_neighbours = adjacency.setdefault(user_node, set())
        for item_id in rated:
            item_node = 'i' + str(item_id)
            adjacency.setdefault(item_node, set()).add(user_node)
            user_neighbours.add(item_node)
    return adjacency

In [4]:
def clean_graph(graph):
    """Iteratively prune low-degree nodes from an adjacency map, in place.

    Each pass deletes every node with fewer than 10 neighbours and then strips
    references to nodes that no longer exist. Passes repeat until one makes no
    change. Finally every neighbour collection is converted to a list.

    Returns:
        The same ``graph`` object, mutated.
    """
    keep_pruning = True
    while keep_pruning:
        # Snapshot the sparse nodes first so the dict is not resized mid-iteration.
        sparse = [name for name, neighbours in graph.items() if len(neighbours) < 10]
        for name in sparse:
            del graph[name]

        stripped_any = False
        for neighbours in graph.values():
            orphans = [other for other in neighbours if other not in graph]
            for other in orphans:
                neighbours.remove(other)
            if orphans:
                stripped_any = True

        keep_pruning = bool(sparse) or stripped_any

    for name in graph:
        graph[name] = list(graph[name])
    return graph

In [5]:
def get_num_ratings(user_item_map):
    """Tally binarised ratings per item.

    Every item starts at [1, 1]. Index 0 is incremented for each non-zero
    rating and index 1 for each rating equal to 0.

    Returns:
        dict mapping item id -> [non-zero count + 1, zero count + 1].
    """
    tallies = {}
    for rated in user_item_map.values():
        for item_id, value in rated.items():
            counts = tallies.setdefault(item_id, [1, 1])
            slot = 1 if value == 0 else 0
            counts[slot] += 1
    return tallies

In [6]:
# user id -> {item id -> 0/1}: ratings read from '../ml-1m/ratings.dat',
# binarised against each user's own mean rating.
user_item_map = load_1_m()

In [7]:
# item id -> [count of 1-ratings + 1, count of 0-ratings + 1]; read as a
# global by PIS, PPS and PORS.
item_rating_map = get_num_ratings(user_item_map)

In [8]:
# Bipartite adjacency map with 'u<id>' user nodes and 'i<id>' item nodes.
graph = form_graph(user_item_map)

In [9]:
# Repeatedly drop nodes with fewer than 10 neighbours; neighbour sets become
# lists. NOTE: this rebinds `graph`, so re-running this cell operates on the
# already-cleaned graph.
graph = clean_graph(graph)

In [10]:
# Module-level caches filled by rank(): keys are (primary_item, secondary_item)
# tuples, values are the corresponding PIS / PPS / PORS scores.
PIS_map = {}
PPS_map = {}
PORS_map = {}

In [11]:
def PIS(item_pair):
    """Score a pair of item nodes from their rating tallies via a finite series.

    ``item_pair`` holds two node names of the form 'i<id>'. The numeric ids are
    looked up in the global ``item_rating_map``, whose entries are two-element
    count lists. The result is the sum, for k in [0, first item's count[0]),
    of exp(log-term), with the log-term built from logs of the counts and
    ``betaln``.

    NOTE(review): the series looks like a closed form for comparing two
    Gamma-distributed rates -- TODO confirm the intended derivation.
    """
    first_id = int(item_pair[0][1:])
    second_id = int(item_pair[1][1:])
    upper = item_rating_map[first_id][0]
    b1 = item_rating_map[first_id][1]
    a2 = item_rating_map[second_id][0]
    b2 = item_rating_map[second_id][1]

    # Quantities that do not depend on the summation index k.
    log_b1 = np.log(b1)
    a2_log_b2 = a2 * np.log(b2)
    log_b_sum = np.log(b1 + b2)

    def series_term(k):
        return np.exp(k * log_b1 +
                      a2_log_b2 -
                      (k + a2) * log_b_sum -
                      np.log(k + a2) -
                      betaln(k + 1, a2))

    return sum(series_term(k) for k in xrange(upper))

In [12]:
def PPS(item_pair):
    """Return the product of the two items' count[0] / (count[0] + count[1]).

    ``item_pair`` holds two node names of the form 'i<id>'; the counts come
    from the global ``item_rating_map``.
    """
    def share(node_name):
        counts = item_rating_map[int(node_name[1:])]
        return float(counts[0]) / (counts[0] + counts[1])

    first_share = share(item_pair[0])
    second_share = share(item_pair[1])
    return first_share * second_share

In [13]:
def PORS(item_pair):
    """Return the second item's count[0]/count[1] ratio divided by the first's.

    ``item_pair`` holds two node names of the form 'i<id>'; the counts come
    from the global ``item_rating_map``.
    """
    def ratio(node_name):
        counts = item_rating_map[int(node_name[1:])]
        return float(counts[0]) / counts[1]

    first_ratio = ratio(item_pair[0])
    second_ratio = ratio(item_pair[1])
    return second_ratio / first_ratio

In [14]:
def _memoised_score(cache, scorer, item_pair):
    """Return scorer(item_pair), computing it at most once per pair via cache."""
    if item_pair not in cache:
        cache[item_pair] = scorer(item_pair)
    return cache[item_pair]


def rank(graph, target_user):
    """Score each of the target user's items against items reachable via co-raters.

    For every ``primary_item`` adjacent to ``target_user``, walks
    primary_item -> other user -> ``secondary_item`` and, for each secondary
    item the target user is not adjacent to, adds the PIS, PPS and PORS score
    of (primary_item, secondary_item) to that primary item's totals. A
    secondary item reached through several users is counted once per path.

    Pair scores are memoised in the global ``PIS_map``, ``PPS_map`` and
    ``PORS_map`` dicts, which this function mutates.

    Args:
        graph: adjacency map, node name -> iterable of neighbour node names.
        target_user: node name of the user to rank for (e.g. 'u1').

    Returns:
        Tuple ``(score_map_PIS, score_map_PPS, score_map_PORS)``, each a dict
        mapping primary item node name -> accumulated score.

    Raises:
        KeyError: if ``target_user`` is not a node of ``graph``.
    """
    target_items = graph[target_user]
    # clean_graph leaves neighbour collections as lists; a set makes the
    # innermost "already rated?" check constant time instead of a linear scan.
    already_rated = set(target_items)

    score_map_PPS = {}
    score_map_PORS = {}
    score_map_PIS = {}
    for primary_item in target_items:
        pis_total = 0.0
        pps_total = 0.0
        pors_total = 0.0
        for secondary_user in graph[primary_item]:
            if secondary_user == target_user:
                continue
            for secondary_item in graph[secondary_user]:
                if secondary_item in already_rated:
                    continue
                pair = (primary_item, secondary_item)
                pis_total += _memoised_score(PIS_map, PIS, pair)
                pps_total += _memoised_score(PPS_map, PPS, pair)
                pors_total += _memoised_score(PORS_map, PORS, pair)
        score_map_PIS[primary_item] = pis_total
        score_map_PPS[primary_item] = pps_total
        score_map_PORS[primary_item] = pors_total
    return score_map_PIS, score_map_PPS, score_map_PORS

In [15]:
# Score the items adjacent to user node 'u1'; returns (PIS, PPS, PORS) score maps.
ranking = rank(graph, 'u1')

In [16]:
# Unpack the tuple returned by rank(): index 0 = PIS, 1 = PPS, 2 = PORS.
ranking_PIS = ranking[0]
ranking_PPS = ranking[1]
ranking_PORS = ranking[2]

In [20]:
# Order each score map's (item, score) pairs ascending by score (tuple index 1):
# sorted_1 = PIS, sorted_2 = PPS, sorted_3 = PORS.
sorted_1 = sorted(ranking_PIS.items(), key=operator.itemgetter(1))
sorted_2 = sorted(ranking_PPS.items(), key=operator.itemgetter(1))
sorted_3 = sorted(ranking_PORS.items(), key=operator.itemgetter(1))

# Because the sort is ascending, these are the five LOWEST-scoring items
# under each measure.
print sorted_1[:5]
print sorted_2[:5]
print sorted_3[:5]

[('i1545', 22380.754472319401), ('i938', 26437.537668806315), ('i1836', 29879.566955937455), ('i48', 33801.68039857551), ('i783', 44708.832377621235)]
[('i1545', 10568.064315519812), ('i1836', 17773.79575468348), ('i938', 19792.267945228297), ('i48', 23890.925660660796), ('i2340', 28800.92743752021)]
[('i1545', 16468.67506574581), ('i745', 31586.12838257131), ('i720', 39623.37200612985), ('i1207', 83710.71764228817), ('i1836', 95238.31769131651)]


In [21]:
# Dumps the full item -> PIS score dict (unsorted); the output is large.
print ranking_PIS

{'i595': 256175.53052375047, 'i594': 182497.29664155206, 'i2687': 84601.906019442366, 'i3105': 164996.78701696778, 'i914': 194693.36275612802, 'i48': 33801.68039857551, 'i2355': 306586.88065263059, 'i919': 442822.00917179394, 'i1035': 208684.44546359841, 'i2762': 569327.53994494327, 'i938': 26437.537668806315, 'i2028': 590918.30898730026, 'i661': 69423.894443574114, 'i1193': 422844.34132531757, 'i1097': 430706.74542712723, 'i1270': 490834.29180657631, 'i1197': 559202.32816141541, 'i1721': 205275.77721870903, 'i150': 356519.00173197739, 'i720': 136031.13785423755, 'i2294': 97891.25365521753, 'i608': 578742.32083193038, 'i3186': 55415.7556456315, 'i527': 539914.66617395694, 'i1961': 329627.56738324871, 'i2398': 118793.49328272487, 'i745': 179168.27398347176, 'i2791': 392142.05988478119, 'i2797': 363786.99977048568, 'i1287': 207117.81267752033, 'i1': 486932.15916609764, 'i588': 286324.85858594102, 'i2692': 285015.09224309091, 'i2340': 48832.842814360607, 'i1962': 184628.29280518269, 'i783