In [122]:
from scipy.stats import beta
import operator
import numpy as np
from scipy.special import betaln

In [123]:
dataset = 1
if dataset==0:
    # MovieLens-1M dataset
    ratings_file = '../data/ml-1m/ratings.dat'
    delimiter = '::'
elif dataset==1:
    # MovieLens-100k dataset
    ratings_file = '../data/ml-100k/u.data'
    delimiter = '\t'

In [124]:
def load_1_m():
    ratings = open(ratings_file, 'r').read().split('\n')
    user_item_map = {}
    for r in ratings:
        attrs = r.split(delimiter)
        if len(attrs) < 4:
            continue
        user = int(attrs[0])
        item = int(attrs[1])
        rating = int(attrs[2])
        if user in user_item_map:
            user_item_map[user][item] = rating
        else:
            user_item_map[user] = {}
            user_item_map[user][item] = rating
    for user in user_item_map:
        sum = 0
        for item in user_item_map[user]:
            sum += user_item_map[user][item]
        avg_rating_user = sum * 1.0 / len(user_item_map[user])
        for item in user_item_map[user]:
            if user_item_map[user][item] >= avg_rating_user:
                user_item_map[user][item] = 1
            else:
                user_item_map[user][item] = 0
    for user in user_item_map:
        if len(user_item_map[user]) < 10:
            del user_item_map[user]
    return user_item_map

In [125]:
def form_graph(user_item_map):
    graph = {}
    for user in user_item_map:
        if 'u' + str(user) not in graph:
            graph['u' + str(user)] = set([])
        for item in user_item_map[user]:
            if 'i' + str(item) not in graph:
                graph['i' + str(item)] = set([])
            graph['u' + str(user)].add('i' + str(item))
            graph['i' + str(item)].add('u' + str(user))
    return graph

In [126]:
def clean_graph(graph):
    while True:
        changed = False
        delete_nodes = []
        for node in graph:
            if len(graph[node]) < 10:
                changed = True
                delete_nodes.append(node)
        for node in delete_nodes:
            del graph[node]
        for node1 in graph:
            delete_nodes = []
            for node2 in graph[node1]:
                if node2 not in graph:
                    changed = True
                    delete_nodes.append(node2)
            for node2 in delete_nodes:
                graph[node1].remove(node2)
        if not changed:
            break
    for node in graph:
        graph[node] = list(graph[node])
    return graph

In [127]:
def get_num_ratings(user_item_map):
    item_rating_map = {}
    for user in user_item_map:
        for item in user_item_map[user]:
            if item not in item_rating_map:
                item_rating_map[item] = [1, 1]
            if user_item_map[user][item] == 0:
                item_rating_map[item][1] += 1
            else:
                item_rating_map[item][0] += 1
    return item_rating_map

In [128]:
user_item_map = load_1_m()

In [129]:
item_rating_map = get_num_ratings(user_item_map)

In [130]:
graph = form_graph(user_item_map)

In [131]:
graph = clean_graph(graph)

In [132]:
len(graph.keys())

2095

## Split MovieLens-100k data so that for each user, 80% is training, 20% is for test


In [135]:
first_20percent_of_u1 = [graph['u1'][i] for i in range(int(0.2*len(graph['u1'])))] 
print first_20percent_of_u1

['i148', 'i149', 'i142', 'i143', 'i140', 'i141', 'i146', 'i147', 'i144', 'i145', 'i177', 'i233', 'i229', 'i228', 'i270', 'i235', 'i88', 'i89', 'i205', 'i82', 'i83', 'i80', 'i81', 'i86', 'i87', 'i84', 'i85', 'i258', 'i259', 'i77', 'i76', 'i73', 'i72', 'i71', 'i70', 'i220', 'i79', 'i78', 'i252', 'i253', 'i212', 'i207', 'i218', 'i260', 'i266', 'i265', 'i264', 'i269', 'i268', 'i249', 'i64', 'i65']


In [136]:
len(graph['u1'])

263

In [137]:
graph['u1'][3]

'i143'

In [163]:
# Input: a graph called g
# Output: two test sets, one containing 20% of g's items per user, 
#         the other containing 80% 

from collections import defaultdict

def create_sets(g): 
    test_set = defaultdict(list) 
    training_set = defaultdict(list)
    
    for key in g:
        if key[0]=='u': 
            # For length of items belonging to that key, split 20% and 80%
            first_20 = int(0.2*len(g[key]))
            
            for i in range(first_20): 
                test_set[key].append(g[key][i])

            for j in range(first_20, len(g[key])):
                training_set[key].append(g[key][i])
            
    for user in test_set.keys():
        for item in test_set[user]:
            test_set[item].append(user)
            
    for user in training_set.keys(): 
        for item in training_set: 
            training_set[item].append(user)
            
    return test_set, training_set

In [164]:
movie_100k_test_set, movie_100k_training_set = create_sets(graph)

In [165]:
print first_20percent_of_u1==movie_100k_test_set['u1']

True


In [170]:
print "movie_100k_test_set has ", len(movie_100k_test_set.keys()), "keys"
print "movie_100k_training_set has ", len(movie_100k_training_set.keys()), " keys"

movie_100k_test_set has  1805 keys
movie_100k_training_set has  943  keys


In [11]:
PIS_map = {}
PPS_map = {}
PORS_map = {}

In [12]:
def PIS(item_pair):
    item1 = int(item_pair[0][1:])
    item2 = int(item_pair[1][1:])
    total = 0
    for i in range(0,item_rating_map[item2][0]-1):
        total += np.exp(betaln(item_rating_map[item1][0]+i,item_rating_map[item1][1]+item_rating_map[item2][1]) -\
                        np.log(item_rating_map[item2][1]+i) - \
                        betaln(1+i, item_rating_map[item2][1]) -\
                        betaln(item_rating_map[item1][0],item_rating_map[item1][1])
                       )
    return total

In [13]:
def PPS(item_pair):
    item1 = int(item_pair[0][1:])
    item2 = int(item_pair[1][1:])
    p1 = (item_rating_map[item1][0]) * 1.0 / (item_rating_map[item1][0] + item_rating_map[item1][1])
    p2 = (item_rating_map[item2][0]) * 1.0 / (item_rating_map[item2][0] + item_rating_map[item2][1])
    return p1 * p2

In [14]:
def PORS(item_pair):
    item1 = int(item_pair[0][1:])
    item2 = int(item_pair[1][1:])
    o1 = (item_rating_map[item1][0]) * 1.0 / (item_rating_map[item1][1])
    o2 = (item_rating_map[item2][0]) * 1.0 / (item_rating_map[item2][1])
    return o2 / o1

In [15]:
def rank(graph, target_user):
    score_map_PPS = {}
    score_map_PORS = {}
    score_map_PIS = {}
    for primary_item in graph[target_user]:
        score_map_PPS[primary_item] = 0.0
        score_map_PORS[primary_item] = 0.0
        score_map_PIS[primary_item] = 0.0
        for secondary_user in graph[primary_item]:
            if secondary_user == target_user:
                continue
            for secondary_item in graph[secondary_user]:
                if secondary_item in graph[target_user]:
                    continue
                if (primary_item, secondary_item) in PIS_map:
                    score_map_PIS[primary_item] += PIS_map[(primary_item, secondary_item)]
                else:
                    PIS_map[(primary_item, secondary_item)] = PIS((primary_item, secondary_item))
                    score_map_PIS[primary_item] += PIS_map[(primary_item, secondary_item)]
                if (primary_item, secondary_item) in PPS_map:
                    score_map_PPS[primary_item] += PPS_map[(primary_item, secondary_item)]
                else:
                    PPS_map[(primary_item, secondary_item)] = PPS((primary_item, secondary_item))
                    score_map_PPS[primary_item] += PPS_map[(primary_item, secondary_item)]
                if (primary_item, secondary_item) in PORS_map:
                    score_map_PORS[primary_item] += PORS_map[(primary_item, secondary_item)]
                else:
                    PORS_map[(primary_item, secondary_item)] = PORS((primary_item, secondary_item))
                    score_map_PORS[primary_item] += PORS_map[(primary_item, secondary_item)]
    return score_map_PIS, score_map_PPS, score_map_PORS

In [16]:
ranking = rank(graph, 'u1')

In [17]:
ranking_PIS = ranking[0]
ranking_PPS = ranking[1]
ranking_PORS = ranking[2]

In [18]:
sorted_1 = sorted(ranking_PIS.items(), key=operator.itemgetter(1))
sorted_2 = sorted(ranking_PPS.items(), key=operator.itemgetter(1))
sorted_3 = sorted(ranking_PORS.items(), key=operator.itemgetter(1))

print sorted_1[:5]
print sorted_2[:5]
print sorted_3[:5]

[('i745', 206.4680107331514), ('i720', 2106.8424939979082), ('i1207', 3885.7999156029359), ('i527', 3887.8614210312967), ('i1545', 4612.421581657597)]
[('i1545', 10568.064315519812), ('i1836', 17773.79575468348), ('i938', 19792.267945228297), ('i48', 23890.925660660796), ('i2340', 28800.92743752021)]
[('i1545', 16468.67506574581), ('i745', 31586.12838257131), ('i720', 39623.37200612985), ('i1207', 83710.71764228817), ('i1836', 95238.31769131651)]


In [19]:
print ranking_PIS

{'i595': 87717.470884331851, 'i594': 89390.897256695491, 'i2687': 61884.944074727624, 'i3105': 92404.876355702916, 'i914': 25491.453800161973, 'i48': 108245.60713107722, 'i2355': 117093.59538529013, 'i919': 44720.311925311216, 'i1035': 78226.211239892524, 'i2762': 8192.5401107103753, 'i938': 50542.262611704675, 'i2028': 19826.964942116028, 'i661': 109242.23052503054, 'i1193': 8326.7635344314676, 'i1097': 161547.63750853803, 'i1270': 149694.89901657874, 'i1197': 24804.944604878856, 'i1721': 212917.49076007781, 'i150': 33339.406782028149, 'i720': 2106.8424939979082, 'i2294': 139326.6863857549, 'i608': 40541.11915146446, 'i3186': 86098.568771103673, 'i527': 3887.8614210312967, 'i1961': 69898.666898030628, 'i2398': 43528.094461544948, 'i745': 206.4680107331514, 'i2791': 95525.341970821741, 'i2797': 126976.39641321448, 'i1287': 38483.278731781298, 'i1': 53800.761370730514, 'i588': 126260.07467461516, 'i2692': 17754.375151528529, 'i2340': 80547.991885797717, 'i1962': 73868.765279109473, 'i78