# Computer Systems 2016/17

### Practice 3 - MAP

In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
from urllib.request import urlretrieve
import zipfile

In [3]:
urlretrieve ("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "ml-100k.zip")

('ml-100k.zip', <http.client.HTTPMessage at 0x7ffb6062aac8>)

In [4]:
dataFile = zipfile.ZipFile("ml-100k.zip")

URM_path = dataFile.extract("ml-100k/u.data")
URM = sc.textFile(URM_path)


def rowSplit (rowString):
    split = rowString.split("\t")
    result = tuple(split)
    return result


URM_tuple = URM.map(rowSplit)

URM_tuple.take(10)

[('196', '242', '3', '881250949'),
 ('186', '302', '3', '891717742'),
 ('22', '377', '1', '878887116'),
 ('244', '51', '2', '880606923'),
 ('166', '346', '1', '886397596'),
 ('298', '474', '4', '884182806'),
 ('115', '265', '2', '881171488'),
 ('253', '465', '5', '891628467'),
 ('305', '451', '3', '886324817'),
 ('6', '86', '3', '883603013')]

In [5]:
URM_training, URM_test = URM_tuple.randomSplit([0.6, 0.4])

URM_tuple = None

print("Train tuples {}, test tuples {}".format(URM_training.count(), URM_test.count()))

Train tuples 60214, test tuples 39786


In [6]:
userRelevantItems = URM_test.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x,y : x + y)

userRelevantItems_dict = userRelevantItems.collectAsMap()

user_id = "42"
print("User {} saw {} items, some of them are: {}".
      format(user_id, len(userRelevantItems_dict[user_id]), userRelevantItems_dict[user_id][0:10]))

User 42 saw 74 items, some of them are: ['274', '98', '953', '456', '143', '746', '48', '999', '781', '1043']


#### We implement a function calculating MAP given two lists
* RankedList refers to the recommendations the user will receive
* PositiveItems refers to the items in its test set

In [7]:
import numpy as np

def map(RankedList, PositiveItems, at=None):
    """
    Calculates MAP@__ 
    """
    
    RankedList = RankedList[:at]
    is_relevant = np.in1d(RankedList, PositiveItems, assume_unique=True)
    
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(len(is_relevant)))
    map_score = np.sum(p_at_k) / np.min([len(PositiveItems), len(RankedList)])
    
    assert 0 <= map_score <= 1, map_score
    
    return map_score


#### Let's see how well a Random recommender performs

In [8]:
trainingItemList = URM_training.map(lambda x: int(x[1])).distinct().collect()

def recommendRandom (size=10):

    #randomRecommendedItems = trainingItemList.takeSample(False, 10)
    randomRecommendedItems = np.random.choice(trainingItemList, size=size)

    return randomRecommendedItems


recommendRandom ()

array([1437, 1426,  662,  725,  688, 1335, 1429, 1176,  654,  907])

In [9]:
mapScore = 0

mapScore += map(recommendRandom (), userRelevantItems_dict[user_id])

mapScore

0.0

#### We have to calculate the cumulative MAP for all users and then divide for their number

In [10]:
userList_rdd = URM_training.map(lambda x: int(x[0])).distinct()

userList_rdd.take(10)

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

In [11]:
userList_rdd_map = userList_rdd.map(lambda x: map(recommendRandom (), userRelevantItems_dict[str(x)]))

userList_rdd_map.take(10)

[0.016666666666666666,
 0.033333333333333333,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [12]:
testMAP = userList_rdd_map.sum()/userList_rdd_map.count()

print("Overall MAP for a random recommender is {:.5f}".format(testMAP))

Overall MAP for a random recommender is 0.00872


## MAP for TopPop

In [13]:
itemPopularity = URM_training.map(lambda x: (x[1],1))

itemPopularity = itemPopularity.reduceByKey(lambda x,y : x+y)

itemPopularity_sorted = itemPopularity.takeOrdered(itemPopularity.count(), key=lambda x: -x[1])



def recommendTopPopRDDfree(user_id, numberOfItemsToRecommend = 10):
    
    ### Here we ignore the seenItems

    recommendedList = itemPopularity_sorted
    
    return recommendedList[0:numberOfItemsToRecommend]

In [14]:
userList_rdd_map = userList_rdd.map(lambda x: map(recommendTopPopRDDfree(user_id=str(x)), userRelevantItems_dict[str(x)]))

userList_rdd_map.take(10)

[0.2956501831501831,
 0.28166666666666662,
 0.089848484848484844,
 0.050000000000000003,
 0.048834498834498835,
 0.020098039215686276,
 0.24250626566416039,
 0.025757575757575757,
 0.25465201465201465,
 0.15357142857142855]

In [15]:
testMAP = userList_rdd_map.sum()/userList_rdd_map.count()

print("Overall MAP for a TopPop recommender is {:.5f}".format(testMAP))

Overall MAP for a TopPop recommender is 0.10125
