In [1]:
import pandas as pd
import os

In [2]:
import numpy as np

In [60]:
def precision_at_k(k: int, y: list, u: int, M: list) -> float:
    """
    Gets the value of P_k.
    
    parameters:
        k: The value of k to compute P_k
        y: A ranking over items, y(j) = i indicates that item i is ranked at position j
        u: The index of the user in the user_to_integer_mapping
        M: The users-by-items feedback matrix
    
    returns:
        P_k: The value of P_k.
    """
    running_sum = 0
    for j in range(k):
        running_sum += M[u][y[j]]
    return running_sum / k

In [62]:
def AP(u: int, y: list, tau: int, M: list) -> float:
    """
    Gets the average precision value for given tau (in our example, tau = 500, and we'll use that in mAP).
    
    parameters:
        u: The index of the user in the user_to_integer_mapping
        y: A ranking over items, y(j) = i indicates that item i is ranked at position j
        tau: The threshold to evaluate the top portion of the predicted rankings
        M: The users-by-items feedback matrix
    
    returns:
        The AP value.
    """
    np = positive(u)  # number of positively associated songs for user u
    running_sum = 0
    for k in range(tau):
        running_sum += precision_at_k(k, y, u, M) * M[u][y[k]]
    return running_sum / np

In [63]:
def mAP(users: list, y: list, M: list) -> float:
    """
    Gets the mean average precision values
    
    parameters:
        users: The list of indices of all users in the mapping
        y: A ranking over items, y(j) = i indicates that item i is ranked at position j
        M: The users-by-items feedback matrix
    """
    running_sum = 0
    for u in users:
        running_sum += AP(u, y, 500, M)
    return running_sum / len(users)

We have a simple implementation of mAP. We now need:
* A way to map the string users to integers (these could be random)
* An implementation of `positive` to get the number of songs in the hidden file for user `u`

In [11]:
user_to_integer_mapping = []

In [38]:
def build_user_to_integer_map():
    hidden_file_path = os.path.join(os.path.pardir, "data", "evaluation", "year1_test_triplets_hidden.txt")
    hidden_df = pd.read_csv(hidden_file_path, header=None, sep="\t")
    
    unique_users = hidden_df[0].unique()
    for i, user in enumerate(unique_users):
        user_to_integer_mapping.append((user, i))

In [39]:
build_user_to_integer_map()

In [42]:
user_to_integer_mapping[:2]

[('00007a02388c208ea7176479f6ae06f8224355b3', 0),
 ('00014a76ed063e1a749171a253bca9d9a0ff1782', 1)]

In [51]:
def positive(u: int):
    user_id = user_to_integer_mapping[u][0]  # Because the tuples are organized from a 0-based index
    hidden_file_path = os.path.join(os.path.pardir, "data", "evaluation", "year1_test_triplets_hidden.txt")
    hidden_df = pd.read_csv(hidden_file_path, header=None, sep="\t")
    
    return len(hidden_df[hidden_df[0] == user_id])

In [52]:
positive(0)

9

So now we've implemented the functions we need. Let's now give it a sample 500 songs and check for a few cases the mAP values to make sure it works right.

The first case is where we give the exact order for the songs for the first user, and the rest of the songs can be random. This should get us a score of 1.0

In [68]:
hidden_df = None

In [81]:
hidden_file_path = os.path.join(os.path.pardir, "data", "evaluation", "year1_test_triplets_hidden.txt")
hidden_df = pd.read_csv(hidden_file_path, header=None, sep="\t")
print(hidden_df)

                                                0                   1   2
0        00007a02388c208ea7176479f6ae06f8224355b3  SOOFKYO12AF72A2640   1
1        00007a02388c208ea7176479f6ae06f8224355b3  SOIHOIQ12A8C138593   2
2        00007a02388c208ea7176479f6ae06f8224355b3  SOYIZSN12A6701E0BB   3
3        00007a02388c208ea7176479f6ae06f8224355b3  SODYZAD12A58A7A525   1
4        00007a02388c208ea7176479f6ae06f8224355b3  SOXLWPN12A8C143667   1
5        00007a02388c208ea7176479f6ae06f8224355b3  SOOHJTL12AB0185497   1
6        00007a02388c208ea7176479f6ae06f8224355b3  SOAHLGV12AF72A6DFC   1
7        00007a02388c208ea7176479f6ae06f8224355b3  SOHDPUQ12A6701FB97   1
8        00007a02388c208ea7176479f6ae06f8224355b3  SOPCYXP12A58A75EB1   1
9        00014a76ed063e1a749171a253bca9d9a0ff1782  SOTRMSR12A8C132CBE   1
10       00014a76ed063e1a749171a253bca9d9a0ff1782  SOHYHMJ12A6D4F615E   1
11       00014a76ed063e1a749171a253bca9d9a0ff1782  SOXTRHS12A6D4F992C   1
12       00014a76ed063e1a749171a253bca

In [114]:
def alternate_AP(u: int, predictions: list) -> float:
    """
    Gets the AP value
    
    parameters:
        u: The index of the user in the user_to_integer_mapping
        predictions: List of song ID strings in order
    """
    total_songs = positive(u)
    running_sum = 0
    
    songs_for_user = list(hidden_df[hidden_df[0] == user_to_integer_mapping[u][0]][1])

    for i in range(total_songs):
        try:
            pred_ith_song_index = predictions.index(songs_for_user[i])
            running_sum += ((i + 1) / (pred_ith_song_index + 1))
        except:
            pass
    return running_sum / total_songs

In [83]:
def alternate_mAP(predictions: list) -> float:
    running_sum = 0
    for i in range(len(user_to_integer_mapping)):
        print("Processing user " + str(i))
        running_sum += alternate_AP(i, predictions)
    return running_sum / len(user_to_integer_mapping)

## Test Case 1

In [57]:
hidden_file_path = os.path.join(os.path.pardir, "data", "evaluation", "year1_test_triplets_hidden.txt")
hidden_df = pd.read_csv(hidden_file_path, header=None, sep="\t")
first_songs = list(hidden_df[hidden_df[0] == user_to_integer_mapping[0][0]][1])

for i in range(492):
    first_songs.append("kjslfks")

In [59]:
first_songs[:10]

['SOOFKYO12AF72A2640',
 'SOIHOIQ12A8C138593',
 'SOYIZSN12A6701E0BB',
 'SODYZAD12A58A7A525',
 'SOXLWPN12A8C143667',
 'SOOHJTL12AB0185497',
 'SOAHLGV12AF72A6DFC',
 'SOHDPUQ12A6701FB97',
 'SOPCYXP12A58A75EB1',
 'kjslfks']

In [116]:
alternate_AP(0, first_songs)

1.0

This is the expected value for this case. Now let's try the next case.

## Test Case 2

In [112]:
rotated_song_list = first_songs[9:] + first_songs[:9]
rotated_song_list[490:]

['kjslfks',
 'kjslfks',
 'SOOFKYO12AF72A2640',
 'SOIHOIQ12A8C138593',
 'SOYIZSN12A6701E0BB',
 'SODYZAD12A58A7A525',
 'SOXLWPN12A8C143667',
 'SOOHJTL12AB0185497',
 'SOAHLGV12AF72A6DFC',
 'SOHDPUQ12A6701FB97',
 'SOPCYXP12A58A75EB1']

In [115]:
alternate_AP(0, rotated_song_list)

0.010033642851571899

This is also correct, implying that our implementation of `alternate_AP` is right, and thus, so is `alternate_mAP`. We can now use this to evaluate future predictions. One next step is to try and reduce the time it takes.