## Imports

In [1]:
from collections import Counter, defaultdict

import numpy as np
from tqdm import tqdm

from rec.data_loader import Dataset
from rec.evaluation import evaluate_on_valid_set

## Model definition

* This is the example model provided by the [Kaggle competition](https://www.kaggle.com/c/msdchallenge).
* It simply orders the songs by popularity (number of users played by), then for each user, predicts the most popular songs not already seen in that user's library.

In [2]:
class DummyModel(object):
    
    def __init__(self):
        self.song_to_count = Counter()
        self.user_to_songs = defaultdict(set)
        
        self.songs_ordered = None
        
    @property
    def fitted(self):
        return self.songs_ordered is not None
        
    def predict_for_user(self, user_id, user_data):
        out = []
        for song in self.songs_ordered:
            if len(out) >= 500:
                break
                
            if song not in self.user_to_songs[user_id]:
                out.append(song)
                
        return np.array(out)
    
    def fit(self, train_data):
        for (user_id, user_data) in tqdm(
            train_data.iterate_over_visible_data(), total=len(train_data)
        ):
            for (song_id, _) in user_data:
                self.song_to_count[song_id] += 1
                self.user_to_songs[user_id].add(song_id)

        self.songs_ordered = sorted(
            self.song_to_count, key=lambda item: item[1]
        )

## 'Train' the model and evaluate on validation set 

* Evaluation metric: the _mean average precision_ (MAP), or the average number of songs the model correctly predicted over each user.
* This toy model gets an MAP of approximately 0.001821.

In [3]:
train_data = Dataset(which='train')
dummy = DummyModel()
dummy.fit(train_data)

result = evaluate_on_valid_set(dummy)
result

1019318it [02:35, 6562.22it/s]                             
100%|██████████| 10000/10000 [00:42<00:00, 233.62it/s]


0.0018208000000000002