# Example of collaborative filtering

In [149]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from collections import defaultdict
import operator

## Setup data

Let us generate random data describing young people's tastes.
Each row stores `_age` and `_gender` as profile information, plus four binary targets: `like_comics`, `like_games`, `like_movies`, `like_sport`.

In [77]:
# Fixed seed so the synthetic dataset is identical on every "Restart & Run All".
SEED = 42
rng = np.random.RandomState(SEED)

size = 1000

# The column layout is pinned explicitly instead of being left to dict ordering:
# the helper functions below slice rows by position (columns 1-2 are the profile
# key, columns 3+ are the targets), and the target columns must appear in the
# same order as `target_labels`.
DATA_COLUMNS = [
    '__id', '_age', '_gender',
    'like_comics', 'like_games', 'like_movies', 'like_sport',
]

DATA = pd.DataFrame(
    {
        '__id': range(size),
        '_age': rng.randint(7, 25, size=size),      # ages 7..24 (upper bound is exclusive)
        '_gender': rng.randint(2, size=size),       # 0 or 1
        'like_comics': rng.randint(2, size=size),   # binary targets, drawn independently
        'like_games': rng.randint(2, size=size),
        'like_movies': rng.randint(2, size=size),
        'like_sport': rng.randint(2, size=size),
    },
    columns=DATA_COLUMNS,
)

In [78]:
# Show the last rows to check the generated columns and their value ranges.
DATA.tail()

Unnamed: 0,__id,_age,_gender,like_comics,like_games,like_movies,like_sport
995,995,8,0,0,0,1,1
996,996,10,0,1,0,1,0
997,997,22,0,1,0,0,0
998,998,16,1,0,1,1,1
999,999,11,1,1,0,0,1


In [79]:
# 70% of the rows go to TRAIN, the rest to TEST. A fixed random_state keeps the
# split (and every number derived from it) reproducible across runs.
TRAIN, TEST = train_test_split(DATA, train_size=0.7, random_state=42)

## Compute recommendations

From the training dataset, build a dictionary `recommendations` whose keys are `(age, gender)` tuples and whose values are `{'like_comics': count1, 'like_games': count2, 'like_movies': count3, 'like_sport': count4}`.
Then sort each entry's targets from most to least popular.

In [80]:
def get_key(row):
    """Return the profile key of a row as a tuple.

    The key is made of the values at positions 1 and 2 of `row`; with the
    column layout (__id, _age, _gender, ...) that is `(age, gender)`.
    """
    key_positions = slice(1, 3)
    return tuple(row[key_positions])

def get_targets(row):
    """Return the target values of a row as a tuple.

    The targets are every value from position 3 onwards, i.e. everything
    after the id and the two profile columns.
    """
    target_positions = slice(3, None)
    return tuple(row[target_positions])

In [100]:
# recommendations[(age, gender)][label] -> number of training rows with that
# profile whose `label` column is positive. Labels that are never liked for a
# profile simply have no entry.
recommendations = defaultdict(lambda: defaultdict(float))
target_labels = ['like_comics','like_games','like_movies','like_sport']

# get_key / get_targets slice rows by position, so select the columns by label
# first. This guarantees that the i-th target value really belongs to
# target_labels[i], whatever column order the DataFrame happens to have.
ordered_columns = ['__id', '_age', '_gender'] + target_labels
for row in TRAIN[ordered_columns].values:
    key = get_key(row)
    targets = get_targets(row)

    for l, t in zip(target_labels, targets):
        if t > 0:
            recommendations[key][l] += 1

Transform counts to frequencies : 
```count -> count / nb_votes_per_age_gender```


In [102]:
# Number of training rows for each (age, gender) profile.
nb_votes_per_age_gender = TRAIN.groupby(['_age', '_gender']).size()

# NOTE: the counts are divided in place, so this cell must be run exactly once
# after the counting cell; running it again would divide the values a second time.
for profile, label_counts in recommendations.items():
    nb_votes = nb_votes_per_age_gender[profile]
    for label in label_counts:
        label_counts[label] /= nb_votes

In [103]:
# For every profile, rank its labels from the most to the least frequent.
# Each stored value is a list of (label, frequency) pairs.
sorted_recommendations = defaultdict(lambda: defaultdict(float))
for profile, frequencies in recommendations.items():
    ranked = list(frequencies.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    sorted_recommendations[profile] = ranked

In [104]:
# Display every (age, gender) key with its labels ranked by decreasing frequency.
sorted_recommendations

defaultdict(<function __main__.<lambda>>,
            {(7, 0): [('like_comics', 0.6),
              ('like_movies', 0.4),
              ('like_sport', 0.36),
              ('like_games', 0.36)],
             (7, 1): [('like_games', 0.5714285714285714),
              ('like_sport', 0.5238095238095238),
              ('like_comics', 0.42857142857142855),
              ('like_movies', 0.3333333333333333)],
             (8, 0): [('like_movies', 0.5217391304347826),
              ('like_sport', 0.5217391304347826),
              ('like_comics', 0.391304347826087),
              ('like_games', 0.2608695652173913)],
             (8, 1): [('like_comics', 0.5714285714285714),
              ('like_movies', 0.5),
              ('like_games', 0.5),
              ('like_sport', 0.42857142857142855)],
             (9, 0): [('like_movies', 0.7692307692307693),
              ('like_sport', 0.6923076923076923),
              ('like_games', 0.6923076923076923),
              ('like_comics', 0.3076923076

## Compute predictions from recommendations

In [116]:
def compute_predictions(age, gender, recommendations, threshold=0.6):
    """Return the labels recommended for a given (age, gender) profile.

    Parameters
    ----------
    age, gender :
        Profile values; together they form the lookup key `(age, gender)`.
    recommendations : mapping
        Maps `(age, gender)` keys to a mapping of label -> score.
    threshold : float, default 0.6
        Minimum score a label needs in order to be returned.

    Returns
    -------
    list
        Labels whose score is at least `threshold`, in the iteration order
        of the profile's mapping.

    Raises
    ------
    AssertionError
        If `(age, gender)` is not a key of `recommendations`.
    """
    profile = (age, gender)
    assert profile in recommendations, "Recommendations do not contain information for input age=%s, gender=%s" % (age, gender)

    selected = []
    for label, score in recommendations[profile].items():
        if label in selected:
            continue
        if score >= threshold:
            selected.append(label)
    return selected

For example, a 10-year-old boy (`age=10`, `gender=0`) is predicted to like:

In [117]:
# Labels whose training frequency for the key (age=10, gender=0) reaches the default threshold of 0.6.
compute_predictions(10, 0, recommendations)

['like_movies']

## Validate predictions

In [166]:
target_labels = ['like_comics','like_games','like_movies','like_sport']
threshold = 0.6

# Work on a copy of TEST so the true targets stay untouched for the comparison below.
predictions = TEST.copy()

# Build one 0/1 row per test person: 1 when the label is recommended for that
# person's (age, gender) profile, 0 otherwise. The profile columns are read by
# name, so the result does not depend on the column order of the frame.
predicted_rows = []
for age, gender in zip(predictions['_age'], predictions['_gender']):
    liked = compute_predictions(age, gender, recommendations=recommendations, threshold=threshold)
    predicted_rows.append([1 if label in liked else 0 for label in target_labels])

# Overwrite the target columns in a single assignment instead of row by row.
predictions[target_labels] = np.array(predicted_rows, dtype=int).reshape(
    len(predictions), len(target_labels)
)

predictions.head()

Unnamed: 0,__id,_age,_gender,like_comics,like_games,like_movies,like_sport
256,256,22,1,0,0,1,0
276,276,7,0,1,0,0,0
821,821,19,0,0,1,1,0
483,483,17,0,0,0,0,0
409,409,23,1,0,1,0,0


In [167]:
# Every target and every prediction is 0 or 1, so the mean absolute error is the
# share of (person, label) pairs that are predicted wrongly.
# The targets were drawn at random, independently of age and gender, so a score
# close to 0.5 is expected here.
mean_absolute_error(TEST[target_labels], predictions[target_labels])

0.47999999999999998