# Example 2 of collaborative filtering

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from collections import defaultdict
import operator

## Setup data

Let us generate random data describing young people's tastes, month by month.
For each record we store 'id' and 'date', the profile information 'age', 'gender', 'residence', and the targets: 'like_comics', 'like_games', 'like_movies', 'like_sport'.

In [89]:
# Number of simulated users (one row per user per month) and number of months.
size_per_month = 1000
last_month = 5

# Seed the global NumPy RNG so that "Restart & Run All" regenerates the same data.
SEED = 42
np.random.seed(SEED)

# Static profile of every user: drawn once and repeated unchanged for each month.
profiles = {
    'id': range(size_per_month),
    'age': np.random.randint(7, 25, size=size_per_month),
    'gender': np.random.randint(2, size=size_per_month),
    'residence': np.random.randint(5, size=size_per_month)
}

# Build one frame per month and concatenate once at the end; growing DATA with
# pd.concat inside the loop re-copies all previous rows on every iteration.
monthly_frames = []
for i in range(1, last_month+1):
    df = pd.DataFrame({
            '__id': profiles['id'],
            '_age': profiles['age'],
            '_gender': profiles['gender'],
            '_residence': profiles['residence'],
            '_date': [i] * size_per_month,
            # The four binary targets are redrawn independently every month.
            'like_movies': np.random.randint(2, size=size_per_month),
            'like_comics': np.random.randint(2, size=size_per_month),
            'like_games': np.random.randint(2, size=size_per_month),
            'like_sport': np.random.randint(2, size=size_per_month),
        }, index=range((i-1)*size_per_month, i*size_per_month))
    monthly_frames.append(df)

# pd.concat rejects an empty list, so keep the empty-frame result when last_month < 1.
DATA = pd.concat(monthly_frames, axis=0) if monthly_frames else pd.DataFrame()

In [90]:
# Preview the first rows recorded for month 1.
DATA.loc[DATA['_date'] == 1].head()

Unnamed: 0,__id,_age,_date,_gender,_residence,like_comics,like_games,like_movies,like_sport
0,0,17,1,0,4,0,1,1,1
1,1,15,1,0,4,0,0,1,0
2,2,14,1,1,4,1,1,1,1
3,3,17,1,1,4,0,0,1,0
4,4,9,1,0,4,0,1,0,1


In [91]:
# Preview the first rows recorded for month 2 (same users, new target draws).
DATA.loc[DATA['_date'] == 2].head()

Unnamed: 0,__id,_age,_date,_gender,_residence,like_comics,like_games,like_movies,like_sport
1000,0,17,2,0,4,1,1,0,0
1001,1,15,2,0,4,0,0,1,0
1002,2,14,2,1,4,1,0,0,0
1003,3,17,2,1,4,0,1,0,1
1004,4,9,2,0,4,1,1,0,1


In [92]:
# Hold out the most recent month for evaluation; every earlier month is training data.
month = DATA['_date']
TRAIN = DATA.loc[month < last_month]
TEST = DATA.loc[month == last_month]

## Compute recommendations

*Common recommendations*:

From the training dataset, build a dictionary `common_recommendations` with keys `(age, gender, residence)` and values `{'like_comics': count1, 'like_games': count2, 'like_movies': count3, 'like_sport': count4}`. Then sort it by most popular.

*Personal recommendations*:

These recommendations track how each user's preferences evolve month by month.
When the user chooses a target (sets it to `1`), the recommendation adds `1` to that target.
When the user does not choose a target (sets it to `0`), the recommendation adds `-1` to that target; however, the total never drops below zero (if the value is `0` and we add `-1`, the value remains `0`).



In [116]:
# common_recommendations: profile tuple -> {target label -> number of "like" votes}.
common_recommendations = defaultdict(lambda: defaultdict(float))
# personal_recommendations: user id -> array of per-target scores (ordered as target_labels).
# NOTE: the entries stored below are NumPy arrays, not the nested defaultdict that the
# default factory would create; always test membership with `in` before reading.
personal_recommendations = defaultdict(lambda: defaultdict(float))

profile_labels = ['_age', '_gender', '_residence']
target_labels = ['like_comics', 'like_games', 'like_movies', 'like_sport']

# Pull the needed columns out once as arrays instead of calling TRAIN.loc[i] per row:
# that lookup is slow and returns a DataFrame (not a row) if an index label is duplicated.
user_ids = TRAIN['__id'].values
profile_rows = TRAIN[profile_labels].values
target_rows = TRAIN[target_labels].values

# Rows are visited in TRAIN order. Because personal scores are clamped at zero after
# every update, the result depends on that order (DATA is built month by month).
for user, profile_row, targets in zip(user_ids, profile_rows, target_rows):
    key = tuple(profile_row)
    # Update common recommendations: count one vote per liked target.
    for l, t in zip(target_labels, targets):
        if t > 0:
            common_recommendations[key][l] += 1
    # Init/Update personal recommendations
    if user not in personal_recommendations:
        # First time this user is seen: start from the raw targets.
        # Copy so the stored score does not alias a row of target_rows.
        personal_recommendations[user] = targets.copy()
    else:
        # A target left at 0 counts as -1; the running score never goes below 0.
        updates = np.where(targets == 0, -1, targets) + personal_recommendations[user]
        personal_recommendations[user] = np.maximum(updates, 0)

Transform counts to frequencies:
```count -> count / nb_votes_per_profile```


In [117]:
# Number of training rows observed for each (age, gender, residence) profile.
nb_votes_per_profile = TRAIN.groupby(['_age', '_gender', '_residence']).size()

# Turn each "like" count into a frequency, in place.
# NOTE: this cell is not idempotent - running it twice divides the values twice.
for profile_key, like_counts in common_recommendations.items():
    votes = nb_votes_per_profile[profile_key]
    for label in like_counts:
        like_counts[label] /= votes

In [118]:
# Show a few entries. list(...) is required because dict views cannot be sliced on
# Python 3; on Python 2 the result is unchanged.
list(common_recommendations.items())[:3]

[((15, 0, 4),
  defaultdict(float,
              {'like_comics': 0.21875,
               'like_games': 0.53125,
               'like_movies': 0.625,
               'like_sport': 0.5625})),
 ((21, 0, 3),
  defaultdict(float,
              {'like_comics': 0.4,
               'like_games': 0.45,
               'like_movies': 0.6,
               'like_sport': 0.4})),
 ((14, 1, 3),
  defaultdict(float,
              {'like_comics': 0.25,
               'like_games': 0.25,
               'like_movies': 0.5,
               'like_sport': 0.75}))]

Transform personal recommendation values to probabilities using the following rule :
```
Pr[value] = value / (2 * number_of_months) + 0.5
```
where `value` lies between 0 and `number_of_months`, so the resulting probability lies between 0.5 and 1. The variable `number_of_months` corresponds to the number of months used for predictions (here `last_month - 1`).

In [119]:
# Map every personal score to a probability: score / (2 * months) + 0.5.
# NOTE: values are replaced in place, so re-running this cell rescales them again.
number_of_months = last_month - 1
for user in personal_recommendations:
    score = personal_recommendations[user]
    personal_recommendations[user] = score / (2.0 * number_of_months) + 0.5

In [120]:
# Show a few entries. list(...) is required because dict views cannot be sliced on
# Python 3; on Python 2 the result is unchanged.
list(personal_recommendations.items())[:10]

[(0, array([ 0.5  ,  0.75 ,  0.625,  0.625])),
 (1, array([ 0.5  ,  0.75 ,  0.75 ,  0.625])),
 (2, array([ 0.5  ,  0.5  ,  0.625,  0.5  ])),
 (3, array([ 0.5  ,  0.5  ,  0.75 ,  0.625])),
 (4, array([ 0.625,  0.75 ,  0.5  ,  0.75 ])),
 (5, array([ 1.  ,  0.5 ,  0.75,  0.75])),
 (6, array([ 0.5  ,  0.5  ,  0.75 ,  0.625])),
 (7, array([ 0.75 ,  0.625,  0.75 ,  0.75 ])),
 (8, array([ 0.625,  0.625,  0.75 ,  1.   ])),
 (9, array([ 0.75 ,  0.5  ,  0.625,  0.5  ]))]

## Compute predictions from recommendations

In [150]:
# Blend weights used when both kinds of recommendation exist for a user:
# 20% from the profile-wide (common) frequencies, the rest from the user's own
# (personal) history. The two weights sum to 1.
common_rc_weight = 0.2
personal_rc_weight = 1.0 - common_rc_weight

In [151]:
def compute_predictions(user_id, profile, threshold=0.6, verbose=False):
    """Predict the binary targets of one user from the stored recommendations.

    Reads the notebook-level `common_recommendations`, `personal_recommendations`,
    `target_labels`, `common_rc_weight` and `personal_rc_weight`.

    Parameters
    ----------
    user_id : key looked up in `personal_recommendations`.
    profile : tuple looked up in `common_recommendations`.
    threshold : a target is predicted as 1 when its score is >= threshold.
    verbose : when True, print the intermediate scores.

    Returns
    -------
    Integer NumPy array of 0/1 values, one per entry of `target_labels`.

    Raises
    ------
    Exception
        If neither the profile nor the user has any stored recommendation.
    """
    common_predictions = None
    if profile in common_recommendations:
        profile_scores = common_recommendations[profile]
        # .get() rather than [] so that a label with no vote is read as 0.0
        # without being inserted into the shared defaultdict.
        common_predictions = [profile_scores.get(t, 0.0) for t in target_labels]
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    if verbose:
        print("Common predictions:  %s" % (common_predictions,))

    personal_predictions = None
    if user_id in personal_recommendations:
        personal_predictions = personal_recommendations[user_id]
    if verbose:
        print("Personal predictions:  %s" % (personal_predictions,))

    if common_predictions is not None and personal_predictions is not None:
        # Both sources available: weighted blend.
        predictions = (np.asarray(common_predictions, dtype=float) * common_rc_weight
                       + np.asarray(personal_predictions, dtype=float) * personal_rc_weight)
    elif common_predictions is not None:
        predictions = np.asarray(common_predictions, dtype=float)
    elif personal_predictions is not None:
        predictions = np.asarray(personal_predictions, dtype=float)
    else:
        raise Exception("Failed to compute predictions")

    if verbose:
        print("Total predictions:  %s" % (predictions,))

    # One comparison instead of two in-place masked writes: no stored array is ever
    # mutated and NaN scores map to 0. The builtin int replaces np.int, which was
    # removed from NumPy.
    return (predictions >= threshold).astype(int)

For example, a 15-year-old boy living in residence '4', with id=1, is predicted to like:

In [139]:
# Verbose run for a single user so the intermediate scores are printed.
compute_predictions(user_id=1, profile=(15, 0, 4), threshold=0.6, verbose=True)

Common predictions:  [0.21875, 0.53125, 0.625, 0.5625]
Personal predictions:  [ 0.5    0.75   0.75   0.625]
Total predictions:  [ 0.3875  0.6625  0.7     0.6   ]


array([0, 1, 1, 1])

## Validate predictions

In [156]:
threshold = 0.65

# Start from a copy of the held-out month so the id/profile/date columns are kept;
# TEST.copy() replaces the former concat onto an empty DataFrame.
predictions = TEST.copy()

# Predict every test row, then overwrite the target columns in one assignment
# instead of reading and writing one row at a time through .loc.
predicted_rows = [
    compute_predictions(user_id, tuple(profile), threshold)
    for user_id, profile in zip(TEST['__id'].values, TEST[profile_labels].values)
]
# reshape keeps a (0, n_targets) shape when TEST is empty.
predictions[target_labels] = np.array(predicted_rows, dtype=int).reshape(-1, len(target_labels))

predictions.head()

Unnamed: 0,__id,_age,_date,_gender,_residence,like_comics,like_games,like_movies,like_sport
4000,0,17,5,0,4,0,1,0,0
4001,1,15,5,0,4,0,1,1,0
4002,2,14,5,1,4,0,0,0,0
4003,3,17,5,1,4,0,0,1,0
4004,4,9,5,0,4,0,1,0,1


In [157]:
# Mean absolute error between the observed and the predicted target columns
# of the held-out month.
y_true = TEST[target_labels]
y_pred = predictions[target_labels]
mean_absolute_error(y_true, y_pred)

0.48799999999999999