# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4], 'col2': [1,2,3,4]})
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,4,4


## 2. Deleting a row in a DataFrame

In [3]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df.drop('d')

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [4]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
df = pd.DataFrame({'col1': ser_1, 'col2': ser_2, 'col3': ser_3})
df

Unnamed: 0,col1,col2,col3
0,-0.306488,-0.92945,0.528474
1,2.100963,-0.248999,0.161737
2,-1.285978,-0.650965,-0.047917
3,-0.267161,0.491528,-0.407227
4,1.260118,0.800706,2.040324
5,-1.017585,0.076163,-0.179965


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [5]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df.col_1

obs1     0.12
obs2     7.00
obs3    45.00
obs4    10.00
Name: col_1, dtype: float64

## 2. Label-based indexing

In [8]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.ix['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [9]:
# using the same DataFrame, index into into its first row
df.ix[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [10]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'], engine='python')

## 2. How to load the training and testing subsets

In [44]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding="ISO-8859-1")
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding="ISO-8859-1")

In [16]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [17]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


In [81]:
# this is for genre manipulation
movielens_train_expand = movielens_train.set_index(movielens_train.columns.drop('genres',1).tolist()).genres.str.split('|', expand=True).stack().reset_index().rename(columns={0:'genres'}).loc[:, movielens_train.columns]
movielens_test_expand = movielens_test.set_index(movielens_test.columns.drop('genres',1).tolist()).genres.str.split('|', expand=True).stack().reset_index().rename(columns={0:'genres'}).loc[:, movielens_test.columns]

In [82]:
# replace genre with int
movielens_train_expand.genres = movielens_train_expand['genres'].replace(['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
                                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                                          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], 
                                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
movielens_test_expand.genres = movielens_test_expand['genres'].replace(['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
                                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                                          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], 
                                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])

In [79]:
#Xtrain = movielens_train.assign(**{'genres':movielens_train['genres'].str.split('|')})
#Xtest = movielens_test.assign(**{'genres':movielens_test['genres'].str.split('|')})

In [77]:
#Xtrain['genres'] = Xtrain['genres'].replace(to_replace=['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
#                                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
#                                          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], 
#                                         value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
#Xtest.genres = Xtest['genres'].replace(['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
#                                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
#                                          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], 
#                                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])

In [83]:
movielens_train_expand.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
0,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),16,False
1,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),1,False
2,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),15,False
3,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),1,False
4,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),2,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [18]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [19]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

In [84]:
def evaluate_expand(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test_expand.user_id, movielens_test_expand.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [20]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [21]:
print('RMSE for my estimate function: ', evaluate(my_estimate_func))

RMSE for my estimate function:  1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [22]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    # 
    item_condition = movielens_train.movie_id == movie_id
    return movielens_train.loc[item_condition, 'rating'].mean()
    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [24]:
def collab_age(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on age. """

    user_info = users.set_index('user_id')
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_age = ratings_by_others.pivot_table('rating', index='movie_id', columns='age')
    user_age = user_info.ix[user_id, 'age']
    if user_age in means_by_age.columns: 
        return means_by_age.ix[movie_id, user_age]
    else:
        return means_by_age.ix[movie_id].mean()

print ('RMSE for collab_age: ', evaluate(collab_age))

RMSE for collab_age:  1.20520133441


In [25]:
def collab_zip(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on zip. """

    user_info = users.set_index('user_id')
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_zip = ratings_by_others.pivot_table('rating', index='movie_id', columns='zip')
    user_zip = user_info.ix[user_id, 'zip']
    if user_zip in means_by_zip.columns: 
        return means_by_zip.ix[movie_id, user_zip]
    else:
        return means_by_zip.ix[movie_id].mean()

print ('RMSE for collab_zip: ', evaluate(collab_zip))

RMSE for collab_zip:  1.12566403192


In [26]:
def collab_occupation(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on occupation. """

    user_info = users.set_index('user_id')
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_occupation = ratings_by_others.pivot_table('rating', index='movie_id', columns='occupation')
    user_occupation = user_info.ix[user_id, 'occupation']
    if user_occupation in means_by_occupation.columns: 
        return means_by_occupation.ix[movie_id, user_occupation]
    else:
        return means_by_occupation.ix[movie_id].mean()

print ('RMSE for collab_occupation: ', evaluate(collab_occupation))

RMSE for collab_occupation:  1.20287696436


In [103]:
def genres_mean(user_id, movie_id):
    """ Simple content-filtering based on genre ratings. """
    
    user_condition = movielens_train_expand.user_id == user_id
    return movielens_train_expand.loc[user_condition, 'genres'].mean()

print ('RMSE for estimate1: ', evaluate(genres_mean))

RMSE for estimate1:  5.26050114727


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [None]:
# see pycon notebook