# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})

# my addition:
df['col2'] = range(5,9)

df

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


## 2. Deleting a row in a DataFrame

In [3]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])

# my addition:
df.drop('d', axis = 0, inplace = True )

df

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [4]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))

# my addition:
df = pd.DataFrame({'col1': ser_1, 'col2':ser_2, 'col3': ser_3})
df

Unnamed: 0,col1,col2,col3
0,0.303783,-0.408232,0.563959
1,0.496282,0.340504,-1.735576
2,1.662035,0.234688,-0.076108
3,1.813168,-0.13273,-1.258663
4,0.371668,1.146204,-0.625404
5,-0.448433,0.658471,2.281013


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [5]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])

# my addition:
df[['col_2']]

Unnamed: 0,col_2
obs1,0.9
obs2,9.0
obs3,34.0
obs4,11.0


## 2. Label-based indexing

In [6]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [7]:
# using the same DataFrame, index into into its first row
df.iloc[0,:]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [8]:
# import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],
                      engine = 'python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'],
                       engine = 'python')
                       
# later in this analysis, index-based references are used, but the data don't have indices that 
#   match the user and movie id's. This breaks the references
# the most effective way to fix this is to fix the indices
users.index = users.user_id
movies.index = movies.movie_id

users.head()

Unnamed: 0_level_0,user_id,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,F,1,10,48067
2,2,M,56,16,70072
3,3,M,25,15,55117
4,4,M,45,7,2460
5,5,M,25,20,55455


In [9]:
movies.head()

Unnamed: 0_level_0,movie_id,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,Toy Story (1995),Animation|Children's|Comedy
2,2,Jumanji (1995),Adventure|Children's|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama
5,5,Father of the Bride Part II (1995),Comedy


## 2. How to load the training and testing subsets

In [10]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding = 'latin-1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding = 'latin-1')

In [11]:
print(movielens_train.shape)
movielens_train.head()

(5838, 11)


Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [12]:
print(movielens_test.shape)
movielens_test.head()

(2668, 11)


Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [13]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return(np.sqrt(np.mean(np.power(y_pred - y_true, 2))))

In [14]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return(compute_rmse(estimated, real))

Test a dummy solution!

In [15]:
def my_estimate_func(user_id, movie_id):
    return(3.5)

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [16]:
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.09088495498


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [17]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    
    # first, index into all ratings of this movie
    movie_condition = movielens_train.movie_id == movie_id
    user_condition = movielens_train.user_id != user_id
    
    # second, compute the mean of those ratings
    ratings = movielens_train.rating[movie_condition & user_condition]
    
    if ( ratings.empty ):
        return( 3.0 )
    else:
        return( ratings.mean() )

    
# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))
print('RMSE for a simple collaborative filter is: %s' % evaluate(collab_mean))

4.0
RMSE for a simple collaborative filter is: 1.1234279896


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

## Age Similarity

In [18]:
# attempting to develop the class estimator as shown in the video example
# the functions have proven slow and I'd like to see the computational performance improvement

class CollabAgeReco:
    # Collaborative filtering using an implicit sim(u,u'), based on age
    # and maybe age ranges...
    
    def learn( self ):
        # prepare data structures
        self.means_by_age = movielens_train.pivot_table('rating', index = 'movie_id', columns = 'age')
                
    def estimate( self, user_id, movie_id ):
        # Mean ratings by other users of the same age
        if( movie_id not in self.means_by_age.index ):
            return( 3.0 )
        
        user_age = users.loc[user_id, 'age']
        if( ~np.isnan(self.means_by_age.loc[movie_id, user_age]) ):
            return( self.means_by_age.loc[movie_id, user_age] )
        else:
            return( self.means_by_age.loc[movie_id].mean())

reco = CollabAgeReco()
reco.learn()
print('RMSE for CollabAgeReco: %s.' % evaluate(reco.estimate))

RMSE for CollabAgeReco: 1.20520133441.


The computational performance in vastly improved; this took far less time to compute a RMSE.

But the estimation performance got noticably worse, so I'd like to see about using age ranges. First, I'd like to look at the distribution and frequencies of different ages to see if this would be of any use.

In [19]:
print("There are %s discrete age values in the dataset." % str(len(movielens_train.age.value_counts())))
movielens_train.age.value_counts()

There are 7 discrete age values in the dataset.


25    2441
35    1144
18    1037
45     476
50     409
56     177
1      154
Name: age, dtype: int64

Based on these results, the data is clearly already grouped into age ranges, and so I'm abandoning further development in this direction and just proceding with the remaining 3 methods of building the recommendation system under round 1 of the challenge.

## Zip Code Similarity

In [20]:
class CollabZipReco:
    # Collaborative filtering using an implicit sim(u,u'), based on zip code
    
    def learn( self ):
        # prepare data structures
        self.means_by_zip = movielens_train.pivot_table('rating', index = 'movie_id', columns = 'zip')
        
    def estimate( self, user_id, movie_id ):
        # Mean ratings by other users of the same zip code
        if( movie_id not in self.means_by_zip.index ):
            return( 3.0 )
        
        # get the users zip
        user_zip = users.loc[user_id, 'zip']
        
        # many test users have zip codes that are not in the training data
        # we have to then filter those out and return a dummy value, just as for those
        #   where the movie isn't in the training data
        if( user_zip not in self.means_by_zip.columns ):
            return( 3.0 )
        
        if( ~np.isnan(self.means_by_zip.loc[movie_id, user_zip]) ):
            return( self.means_by_zip.loc[movie_id, user_zip] )
        else:
            return( self.means_by_zip.loc[movie_id].mean())

reco = CollabZipReco()
reco.learn()
print('RMSE for CollabZipReco: %s.' % evaluate(reco.estimate))

RMSE for CollabZipReco: 1.12566403192.


Time time, we have some slight improvement over similarity by age, but not over the unfiltered means.

## Occupation Similarities

I would like to note that this method could standardized so that it can accept any one variable as the basis for collaborative similarity. This would be done by including an additional input parameter in the class that included the name of this input parameter. Then, when the `.learn()` method is called, the parameter would specified.

Additionally, I think the performance benefits of formatting this analysis could be maintained while using the simpler function setup simply by separating out the construction of the group means matrix from the estimation, as is done in the class-based approach. That is, the performance benefit is likely not derived from the fact that a class is used that this group mean matrix is only constructed once, rather than during each iteration of the estimate function. This could be done by running two functions, similiar to the two methods in the class function, and done in a simpler code structure that doesn't define a class. It would, however, require a change in that the group means matrix would need to be passed to the estimate function, which is somewhat different than the current project parameters, which insists that the estimate function only take a user id and a movie id.

With these points noted, I am proceeding following the existing coding structure.

In [21]:
class CollabJobReco:
    # Collaborative filtering using an implicit sim(u,u'), based on zip code
    
    def learn( self ):
        # prepare data structures
        self.means_by_job = movielens_train.pivot_table('rating', index = 'movie_id', columns = 'occupation')
        
    def estimate( self, user_id, movie_id ):
        # Mean ratings by other users of the same occupation
        if( movie_id not in self.means_by_job.index ):
            return( 3.0 )
        
        # get the users zip
        user_job = users.loc[user_id, 'occupation']
        
        # maintaining the additional test required for zip analysis
        if( user_job not in self.means_by_job.columns ):
            return( 3.0 )
        
        if( ~np.isnan(self.means_by_job.loc[movie_id, user_job]) ):
            return( self.means_by_job.loc[movie_id, user_job] )
        else:
            return( self.means_by_job.loc[movie_id].mean())

reco = CollabJobReco()
reco.learn()
print('RMSE for CollabJobReco: %s.' % evaluate(reco.estimate))

RMSE for CollabJobReco: 1.20287696436.


The occupation approached performs better than the age-based groups, but not as well as zip codes, unfiltered means, or even the base case (fixed prediction). Still, none of these single predictors produces a noticably improved prediction performance. I am already interested in looking into ways to combine them during the second round of the mini-challenge.

## Content Based Approach: Movie Genre

In [22]:
class ContentGenreReco:
    # Collaborative filtering using an implicit sim(u,u'), based on zip code
    
    def learn( self ):
        # prepare data structures
        # where collaborative pivoted on movie id's, content will pivot on user id's
        self.means_by_user = movielens_train.pivot_table('rating', index = 'user_id', columns = 'genres')
        
    def estimate( self, user_id, movie_id ):
        # Mean ratings by this user for other genres
        if( user_id not in self.means_by_user.index ):
            return( 3.5 )
        
        # get the movie's genre
        movie_genre = movies.loc[movie_id, 'genres']
        
        # maintaining the additional test required for zip analysis
        if( movie_genre not in self.means_by_user.columns ):
            return( 3.5 )
        
        if( ~np.isnan(self.means_by_user.loc[user_id, movie_genre]) ):
            return( self.means_by_user.loc[user_id, movie_genre] )
        else:
            return( self.means_by_user.loc[user_id].mean())

reco = ContentGenreReco()
reco.learn()
print('RMSE for ContentGenreReco: %s.' % evaluate(reco.estimate))

RMSE for ContentGenreReco: 1.25351668567.


Results from all the types:

| Filter Type | Filter Content | RMSE |
|:-----|:-----|-----|
| Base | Fixed at 3.5 | 1.091 |
| Collaborative | Unfiltered Mean | 1.123 |
| Collaborative | Age | 1.210 |
| Collaborative | Zip Code | 1.126 |
| Collaborative | Occupation | 1.203 |
| Content | Genre | 1.253 |

The genre approach, on its own, is not as good as any of the collaborative approaches we tried above. That said, the the genre approach is still very poorly refined. It requires that the movie have all of the same genres. A more refined version will break about genre labels and search based on individual genres and flexible groups rather than fixed groups.

The unfiltered means approach is still the best performing method that we've seen, even if it doesn't outperform the base case. This is defimitely a disappointing result; we're best off just expecting the rating to be 3.5, regardless of any one characteristic. What we need to do is to build hybrid approaches, including multiple filter contents and both collaborative- and content-based filtering to get better performance.

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

I will start doing this by recreating the Pearson $\rho$-based method that performed so well in the PyCon presentation, adjusted for Python 3. This method uses a good improvement: weighted averages for distance. We could not use this before as no distance was measured, only a consideration for 'same' v. 'not same'.

In [23]:
# first, I want to bring in the functions for computing pearson correlation, euclidean distance, and cosine similarity

def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    if ((np.sum(s1_c ** 2) == 0.0) | (np.sum(s2_c ** 2) == 0.0)):
        return(0.0)
    else:
        return(np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2)))

def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return(1 / (1 + np.sqrt(np.sum(diff ** 2))))

def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return(np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2)))

## Run the pearson correlation based approach

In [24]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train[user_condition & movie_condition]
        if(ratings_by_others.empty): 
            return(3.0)
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if(ratings_sims.empty):
            return(their_ratings.mean())
        else:
            return(np.average(ratings_sims.rating, weights=ratings_sims.sim))
        
reco = CollabPearsonReco()
reco.learn()
print('RMSE for CollabPearsonReco: %.3f' % evaluate(reco.estimate))

RMSE for CollabPearsonReco: 1.126


## Euclidean Distance-Based

In [25]:
class CollabEuclidReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train[user_condition & movie_condition]
        if(ratings_by_others.empty): 
            return(3.0)
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if(ratings_sims.empty):
            return(their_ratings.mean())
        else:
            return(np.average(ratings_sims.rating, weights=ratings_sims.sim))
        
reco = CollabEuclidReco()
reco.learn()
print('RMSE for CollabEuclidReco: %.3f' % evaluate(reco.estimate))

RMSE for CollabEuclidReco: 1.123


## Cosine Similarity

In [26]:
class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train[user_condition & movie_condition]
        if(ratings_by_others.empty): 
            return(3.0)
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: cosine(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if(ratings_sims.empty):
            return(their_ratings.mean())
        else:
            return(np.average(ratings_sims.rating, weights=ratings_sims.sim))
        
reco = CollabCosineReco()
reco.learn()
print('RMSE for CollabCosineReco: %.3f' % evaluate(reco.estimate))

RMSE for CollabCosineReco: 1.134


Results from all the types:

| Filter Type | Filter Content | RMSE |
|:-----|:-----|-----|
| Base | Fixed at 3.5 | 1.091 |
| Collaborative | Unfiltered Mean | 1.123 |
| Collaborative | Age | 1.210 |
| Collaborative | Zip Code | 1.126 |
| Collaborative | Occupation | 1.203 |
| Content | Genre | 1.253 |
| Collaborative | Pearson Correlation | 1.126 |
| Collaborative | Euclidean Distance | 1.123 |
| Collaborative | Cosine Similarity | 1.134 |

These performance scores are still quite disappointing. Nothing has done better than just assuming that a user's rating will be 3.5. What is needed is a hybrid approach.

## Looking at a Hybrid Approach

The PyCon conference showed a method of hybrid estimation, which I have recreated here. It works by weighting two different values. Training ratings are drawn from viewers who saw the same movie and those who are of similiar age and the same gender and occupation. The approach draws 60% of the predicted rating from the average of the ratings from all other viewers of the same movie. The remaining 40% is drawn from the ratings of all movies for viewers of the same gender and occupation, who are also within 10 users of age of the user whose rating is being predicted. If, at any point, there are no matching users for comparison, then the algorithm replaces the average of the users ratings with the overall mean rating for all movies.

In [27]:
from functools import reduce

overall_mean = movielens_train['rating'].mean()

def hybrid_estimate(user_id, movie_id):
    measures = [
        (0.6, movielens_train[movielens_train.movie_id == movie_id]),
        (0.4, movielens_train[reduce(np.logical_and,
                                     [movielens_train.gender == users.loc[user_id].gender,
                                      movielens_train.occupation == users.loc[user_id].occupation,
                                      abs(movielens_train.age - users.loc[user_id].age) < 10 ])])
    ]
        
    return( sum((weight * ((measure.empty and overall_mean) or measure['rating'].mean())
                for weight, measure in measures)) )

RMSE = evaluate(hybrid_estimate)
print('RMSE for hybrid_estimate: %s.' % RMSE )
print('Percent improvement: %.3f percent.' % (100 * ( (1.091 - RMSE) / 1.091) ) )

RMSE for hybrid_estimate: 1.05625258746.
Percent improvement: 3.185 percent.


The hybrid approach is, indeed, the most effective method. It alone has beat simply assuming that the rating will be 3.5 for accuracy. That said, the improvement is still pretty marginal, at only 3.185%. In the future, I would like to return to this algorithm and attempt to improve it further, but adjusting rates, separating rates out further, bringing collaborative and content approaches together and exploring weighting predictions by their performance.