In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
# Set the DPI matplotlib uses when saving figures to 144.
plt.matplotlib.rcParams['savefig.dpi'] = 144
# NOTE(review): seaborn is never referenced by name in the visible cells;
# presumably it is imported for its plot-styling side effect -- confirm it is still needed.
import seaborn

# Recommendation Engine, Session 1

## Welcome, overview and infrastructure

- Goal of module
- Overview of ipython notebook

## Problem definition and data format

- Source of data
- Place on FS
- Format of data
- Using pandas to read data

In [1]:
import pandas as pd

In [4]:
def parse_movie_line(l):
    """Turn one ``id::title::genre|genre|...`` line into a record dict.

    The year is read from the last space-separated token of the title with
    its first and last characters removed (expected to look like ``(1995)``).

    Returns a dict with keys ``id`` (int), ``title`` (str), ``year`` (int)
    and ``categories`` (list of str).  Raises ValueError when the line does
    not have exactly three ``::``-separated fields or a number fails to parse.
    """
    movie_id, title, genres = l.strip().split('::')
    record = {'id': int(movie_id), 'title': title}
    # Only the final token matters, so split once from the right.
    year_token = title.rsplit(' ', 1)[-1]
    record['year'] = int(year_token[1:-1])
    record['categories'] = genres.split('|')
    return record

# Parse every line of the movies file and index the resulting frame by movie id.
with open('ml-10M100K/movies.dat', 'r') as movies_file:
    df = pd.DataFrame(list(map(parse_movie_line, movies_file))).set_index('id')
    
def parse_tag_line(l):
    """Turn one ``user::movie::tag::timestamp`` line into a record dict.

    The timestamp field is required to be present but is discarded.
    Returns a dict with keys ``user_id`` (int), ``movie_id`` (int) and
    ``tag`` (str).  Raises ValueError when the line does not have exactly
    four ``::``-separated fields or an id is not an integer.
    """
    fields = l.strip().split('::')
    user, movie, label, _timestamp = fields
    return dict(user_id=int(user), movie_id=int(movie), tag=label)

# Parse every line of the tags file into one row per (user, movie, tag).
with open('ml-10M100K/tags.dat', 'r') as tags_file:
    df_tags = pd.DataFrame(list(map(parse_tag_line, tags_file)))

def parse_rating_line(l):
    """Turn one ``user::movie::rating::timestamp`` line into a record dict.

    The timestamp field is required to be present but is discarded.
    Returns a dict with keys ``user_id`` (int), ``movie_id`` (int) and
    ``rating`` (float).  Raises ValueError when the line does not have
    exactly four ``::``-separated fields or a number fails to parse.
    """
    user, movie, stars, _timestamp = l.strip().split('::')
    record = {'user_id': int(user)}
    record['movie_id'] = int(movie)
    record['rating'] = float(stars)
    return record

# Parse every line of the ratings file into one row per (user, movie, rating).
with open('ml-10M100K/ratings.dat', 'r') as ratings_file:
    df_ratings = pd.DataFrame(list(map(parse_rating_line, ratings_file)))

## Feature engineering

- Start with movies and simple features
- Convert categories to one-hot encoding
- Here using year as a feature, but save for an exercise?
- Intro sklearn, transformers, pipeline, feature union

In [6]:
from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

In [7]:
class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer mapping a column of iterables to indicator dicts.

    Every value in column ``col`` becomes ``{item: 1, ...}`` with one key per
    distinct item.  Values that raise TypeError when iterated (or that contain
    unhashable items) become an empty dict.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        return X[self.col].apply(self._as_indicator_dict)

    @staticmethod
    def _as_indicator_dict(items):
        """Return ``{item: 1}`` for each item, or ``{}`` on TypeError."""
        try:
            return dict.fromkeys(items, 1)
        except TypeError:
            return {}

In [8]:
class ColExtractor(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that extracts one column as an (n_rows, 1) array.

    Parameters
    ----------
    col : label of the column to pull out of the frame passed to ``transform``.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        """Return ``X[col]`` as a two-dimensional column vector."""
        # Series.reshape was deprecated and later removed from pandas, so
        # reshape the underlying NumPy array instead of the Series itself.
        return X[self.col].values.reshape(-1, 1)

In [9]:
# Genre lists -> indicator dicts -> indicator columns.
cat_pipe = Pipeline(steps=[
    ('encoder', DictEncoder('categories')),
    ('vectorizer', DictVectorizer()),
])
# Place the year column next to the genre indicator columns.
# Note: `features` is rebound to the transformed matrix in the following cell.
features = FeatureUnion(transformer_list=[
    ('year', ColExtractor('year')),
    ('categories', cat_pipe),
])
# Scale every column to unit variance without centering
# (presumably to keep a sparse matrix sparse -- confirm).
pipe = Pipeline(steps=[
    ('features', features),
    ('scaler', StandardScaler(with_mean=False)),
])

In [10]:
# Fit the pipeline on the movie frame and build the feature matrix.
# This rebinds `features`, which previously named the FeatureUnion.
features = pipe.fit_transform(df)

## Nearest Neighbors

- Find similar movies based on features

In [11]:
# Index the movie feature vectors for 20-nearest-neighbour queries using L1 distance.
nn = NearestNeighbors(n_neighbors=20, metric='l1').fit(features)

In [12]:
# Query the neighbours of the first three rows of the feature matrix.
dists, indices = nn.kneighbors(features[:3])

In [13]:
# Titles of the neighbours of the first queried row; the indices are row positions, hence .iloc.
df.iloc[indices[0]]['title']

id
1                                         Toy Story (1995)
2294                                           Antz (1998)
3114                                    Toy Story 2 (1999)
4016                      Emperor's New Groove, The (2000)
3754        Adventures of Rocky and Bullwinkle, The (2000)
26662    Kiki's Delivery Service (Majo no takkyûbin) (1...
47124                                Ant Bully, The (2006)
45074                                     Wild, The (2006)
53121                               Shrek the Third (2007)
3400                 We're Back! A Dinosaur's Story (1993)
32153                            Once Upon a Forest (1993)
1881                              Quest for Camelot (1998)
5618     Spirited Away (Sen to Chihiro no kamikakushi) ...
5672     Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (...
27731           Cat Returns, The (Neko no ongaeshi) (2002)
6350     Castle in the Sky (Tenkû no shiro Rapyuta) (1986)
2116                         Lord of the Rings, The (

Now include tags

In [14]:
# Collect every tag applied to each movie into one list per movie_id (repeats are kept).
all_tags = df_tags.groupby('movie_id')['tag'].apply(lambda x: x.tolist())

In [15]:
# Attach each movie's tag list as a `tag` column, aligned on the movie id index;
# movies without tags get NaN.  `assign` replaces an existing `tag` column, so
# re-running this cell is safe (a repeated merge would create tag_x / tag_y).
df = df.assign(tag=all_tags)

In [16]:
# Tag lists -> indicator dicts -> indicator columns -> 100 SVD components.
tag_pipe = Pipeline([('encoder', DictEncoder('tag')),
                     ('vectorizer', DictVectorizer()),
                     # Fixed seed: the default randomized SVD solver is otherwise
                     # non-deterministic, so results would change between runs.
                     ('svd', TruncatedSVD(n_components=100, random_state=0))])
# Year, genre indicators and reduced tag components side by side.
# Note: `features` is rebound to the transformed matrix in the following cell.
features = FeatureUnion([('year', ColExtractor('year')),
                         ('categories', cat_pipe),
                         ('tags', tag_pipe)])
# Scale every column to unit variance without centering.
pipe = Pipeline([('features', features),
                 ('scaler', StandardScaler(with_mean=False))])

In [17]:
# Refit on the movie frame (now including the `tag` column) and rebuild the feature matrix.
features = pipe.fit_transform(df)

In [18]:
# Refit the existing neighbour index on the new feature matrix.
nn.fit(features)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l1',
         metric_params=None, n_jobs=1, n_neighbors=20, p=2, radius=1.0)

In [19]:
# Query the neighbours of the first three rows of the new feature matrix.
dists, indices = nn.kneighbors(features[:3])

In [20]:
# Titles of the neighbours of the first queried row, now with tag features included.
df.iloc[indices[0]]['title']

id
1                        Toy Story (1995)
3114                   Toy Story 2 (1999)
4016     Emperor's New Groove, The (2000)
2355                 Bug's Life, A (1998)
5444                 Lilo & Stitch (2002)
595           Beauty and the Beast (1991)
1907                         Mulan (1998)
5109          Return to Never Land (2002)
33615                   Madagascar (2005)
31687       Pooh's Heffalump Movie (2005)
7720          Four Musketeers, The (1974)
1030                 Pete's Dragon (1977)
2092          Return of Jafar, The (1994)
4366     Atlantis: The Lost Empire (2001)
45517                         Cars (2006)
6093             Last Unicorn, The (1982)
1025       Sword in the Stone, The (1963)
596                      Pinocchio (1940)
616                Aristocats, The (1970)
6251            Piglet's Big Movie (2003)
Name: title, dtype: object

In [22]:
# Row positions (within df) of every movie rated by user 142.
# NOTE(review): the user id is hard-coded here and again in later cells.
m_locs = df_ratings[df_ratings.user_id == 142]['movie_id'].apply(lambda x: df.index.get_loc(x))

In [23]:
# The movies this user rated.
df.iloc[m_locs]

Unnamed: 0_level_0,categories,title,year,tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story (1995),1995,"[Pixar, Pixar, Pixar, animation, Pixar, animat..."
14,[Drama],Nixon (1995),1995,"[5, biopic, biography, Historical opinion, Ant..."
25,"[Drama, Romance]",Leaving Las Vegas (1995),1995,"[alcoholism, library, Oscar (Best Actor), addi..."
26,[Drama],Othello (1995),1995,"[based on a play, Shakespearean, Shakespeare, ..."
29,"[Adventure, Drama, Fantasy, Mystery, Sci-Fi]","City of Lost Children, The (Cité des enfants p...",1995,"[Jeunet, visually appealing, beautiful, Dark f..."
32,"[Sci-Fi, Thriller]",12 Monkeys (Twelve Monkeys) (1995),1995,"[biology, genetics, Brad Pitt, Bruce Willis, t..."
36,"[Crime, Drama]",Dead Man Walking (1995),1995,"[death penalty, based on a book, death penalty..."
43,[Drama],Restoration (1995),1995,"[England, 17th century, Nudity (Topless)]"
50,"[Crime, Mystery, Thriller]","Usual Suspects, The (1995)",1995,"[Kevin Spacey, ensemble cast, complicated, mus..."
52,"[Comedy, Drama, Romance]",Mighty Aphrodite (1995),1995,"[Woody Allen, Woody Allen, adoption, prostitut..."


In [24]:
# Unweighted mean feature vector over the movies the user rated.
# NOTE(review): `features` is sparse here (a later cell calls .toarray() on it), so this
# mean is presumably a 1 x n_features numpy matrix -- confirm kneighbors accepts it.
avg_movie = features[m_locs,:].mean(axis=0)

In [25]:
# Find the movies closest to the user's average feature vector.
dists, indices = nn.kneighbors(avg_movie)

In [26]:
# Movies nearest to the user's average feature vector.
df.iloc[indices[0]]

Unnamed: 0_level_0,categories,title,year,tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8698,[Drama],"Comfort of Strangers, The (1990)",1990,[Christopher Walken]
1114,"[Crime, Drama]","Funeral, The (1996)",1996,[Christopher Walken]
3115,[Drama],Flawless (1999),1999,"[men in drag, Philip Seymour Hoffman]"
2673,[Drama],Eternity and a Day (Mia aoniotita kai mia mera...,1998,[Golden Palm]
1687,"[Action, Thriller]","Jackal, The (1997)",1997,"[Bruce Willis, Bruce Willis]"
418,[Drama],Being Human (1993),1993,[Special]
4122,[Drama],Ironweed (1987),1987,[Jack Nicholson]
8887,[Thriller],Whispers in the Dark (1992),1992,[psychiatrist as protagonist]
3010,[Drama],Rosetta (1999),1999,"[belgium, dardenne, dequenne, palm d'or, verit..."
55,[Drama],Georgia (1995),1995,[musicians]


In [27]:
# Distances from the average feature vector to each returned neighbour.
dists

array([[ 43.61149992,  45.17384853,  45.46654876,  45.55864265,
         45.69033453,  45.69456779,  45.72958385,  45.78592218,
         45.84565342,  45.87966462,  45.94489211,  45.94642274,
         45.96428817,  45.99233858,  46.02905425,  46.05216161,
         46.05484741,  46.06187672,  46.09157163,  46.09935714]])

In [28]:
# User 142's ratings multiplied by 0.2, so a rating of 5.0 becomes 1.0.
# The filter matches the one used to build m_locs, so rows line up one-to-one.
scaled_ratings = df_ratings[df_ratings.user_id == 142]['rating'] * 0.2

In [29]:
# Rating-weighted sum of the rated movies' feature vectors, divided by the number
# of ratings.  Series.reshape was deprecated and later removed from pandas, so the
# row vector is built from the underlying NumPy array instead.
weighted_avg_movie = scaled_ratings.values.reshape(1, -1).dot(features[m_locs, :].toarray()) / len(scaled_ratings)

In [30]:
# Replace the neighbour index with one that uses cosine distance and brute-force search.
nn = NearestNeighbors(n_neighbors=20, metric='cosine', algorithm='brute').fit(features)

In [31]:
# Find the movies closest to the rating-weighted average feature vector.
dists, indices = nn.kneighbors(weighted_avg_movie)

In [32]:
# Movies nearest to the rating-weighted average feature vector.
df.iloc[indices[0]]

Unnamed: 0_level_0,categories,title,year,tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8698,[Drama],"Comfort of Strangers, The (1990)",1990,[Christopher Walken]
2673,[Drama],Eternity and a Day (Mia aoniotita kai mia mera...,1998,[Golden Palm]
3010,[Drama],Rosetta (1999),1999,"[belgium, dardenne, dequenne, palm d'or, verit..."
6151,[Drama],Yol (1982),1982,[Golden Palm]
4046,[Drama],Friendly Persuasion (1956),1956,"[Quakers, Quakers, Golden Palm]"
1157,[Drama],"Symphonie pastorale, La (1946)",1946,[Golden Palm]
4122,[Drama],Ironweed (1987),1987,[Jack Nicholson]
5928,[Drama],"Border, The (1982)",1982,[Jack Nicholson]
64716,[Drama],Seven Pounds (2008),2008,[Will Smith]
43556,[Drama],Annapolis (2006),2006,"[Naval story interests me, Naval story interes..."


## Dimensionality Reduction

- Problem: there are too many distinct tags, and similar movies don't necessarily share the same tags.
- So let's combine related tags into a smaller set of components, using SVD for dimensionality reduction.

In [33]:
# Pull the SVD and vectorizer steps out of the tag pipeline to inspect them.
svd = tag_pipe.named_steps['svd']
vect = tag_pipe.named_steps['vectorizer']

In [34]:
# Weights of every tag feature in the first SVD component.
first = svd.components_[0]

In [35]:
# Feature indices with the largest weights in the first component, largest first.
# The slice [:-20:-1] yields 19 entries, not 20.
first.argsort()[:-20:-1]

array([ 6452, 11154,  9727,  8652,  9568, 14277,  5195,  7780,  4527,
        4865,  7496, 13092, 15029,   279, 12376, 13089, 14274,  9480,  7015])

In [36]:
# For each SVD component, join the names of its highest-weighted tags, largest first.
# The slice [:-10:-1] yields 9 tags per component, not 10.
[", ".join([vect.feature_names_[i] for i in c.argsort()[:-10:-1]])
 for c in svd.components_]

["Tumey's DVDs, imdb top 250, erlend's DVDs, classic, dvd, seen more than once, R, based on a book, National Film Registry",
 'R, ClearPlay, Nudity (Topless), movie to see, less than 300 ratings, Nudity (Topless - Brief), owned, based on a book, seen at the cinema',
 "seen more than once, 70mm, action, seen at the cinema, dvd, Futuristmovies.com, franchise, sci-fi, Eric's Dvds",
 'less than 300 ratings, based on a book, National Film Registry, adapted from:book, classic, 70mm, AFI 100, movie to see, AFI 100 (Cheers)',
 'less than 300 ratings, stylized, atmospheric, tense, 70mm, Criterion, quirky, humorous, seen more than once',
 'based on a book, adapted from:book, stylized, atmospheric, Criterion, disturbing, based on book, tense, Nudity (Topless)',
 'comedy, quirky, funny, humorous, seen more than once, satirical, classic, irreverent, witty',
 "Tumey's DVDs, less than 300 ratings, Bibliothek, based on a book, adapted from:book, movie to see, Eric's Dvds, funny, own",
 'movie to see, 

## Collaborative Filtering

- Instead of finding movies like a given movie, find users like a given user, and then report what they liked.
- How do we measure similarity between users?  By the ratings they give.

In [37]:
# Build one {movie_id: rating} dict per user.  Columns are looked up by name
# rather than by tuple position, so the result does not depend on the column
# order of df_ratings.
by_user_ratings = df_ratings.groupby('user_id').apply(
    lambda items: dict(zip(items['movie_id'], items['rating'])))

In [38]:
# One row per user, one column per movie id appearing in the rating dicts.
# This rebinds `features`, which previously held the per-movie feature matrix.
features = DictVectorizer().fit_transform(by_user_ratings)

In [39]:
# New, unfitted 20-nearest-neighbour index using L1 distance.
nn = NearestNeighbors(n_neighbors=20, metric='l1')

In [40]:
# Fit the neighbour index on the per-user rating matrix.
nn.fit(features)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l1',
         metric_params=None, n_jobs=1, n_neighbors=20, p=2, radius=1.0)

In [41]:
# Attach each rating's movie title by joining movie_id against the movie frame's index.
df_ratings_title = df_ratings.merge(df[['title']], left_on='movie_id', right_index=True)

In [42]:
# User id whose ratings are displayed in the next cell.
uid = 142

In [43]:
# This user's ratings, highest first.
df_ratings_title[df_ratings_title['user_id'] == uid].sort_values('rating', ascending=False)

Unnamed: 0,movie_id,rating,user_id,title
16495,111,5.0,142,Taxi Driver (1976)
16504,608,5.0,142,Fargo (1996)
16487,52,4.0,142,Mighty Aphrodite (1995)
16484,36,4.0,142,Dead Man Walking (1995)
16486,50,4.0,142,"Usual Suspects, The (1995)"
16501,457,4.0,142,"Fugitive, The (1993)"
16503,593,4.0,142,"Silence of the Lambs, The (1991)"
16509,661,4.0,142,James and the Giant Peach (1996)
16498,296,4.0,142,Pulp Fiction (1994)
16516,1084,4.0,142,Bonnie and Clyde (1967)


In [44]:
# Find the users whose rating vectors are closest to this user's.  The row is
# looked up through `uid` (set in an earlier cell) rather than repeating the literal id.
dists, indices = nn.kneighbors(features[by_user_ratings.index.get_loc(uid), :])

In [45]:
# Row positions of the nearest users within the per-user rating matrix.
indices

array([[  131,  3708, 18280, 29355, 50439, 50340, 10557, 23433, 68555,
        27571, 35021, 19922, 37308, 36124, 10244, 12794,  6743, 13816,
        30846, 32679]])

In [46]:
# Map neighbour row positions back to user ids, skipping the first hit
# (presumably the query user itself -- confirm).
neighbors = [by_user_ratings.index[pos] for pos in indices[0][1:]]

In [47]:
# Ratings given by the neighbouring users, grouped by movie title.
ratings_grp = (
    df_ratings_title[df_ratings_title['user_id'].isin(neighbors)]
    .groupby('title')[['rating']]
)

In [48]:
def bayes_sum(N, mu):
    """Build an aggregator that shrinks a mean toward a prior value.

    The returned function computes ``(sum(x) + mu*N) / (count(x) + N)``,
    i.e. the mean of ``x`` after adding ``N`` pseudo-observations of value
    ``mu``.  ``x`` must provide ``.sum()`` and ``.count()``.
    """
    def shrunk_mean(x):
        return (mu * N + x.sum()) / (N + x.count())
    return shrunk_mean

# Score each title by its neighbours' ratings shrunk toward 3 with a weight of
# 2 pseudo-ratings, then list the highest-scoring titles first.
ratings_grp.aggregate(bayes_sum(2, 3)).sort_values('rating', ascending=False)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Fargo (1996),4.000000
Star Wars: Episode VI - Return of the Jedi (1983),3.750000
"Postman, The (Postino, Il) (1994)",3.666667
It's a Wonderful Life (1946),3.666667
"Fugitive, The (1993)",3.666667
"Silence of the Lambs, The (1991)",3.666667
Pulp Fiction (1994),3.666667
Toy Story (1995),3.631579
Dead Man Walking (1995),3.529412
White Squall (1996),3.500000


## Exercises

1. Add in the year as a feature.  Should it be continuous or categorical, or something in between?  How does the scale of the year affect the KNN calculation?  Use a scaler to reduce this effect.

2. In weighting reviews, we consider not reviewing a movie to be less of a recommendation than scoring it a one.  Shift the rating scale to change this, and see how that affects the resultant recommendations.

3. Explore different distance metrics.