In [101]:
import tensorflow as tf
import sys
import numpy as np
import pandas as pd
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

class HybridCollabFilter():

    def __init__(self, numUsers, embedding_dim,input_dim):

        # hyper parameters
        self.batch_size = 300
        self.numUsers = numUsers
        self.epochs = 10
        self.init_var =.01

        #Movie Features
        self.movieFeatures = tf.placeholder(tf.float32, shape=(None,input_dim))

        # input tensors for movies, usres, ratings
        self.users = tf.placeholder(tf.int32, shape=(None))
        self.rating = tf.placeholder(tf.float32, shape=(None))

        # embedding matricies for users
        self.userMat = tf.Variable(self.init_var*tf.random_normal([numUsers, embedding_dim]))
        self.userBias = tf.Variable(self.init_var*tf.random_normal([numUsers,]))

        #Model parameters for movies
        self.W = tf.Variable(self.init_var*tf.random_normal([input_dim, embedding_dim]))
        self.b = tf.Variable(self.init_var*tf.random_normal([embedding_dim]))

        movieTensor = tf.matmul(self.movieFeatures,self.W) + self.b

        # map each user/movie to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.u_b = tf.nn.embedding_lookup(self.userBias, self.users)

        # predicted rating is dot product of user and movie
        self.yhat = tf.reduce_sum(tf.mul(self.U, movieTensor) , 1) + self.u_b

        self.cost = tf.nn.l2_loss(self.yhat - self.rating)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=.01).minimize(self.cost)
        
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())


    def train_test_split(self,users,movies,ratings,split=.1):

        shuffle  = np.random.permutation(len(users))

        partition = np.floor(len(users) * (1-split))

        train_idx = shuffle[:partition]
        test_idx = shuffle[partition:]

        users_train = users[train_idx]
        users_test = users[test_idx]

        movies_train = movies[train_idx]
        movies_test = movies[test_idx]

        ratings_train = ratings[train_idx]
        ratings_test = ratings[test_idx]

        return users_train,movies_train,ratings_train , users_test,movies_test,ratings_test


    def train(self, users, movies, ratings,val_freq=5):

        users_train, movies_train, ratings_train, users_test, movies_test, ratings_test = \
            self.train_test_split(users,movies,ratings)

        num_batches = movies_train.shape[0] // self.batch_size

        for i in range(self.epochs):

            avg_cost = 0

            for b_idx in range(num_batches):

                ratings_batch  = ratings_train[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]

                users_batch = users_train[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                movie_batch = movies_train[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]

                avg_cost +=  (self.session.run([self.cost, self.optimizer],
                                             {self.users: users_batch, self.movieFeatures: movie_batch,
                                              self.rating: ratings_batch})[0] ) / self.batch_size



            print ("Epoch: ", i, " Average Cost: ",avg_cost / num_batches)

            if i % val_freq ==0:
                auc_mean = 0
                uni_users = np.unique(users_test)
                for usr in uni_users:
                    usr_idxes = users_test == usr
                    usr_idxes = np.where(usr_idxes)
                    usr_u = users_test[usr_idxes]
                    movie_u = movies_test[usr_idxes]
                    rtg_u = ratings_test[usr_idxes]
                    if len(usr_u) < 3:
                        continue
                    yhat = (self.session.run([self.yhat],
                                             {self.users: usr_u, self.movieFeatures: movie_u,
                                              self.rating: rtg_u})[0] )
                    auc_mean += sklearn.metrics.auc(yhat, rtg_u, reorder = True) / len(uni_users)

                print ("Testing AUC mean: " , auc_mean)
                

    @staticmethod
    def map2idx(movieratings, mergedScrape_ML):

        users = movieratings['userId'].values
        movies = movieratings['movieId'].values

        # unique users / movies
        uni_users = movieratings['userId'].unique()
        uni_movies = mergedScrape_ML['movieId'].unique()

        print len(uni_movies)

        # dict mapping the id to an index
        user_map = dict(zip(uni_users, range(len(uni_users))))
        movie_map = dict(zip(uni_movies, range(len(uni_movies))))

        pairs = []
        for user, movie, rating in zip(users, movies, movieratings['rating']):
            if movie in movie_map:
                pairs.append((user_map[user], movie_map[movie], rating))

        return np.array(pairs), len(uni_users), len(uni_movies)



def featureMatrix(movieData):

    movieData.ix[pd.isnull(movieData[['plot']]).as_matrix()[:,0],'plot'] = ''
    
    vectorizer = CountVectorizer(max_features=200)

    vectorizer.fit(movieData['plot'])
    
    movieFeatures = vectorizer.transform(movieData['plot'])

    return movieFeatures.toarray()


#The first iteration here will be just using plot
if __name__ == '__main__':
    scrapedMovieData = pd.read_csv('movieDataList.csv', index_col=0)

    # Movie Lens rating data
    movieratings = pd.read_csv('ratings.csv')

    # List of movies in order
    movieLenseMovies = pd.read_csv('movies.csv')

    featMat = featureMatrix(scrapedMovieData)

    movieLenseMovies.drop('genres', axis=1, inplace=True)

    mergedScrape_ML = pd.merge(scrapedMovieData, movieLenseMovies, left_on='movie_len_title',
                               right_on='title',
                               how='left')

    mergedScrape_ML.drop_duplicates(subset='movie_len_title', inplace=True)

    #User and movie ids mapped to be on continuous interval
    triples, num_users, num_movie = HybridCollabFilter.map2idx(movieratings,mergedScrape_ML)

    user_idx = triples[:,0]
    movie_idx = triples[ :,1]
    ratings = triples[:, 2]

    movieFeatures = featMat[movie_idx.astype(int)]

    print triples.shape
    print movieFeatures.shape

    #(self, numUsers, embedding_dim,input_dim):
    movieModel = HybridCollabFilter(num_users, 10, 200)
    movieModel.train(user_idx, movieFeatures, ratings)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(24437, 1)
24432


IndexError: index 28 is out of bounds for axis 0 with size 1

In [97]:
featMat


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]])

In [72]:
len(pd.merge(scrapedMovieData, movieLenseMovies, left_on='movie_len_title',
                               right_on='title',
                               how='left')['movie_len_title'].unique())



24432

In [73]:
featMat.shape

(23261, 200)

In [92]:
scrapedMovieData[['plot']]

Unnamed: 0,plot
0,[u'A little boy named Andy loves to be in his ...
1,[u'After being trapped in a jungle board game ...
2,[u'Things don\'t seem to change much in Wabash...
3,"[u""This story based on the best selling novel ..."
4,"[u'In this sequel to ""Father of the Bride"", Ge..."
5,"[u""Hunters and their prey--Neil and his profes..."
6,"[u""While she was growing up, Sabrina Fairchild..."
7,"[u'A mischievous young boy, Tom Sawyer (', u'T..."
8,[u'Some terrorists kidnap the Vice President o...
9,[u'When a deadly satellite weapon system falls...


In [77]:
scrapedMovieData.shape

(24437, 24)

In [81]:
scrapedMovieData.keys()

Index([u'rating', u'runtimes', u'year', u'languages', u'votes', u'producer',
       u'title', u'writer', u'editor', u'certificates', u'country codes',
       u'language codes', u'cover url', u'genres', u'director',
       u'production companies', u'countries', u'plot outline', u'plot',
       u'cast', u'original music', u'full-size cover url', u'movie_len_title',
       u'mpaa'],
      dtype='object')

In [103]:
movieData  = scrapedMovieData
vectorizer = CountVectorizer(max_features=200)


In [105]:

movie_plot = movieData[['plot']]

movieData[['plot']][ pd.isnull(movieData[['plot']]) ] = ""

movie_plot.dropna(inplace=True)

vectorizer.fit(movieData[['plot']])

print movieData[['plot']].shape

movieFeatures = vectorizer.transform(movieData[['plot']])



(24437, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [119]:
movieData.ix[pd.isnull(movieData[['plot']]).as_matrix()[:,0],'plot'] = ''

In [118]:
pd.isnull(movieData[['plot']]).as_matrix()[:,0]

array([False, False, False, ..., False, False, False], dtype=bool)

In [121]:
pd.isnull(movieData['plot']).shape

(24437,)