In [None]:
import tensorflow as tf
import sys
import numpy as np
import pandas as pd
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

class HybridCollabFilter():
    """Hybrid collaborative-filtering rating model built on a TensorFlow graph.

    Every user has a learned embedding vector and a scalar bias. Every movie
    is represented by a linear projection (``W``, ``b``) of its feature
    vector into the same space. The predicted rating is the dot product of
    the user embedding and the projected movie features plus the user bias.
    The model is trained with ``tf.nn.l2_loss`` using Adam.
    """

    def __init__(self, numUsers, embedding_dim_plot, embedding_dim, input_dim):
        """Build the graph, open a session and initialise all variables.

        Args:
            numUsers: number of rows in the user embedding and bias tables.
            embedding_dim_plot: width of the user embedding and of the
                projected movie features.
            embedding_dim: accepted for interface compatibility; currently
                not used by the graph.
            input_dim: number of columns of the movie feature matrix.
        """

        # hyper parameters
        self.batch_size = 300
        self.numUsers = numUsers
        self.epochs = 10
        self.init_var = .01

        # Movie features, fed as a (rows, input_dim) float matrix
        self.movieFeatures = tf.placeholder(tf.float32, shape=(None, input_dim))

        # input tensors for users and ratings; `shape=(None)` is plain None,
        # i.e. the shape is left unspecified
        self.users = tf.placeholder(tf.int32, shape=(None))
        self.rating = tf.placeholder(tf.float32, shape=(None))

        # embedding matrix and bias for users
        self.userMat = tf.Variable(self.init_var * tf.random_normal([numUsers, embedding_dim_plot]))
        self.userBias = tf.Variable(self.init_var * tf.random_normal([numUsers, ]))

        # Model parameters for movies
        self.W = tf.Variable(self.init_var * tf.random_normal([input_dim, embedding_dim_plot]))
        self.b = tf.Variable(self.init_var * tf.random_normal([embedding_dim_plot]))

        # project the raw movie features into the embedding space
        movieTensor = tf.matmul(self.movieFeatures, self.W) + self.b

        # map each user id to its embedding vector and bias
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.u_b = tf.nn.embedding_lookup(self.userBias, self.users)

        # `tf.mul` was renamed `tf.multiply` in TensorFlow 1.0; use whichever
        # name the installed version provides.
        multiply = getattr(tf, 'multiply', None) or tf.mul

        # predicted rating is dot product of user and movie, plus user bias
        self.yhat = tf.reduce_sum(multiply(self.U, movieTensor), 1) + self.u_b

        self.cost = tf.nn.l2_loss(self.yhat - self.rating)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=.01).minimize(self.cost)

        self.session = tf.Session()
        # Same version fallback for the variable initialiser.
        make_init_op = (getattr(tf, 'global_variables_initializer', None)
                        or tf.initialize_all_variables)
        self.session.run(make_init_op())


    def train_test_split(self, users, movies, ratings, split=.1):
        """Randomly split three aligned arrays into train and test parts.

        Args:
            users, movies, ratings: arrays indexable by an integer index
                array along their first axis, all of the same length.
            split: fraction of the examples assigned to the test part.

        Returns:
            ``(users_train, movies_train, ratings_train,
            users_test, movies_test, ratings_test)``.
        """

        shuffle = np.random.permutation(len(users))

        # np.floor returns a float; slice bounds must be integers.
        partition = int(np.floor(len(users) * (1 - split)))

        train_idx = shuffle[:partition]
        test_idx = shuffle[partition:]

        users_train = users[train_idx]
        users_test = users[test_idx]

        movies_train = movies[train_idx]
        movies_test = movies[test_idx]

        ratings_train = ratings[train_idx]
        ratings_test = ratings[test_idx]

        return users_train, movies_train, ratings_train, users_test, movies_test, ratings_test


    def train(self, users, movies, ratings, val_freq=5):
        """Train for ``self.epochs`` epochs and periodically report a test score.

        The data is split once with ``train_test_split``. Each epoch runs
        over full mini-batches only; a trailing partial batch is dropped.
        Every ``val_freq`` epochs a per-user score is computed on the test
        part and its mean is printed.

        Args:
            users: user indices, one per rating.
            movies: movie feature rows, one per rating.
            ratings: observed ratings.
            val_freq: evaluate on the test part every this many epochs.
        """
        # `import sklearn` alone does not guarantee the metrics submodule is
        # loaded, so import it explicitly where it is used.
        import sklearn.metrics

        users_train, movies_train, ratings_train, users_test, movies_test, ratings_test = \
            self.train_test_split(users, movies, ratings)

        num_batches = movies_train.shape[0] // self.batch_size

        for i in range(self.epochs):

            avg_cost = 0

            for b_idx in range(num_batches):
                lo = self.batch_size * b_idx
                hi = self.batch_size * (b_idx + 1)

                feed = {self.users: users_train[lo:hi],
                        self.movieFeatures: movies_train[lo:hi],
                        self.rating: ratings_train[lo:hi]}

                # run one optimisation step and keep only the cost value
                batch_cost = self.session.run([self.cost, self.optimizer], feed)[0]
                avg_cost += batch_cost / self.batch_size

            # max(..., 1) avoids a ZeroDivisionError when there are fewer
            # training rows than one batch.
            print ("Epoch: ", i, " Average Cost: ", avg_cost / max(num_batches, 1))

            if i % val_freq == 0:
                auc_total = 0.0
                users_scored = 0
                for usr in np.unique(users_test):
                    usr_mask = users_test == usr
                    usr_u = users_test[usr_mask]
                    movie_u = movies_test[usr_mask]
                    rtg_u = ratings_test[usr_mask]
                    # too few points for a per-user score
                    if len(usr_u) < 3:
                        continue
                    yhat = (self.session.run([self.yhat],
                                             {self.users: usr_u, self.movieFeatures: movie_u,
                                              self.rating: rtg_u})[0])
                    # Sort by prediction (ties broken by rating) before
                    # integrating; this is the ordering the removed
                    # `reorder=True` option of sklearn.metrics.auc applied.
                    # NOTE(review): this is the trapezoidal area under the
                    # (prediction, rating) curve, not a ROC AUC - confirm
                    # that this is the intended metric.
                    order = np.lexsort((rtg_u, yhat))
                    auc_total += sklearn.metrics.auc(yhat[order], rtg_u[order])
                    users_scored += 1

                # Average over the users actually scored, not over all test
                # users (skipped users used to deflate the mean).
                auc_mean = auc_total / users_scored if users_scored else 0

                print ("Testing AUC mean: ", auc_mean)


    @staticmethod
    def map2idx(movieratings, mergedScrape_ML):
        """Map raw user and movie ids to contiguous zero-based indices.

        Args:
            movieratings: DataFrame with ``userId``, ``movieId`` and
                ``rating`` columns.
            mergedScrape_ML: DataFrame whose ``movieId`` column lists the
                movies to keep; ratings for other movies are dropped.

        Returns:
            ``(triples, num_users, num_movies)`` where ``triples`` is an
            ``(n, 3)`` array of ``(user_index, movie_index, rating)`` rows.
            User indices follow first appearance in ``movieratings`` and
            movie indices follow first appearance in ``mergedScrape_ML``.
        """

        users = movieratings['userId'].values
        movies = movieratings['movieId'].values

        # unique users / movies
        uni_users = movieratings['userId'].unique()
        uni_movies = mergedScrape_ML['movieId'].unique()

        print(len(uni_movies))

        # dict mapping the id to an index
        user_map = dict(zip(uni_users, range(len(uni_users))))
        movie_map = dict(zip(uni_movies, range(len(uni_movies))))

        pairs = []
        for user, movie, rating in zip(users, movies, movieratings['rating']):
            if movie in movie_map:
                pairs.append((user_map[user], movie_map[movie], rating))

        # reshape keeps the result two-dimensional even when nothing matched,
        # so callers can always slice columns
        return np.array(pairs).reshape(-1, 3), len(uni_users), len(uni_movies)


def featureMatrix(movieData, max_features=200):
    """Build a dense bag-of-words feature matrix from the ``plot`` column.

    Args:
        movieData: DataFrame with a ``plot`` column of text. Side effect:
            missing values in every column are replaced in place with ''.
        max_features: vocabulary size cap passed to ``CountVectorizer``.

    Returns:
        A dense array of token counts with one row per row of ``movieData``
        and at most ``max_features`` columns.
    """

    # CountVectorizer cannot tokenise NaN, so blank out missing values first.
    movieData[pd.isnull(movieData)] = ''

    vectorizer = CountVectorizer(max_features=max_features)

    movieFeaturesplot = vectorizer.fit_transform(movieData['plot'])

    # Return the matrix computed above; the previous code referenced an
    # undefined name here.
    return movieFeaturesplot.toarray()


#The first iteration here will be just using plot
if __name__ == '__main__':
    # Scraped per-movie metadata (plot, cast, ...)
    scrapedMovieData = pd.read_csv('movieDataList.csv', index_col=0)

    # MovieLens ratings
    movieratings = pd.read_csv('ratings.csv')

    # MovieLens movie list, in order
    movieLenseMovies = pd.read_csv('movies.csv')

    # Bag-of-words features, one row per scraped movie
    featMat = featureMatrix(scrapedMovieData)

    # The genres column is not needed for the join below
    movieLenseMovies = movieLenseMovies.drop('genres', axis=1)

    # Attach the MovieLens ids to the scraped rows by title, keeping one row per title
    mergedScrape_ML = scrapedMovieData.merge(movieLenseMovies,
                                             how='left',
                                             left_on='movie_len_title',
                                             right_on='title')
    mergedScrape_ML = mergedScrape_ML.drop_duplicates(subset='movie_len_title')

    # User and movie ids mapped onto a contiguous zero-based interval
    triples, num_users, num_movie = HybridCollabFilter.map2idx(movieratings, mergedScrape_ML)

    user_idx, movie_idx, ratings = triples[:, 0], triples[:, 1], triples[:, 2]

    # One feature row per rating.
    # NOTE(review): featMat rows follow scrapedMovieData row order, while
    # movie_idx indexes the unique movieId values of the merged frame -
    # confirm that these two orderings agree.
    movieFeatures = featMat[movie_idx.astype(int)]

    movieModel = HybridCollabFilter(numUsers=num_users,
                                    embedding_dim_plot=5,
                                    embedding_dim=5,
                                    input_dim=200)
    movieModel.train(user_idx, movieFeatures, ratings)



In [140]:
# Reload the scraped movie metadata and replace every missing value with ''.
scrapedMovieData = pd.read_csv('movieDataList.csv', index_col=0)
scrapedMovieData = scrapedMovieData.fillna('')

# movieData is a second name for the same frame, not a copy.
movieData = scrapedMovieData

In [141]:
# Per-column count of missing values remaining after filling with ''.
movieData.fillna('').isnull().sum()

rating                  0
runtimes                0
year                    0
languages               0
votes                   0
producer                0
title                   0
writer                  0
editor                  0
certificates            0
country codes           0
language codes          0
cover url               0
genres                  0
director                0
production companies    0
countries               0
plot outline            0
plot                    0
cast                    0
original music          0
full-size cover url     0
movie_len_title         0
mpaa                    0
dtype: int64

In [142]:
# Raw cast column, one string per movie.
cast = movieData.loc[:, 'cast']

In [143]:
# Sample raw cast string for the parsing experiments
# (presumably the first movie - confirm how the index is labelled).
test_cast = cast[0]

In [144]:
# Prototype of the cast clean-up on a single sample string.
# Split into per-person segments; the piece after the final '>' is dropped.
c = test_cast.split('>')
# Keep the text after the ':_' marker of each segment.
c2 = [ s.split(":_")[1] for s in c[0:-1]]
# Turn commas into underscores and remove spaces so each person is one token.
c3 = [ "_".join(s.split(",")).replace(" ", "") for s in c2]

# Parenthesised form prints the same list under Python 2 and Python 3.
print(c3)

['Hanks_Tom_', 'Allen_Tim_', 'Rickles_Don_', 'Varney_Jim_', 'Shawn_Wallace_', 'Ratzenberger_John_', 'Potts_Annie_', 'Morris_John_', 'vonDetten_Erik_', 'Metcalf_Laurie_', 'Ermey_R.Lee_', 'Freeman_Sarah_', 'Jillette_Penn_', 'Angel_Jack_', 'Aste_Spencer_', 'Berg_Greg_', 'Bradley_Lisa_', 'Cunningham_Kendall_', 'Derryberry_Debi_', 'Dorkin_Cody_', 'Farmer_Bill_', 'Good_Craig_', 'Grudt_Gregory_', 'Judovits_Danielle_', 'Lasseter_Sam_', 'Levenbrown_Brittany_', 'Lynn_Sherry_', 'McAfee_Scott_', 'McGowan_Mickie_', "O'Donohue_Ryan_", 'Pidgeon_Jeff_', 'Pinney_Patrick_', 'Proctor_Phil_', 'Rabson_Jan_', 'Ranft_Joe_', 'Stanton_Andrew_', 'Sweet_Shane_', 'Erbil_MehmetAli_', 'Lane_Nathan_', 'Lasseter_John_', 'Sabella_Ernie_', 'Unkrich_Hannah_', 'Welker_Frank_']


In [149]:
def clean_person_string(raw_text):
    """Turn a raw person-list string into space-separated name tokens.

    The input is split on '>' and the piece after the final '>' is dropped.
    For each remaining segment the text following the first ':_' marker is
    taken as the name, commas are replaced by underscores and spaces are
    removed, so each person becomes a single token.

    Args:
        raw_text: raw string from a people column such as cast or director.
            Empty strings and non-string values (None, NaN) yield ''.

    Returns:
        The cleaned tokens joined by single spaces. Segments without a ':_'
        marker are skipped rather than raising.
    """

    # Missing values arrive as '', None or float NaN; none of them can be parsed.
    if not hasattr(raw_text, 'split') or raw_text == '':
        return ''

    tokens = []
    for segment in raw_text.split('>')[:-1]:
        parts = segment.split(":_")
        if len(parts) < 2:
            # malformed segment without a name marker
            continue
        tokens.append(parts[1].replace(",", "_").replace(" ", ""))

    return " ".join(tokens)

In [150]:
# Clean the raw cast and director columns into space-separated name tokens.
cast_str = movieData['cast'].map(clean_person_string)
director_str = movieData['director'].map(clean_person_string)

# Two-row frame (cast, director) with one column per movie.
people_df = pd.DataFrame([cast_str, director_str])

# Join each column's two strings into a single "people" string per movie.
people_strings = people_df.apply(' '.join, axis=0)


In [152]:
# Inspect the combined cast + director string at entry 0
# (presumably the first movie - confirm how the index is labelled).
people_strings[0]

"Hanks_Tom_ Allen_Tim_ Rickles_Don_ Varney_Jim_ Shawn_Wallace_ Ratzenberger_John_ Potts_Annie_ Morris_John_ vonDetten_Erik_ Metcalf_Laurie_ Ermey_R.Lee_ Freeman_Sarah_ Jillette_Penn_ Angel_Jack_ Aste_Spencer_ Berg_Greg_ Bradley_Lisa_ Cunningham_Kendall_ Derryberry_Debi_ Dorkin_Cody_ Farmer_Bill_ Good_Craig_ Grudt_Gregory_ Judovits_Danielle_ Lasseter_Sam_ Levenbrown_Brittany_ Lynn_Sherry_ McAfee_Scott_ McGowan_Mickie_ O'Donohue_Ryan_ Pidgeon_Jeff_ Pinney_Patrick_ Proctor_Phil_ Rabson_Jan_ Ranft_Joe_ Stanton_Andrew_ Sweet_Shane_ Erbil_MehmetAli_ Lane_Nathan_ Lasseter_John_ Sabella_Ernie_ Unkrich_Hannah_ Welker_Frank_ Lasseter_John_"

In [153]:
# Inspect the cleaned director string at entry 0 for comparison with the combined string.
director_str[0]

'Lasseter_John_'