In [1]:
import pandas as pd
import numpy as np
import random
import joblib

#### Purpose of Notebook

This notebook was created to create sample datasets to use within a heroku app.New files created are:
1. ratings_demo
2. movies_demo
3. ratings_demo_mx
4. Vectors (U, sigma, Vt) which will be used to calculate dot product in function and user predictions when the function is run.
5. movies_bow_demo

In [76]:
movies = pd.read_csv('Data/movies_sml.csv')
ratings = pd.read_hdf('Data/ratings_hdf.h5')

In [77]:
movies.head()

Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,Adventure|Children|Fantasy,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston,When two kids find and play a magical board ga...,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men,Comedy|Romance,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,Comedy|Drama|Romance,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,Comedy,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer,George Banks must deal not only with the pregn...,https://m.media-amazon.com/images/M/MV5BOTEyNz...


#### Shrink file sizes further for Heroku app

In [40]:
ratings.shape

(2892878, 5)

In [41]:
movies.shape

(11102, 7)

In [78]:
#shrink ratings for demo purposes
ratings_demo = ratings.sample(frac=0.01, random_state=1)

print(ratings_demo.shape)
ratings_demo.head()

(28929, 5)


Unnamed: 0,userId,num_user_rated,movieId,Title,rating
573437,85983,132,2716,Ghostbusters,3.0
7500,71558,150,110,Braveheart,3.5
2623282,220354,80,218,Boys on the Side,4.0
498686,243570,134,410,Addams Family Values,3.5
414056,260431,137,93510,21 Jump Street,3.5


In [79]:
user_count = ratings_demo['userId'].unique()
print(len(user_count))

17664


In [80]:
# create new ratings file
ratings_demo.to_hdf('Data/ratings_demo.h5', key='ratings_demo',complib='blosc',complevel=9, mode='w' )

In [81]:
# find only the movies rated by the users in the smaller ratings data set

movies_demo = pd.merge(ratings_demo, movies, how = 'inner', on = 'movieId')

movies_demo.head()

Unnamed: 0,userId,num_user_rated,movieId,Title_x,rating,Title_y,genres,Actors,Director,Plot,Poster
0,85983,132,2716,Ghostbusters,3.0,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...
1,181815,143,2716,Ghostbusters,5.0,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...
2,114424,115,2716,Ghostbusters,3.0,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...
3,235740,136,2716,Ghostbusters,1.0,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...
4,119804,128,2716,Ghostbusters,4.5,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...


In [82]:

movies_demo = movies_demo[['movieId','Title_x','genres','Actors', 'Director', 'Plot', 'Poster' ]]

movies_demo = movies_demo.rename(columns = {'Title_x': 'Title'})

# drop duplicates

movies_demo.drop_duplicates(subset ="movieId", 
                     keep = 'first', inplace = True) 

print(movies_demo.shape)


(3199, 7)


In [83]:
movies_demo.drop_duplicates(subset ="Title", 
                     keep = 'first', inplace = True) 

In [84]:
print(movies_demo.shape)


(3094, 7)


In [85]:
movies_demo.to_csv('Data/movies_demo.csv', index=False)

In [86]:
movies_demo.head()

Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster
0,2716,Ghostbusters,Action|Comedy|Sci-Fi,"Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",Ivan Reitman,Three former parapsychology professors set up ...,https://m.media-amazon.com/images/M/MV5BMTkxMj...
88,110,Braveheart,Action|Drama|War,"James Robinson, Sean Lawlor, Sandy Nelson, Jam...",Mel Gibson,When his secret bride is executed for assaulti...,https://m.media-amazon.com/images/M/MV5BMzkzMm...
219,218,Boys on the Side,Comedy|Drama,"Whoopi Goldberg, Mary-Louise Parker, Drew Barr...",Herbert Ross,"Jane is a night club singer, out of work. Robi...",https://m.media-amazon.com/images/M/MV5BMDlhZG...
236,410,Addams Family Values,Children|Comedy|Fantasy,"Anjelica Huston, Raul Julia, Christopher Lloyd...",Barry Sonnenfeld,The Addams Family try to rescue their beloved ...,https://m.media-amazon.com/images/M/MV5BZWFhNj...
275,93510,21 Jump Street,Action|Comedy|Crime,"Jonah Hill, Channing Tatum, Brie Larson, Dave ...","Phil Lord, Christopher Miller",A pair of underachieving cops are sent back to...,https://m.media-amazon.com/images/M/MV5BMTc3Nz...


#### Create new Bag of Words csv
New movies_bow csv created based on smaller dataset

In [87]:
movies_bow = movies_demo.copy()

In [88]:
# function to switch all words to lower and remove spaces
def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [89]:
credits = ['Actors', 'Director']

for credit in credits:
    movies_bow[credit] = movies_bow[credit].apply(clean_data)

In [90]:
# remove | from genres

movies_bow['genres'] = movies_bow['genres'].str.replace("|", " ")

In [91]:
#### Combining data 

def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])

In [92]:
movies_bow['bag_of_words'] = movies_bow.apply(create_soup, axis=1)

In [93]:
movies_bow = movies_bow[['movieId','Title','bag_of_words']]
movies_bow.to_csv('Data/movies_bow_demo.csv', index=False)
movies_bow.head()

Unnamed: 0,movieId,Title,bag_of_words
0,2716,Ghostbusters,billmurray danaykroyd sigourneyweaver haroldra...
88,110,Braveheart,jamesrobinson seanlawlor sandynelson jamescosm...
219,218,Boys on the Side,whoopigoldberg mary-louiseparker drewbarrymore...
236,410,Addams Family Values,anjelicahuston rauljulia christopherlloyd joan...
275,93510,21 Jump Street,jonahhill channingtatum brielarson davefranco ...


#### Perform part of the SVD model process
All of the vectors are computed and pickled so that less time will be spent on data processing when the app is being run

In [94]:
# pivot ratings into movie features
ratings_demo_matrix = ratings_demo.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [95]:
# create ratings_demo_matrix for heroku app

ratings_demo_matrix.to_hdf('Data/ratings_demo_mx.h5', key='ratings_demo_matrix',complib='blosc',complevel=9, mode='w' )

In [96]:
# normalize the data by each users mean and convert it from a dataframe to a numpy array.

R = ratings_demo_matrix.values
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [97]:
# Compute the largest k singular values/vectors for a sparse matrix

from scipy.sparse.linalg import svds

U, sigma, Vt = svds(Ratings_demeaned, k = 50)

In [98]:
# convert sigma to a diagonal matrix

sigma = np.diag(sigma)

In [100]:
# calculate the dot product of our vectors
user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [101]:
preds_demo = pd.DataFrame(user_predicted_ratings, columns = ratings_demo_matrix.columns)

preds_demo.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,168122,168250,168252,168254,168418,168554,169400,169982,170875,172133
0,-0.000325,0.000262,0.000265,0.00032,0.00027,0.001397,0.000308,0.00031,0.000307,-0.000235,...,0.000314,0.000314,0.000309,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314,0.000314
1,-0.00028,0.000989,0.00095,0.001101,0.002057,0.004269,0.000905,0.001146,0.001094,-0.005031,...,0.001158,0.001161,0.0011,0.001159,0.001161,0.001162,0.001162,0.001162,0.001162,0.00116
2,-0.007193,-0.004388,-0.001899,-0.000582,0.001318,0.006111,-0.001896,-0.000733,-0.000555,-0.005045,...,-0.000701,-0.000697,-0.001072,-0.000735,-0.000693,-0.000683,-0.000689,-0.000677,-0.000683,-0.000713
3,-0.001498,0.002721,8.1e-05,0.000616,-0.00056,0.001622,0.000544,0.000555,0.000816,-0.018101,...,0.000728,0.000751,0.000696,0.000735,0.000752,0.000756,0.000754,0.000759,0.000756,0.000749
4,-0.001659,0.001585,0.002128,0.002359,0.002302,0.002921,0.002219,0.002357,0.002353,-0.001535,...,0.002384,0.002387,0.002357,0.002388,0.002386,0.002386,0.002386,0.002386,0.002386,0.002385


In [102]:
preds_demo.to_hdf('Data/preds_demo.h5', key='preds_demo',complib='blosc',complevel=9, mode='w' )

In [106]:
exist_users = ratings_demo['userId'].unique().tolist()

exist_users

[85983,
 71558,
 220354,
 243570,
 260431,
 37362,
 6079,
 259726,
 194783,
 72952,
 221568,
 253896,
 269434,
 29387,
 215279,
 36842,
 148505,
 78724,
 142893,
 180557,
 93270,
 172801,
 112881,
 141312,
 162820,
 79468,
 181815,
 34249,
 73470,
 162318,
 79788,
 294,
 64794,
 92111,
 256105,
 128643,
 248298,
 220101,
 67973,
 42950,
 250102,
 228858,
 126990,
 213560,
 132964,
 24232,
 90904,
 28416,
 38340,
 70946,
 2380,
 87481,
 214247,
 259781,
 113435,
 146218,
 200079,
 217195,
 86885,
 96826,
 5461,
 136557,
 82481,
 179894,
 11071,
 213882,
 196323,
 1283,
 18651,
 109244,
 216880,
 148003,
 124859,
 77614,
 244510,
 66425,
 55101,
 199549,
 47687,
 183899,
 174778,
 196128,
 66255,
 9007,
 51488,
 68205,
 95592,
 108875,
 25654,
 203918,
 253072,
 14180,
 227487,
 196484,
 55893,
 217395,
 142087,
 196493,
 208473,
 267779,
 168067,
 174090,
 142945,
 238061,
 191553,
 15824,
 16509,
 85871,
 190390,
 229168,
 112602,
 49318,
 28670,
 216922,
 51419,
 113866,
 252287,
 254

In [107]:
exist_movies = pd.DataFrame(movies_demo['Title'])
exist_movies

exist_movies.to_csv('Data/sample_movies.csv', index=False)