In [67]:
import pandas as pd
import numpy as np

#### Prepare Movie Data

In [68]:
# import dataset from movie_lens and omdb
#movie_lens data contains the columns movieId, genre, movie_nm, released
#omdb data contains title, actors, directors, year

movies_ml = pd.read_csv('data/movies_clean.csv')
movies_omdb = pd.read_csv('data/all_movies_info.csv')

In [69]:
movies_ml.head()

Unnamed: 0,movieId,genres,movie_nm,released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995


In [70]:
movies_omdb.head()

Unnamed: 0,Title,Actors,Director,Year
0,Home Alone,"Macaulay Culkin, Joe Pesci, Daniel Stern, John...",Chris Columbus,1990
1,Ghost,"Patrick Swayze, Demi Moore, Tony Goldwyn, Stan...",Jerry Zucker,1990
2,Dances with Wolves,"Kevin Costner, Mary McDonnell, Graham Greene, ...",Kevin Costner,1990
3,Pretty Woman,"Richard Gere, Julia Roberts, Ralph Bellamy, Ja...",Garry Marshall,1990
4,"I, the Worst of All","Assumpta Serna, Dominique Sanda, Héctor Alteri...",María Luisa Bemberg,1990


In [71]:
# prepare both datasets to be joined on the Title column

# rename movie_nm to Title in movies_ml dataframe

movies_ml = movies_ml.rename(columns = {'movie_nm': 'Title'})

# apply strip method on the title columns of both datasets 

movies_ml['Title'] = movies_ml['Title'].str.strip()
movies_omdb['Title'] = movies_omdb['Title'].str.strip()

In [72]:
# complete join of dataframes

movies = pd.merge(movies_ml, movies_omdb, how = 'inner',on = 'Title' )

movies.shape

(31779, 7)

In [9]:
# apply strip method on the title columns of both datasets 

movies_ml['Title'] = movies_ml['Title'].str.strip()
movies_omdb['Title'] = movies_omdb['Title'].str.strip()

In [73]:
all_movies = movies.copy()

In [74]:
all_movies = all_movies[['movieId', 'Title', 'genres', 'released', 'Actors', 'Director']]

all_movies.head()

Unnamed: 0,movieId,Title,genres,released,Actors,Director
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
1,2,Jumanji,Adventure|Children|Fantasy,1995,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston
2,3,Grumpier Old Men,Comedy|Romance,1995,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker
4,5,Father of the Bride Part II,Comedy,1995,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer


#### Prepare Ratings Data

In [75]:
ratings = pd.read_csv('data/ratings.csv')

In [76]:
print(ratings.shape)
ratings.head()

(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


#### Reduce the size of ratings dataframe

To reduce the size of the ratings dataframe we needed to decide on a meaningful way to shrink the data. We conducted exploratory analysis and decided to split the data based on users who rated within a range of the mean number of ratings per user.


In [77]:
# count number of ratings per user
user_ratings_grp = pd.DataFrame(ratings.groupby('userId')['rating'].count())

In [78]:
user_ratings_grp.head()

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
1,27
2,22
3,10
4,62
5,26


In [79]:
# find mean of number of ratings per user
avg_nratings = user_ratings_grp['rating'].mean()

avg_nratings

96.06745393065974

In [80]:
# find users who rated within +10 or - 10 of the mean

avg_activity = user_ratings_grp[(user_ratings_grp['rating'] >= 85) & (user_ratings_grp['rating'] <= 105)]
print(avg_activity.shape)

# rename column

avg_activity = avg_activity.rename(columns = {'rating': 'rating_ct'})

avg_activity.head()

(11910, 1)


Unnamed: 0_level_0,rating_ct
userId,Unnamed: 1_level_1
60,105
76,105
89,100
107,87
115,105


In [81]:
# combine avg_activity and ratings to only return the ratings for the people that are 
#in the avg_activity dataframe

new_ratings = pd.merge(avg_activity, ratings, how = 'inner', on = 'userId')

print(new_ratings.shape)
new_ratings.head()

(1133047, 5)


Unnamed: 0,userId,rating_ct,movieId,rating,timestamp
0,60,105,112,2.5,1136304271
1,60,105,153,2.5,1136306947
2,60,105,163,4.0,1136304313
3,60,105,165,4.0,1136306968
4,60,105,168,0.5,1136304366


Combine Ratings and Movies Dataset

In [82]:
mrd = pd.merge(new_ratings, all_movies,how = 'inner', on = 'movieId') 

In [83]:
print(mrd.shape)
mrd.head()

(774593, 10)


Unnamed: 0,userId,rating_ct,movieId,rating,timestamp,Title,genres,released,Actors,Director
0,60,105,112,2.5,1136304271,Rumble in the Bronx,Action|Adventure|Comedy|Crime,1995,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung",Stanley Tong
1,123,99,112,3.0,968570443,Rumble in the Bronx,Action|Adventure|Comedy|Crime,1995,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung",Stanley Tong
2,131,87,112,2.0,852309667,Rumble in the Bronx,Action|Adventure|Comedy|Crime,1995,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung",Stanley Tong
3,595,91,112,5.0,839936904,Rumble in the Bronx,Action|Adventure|Comedy|Crime,1995,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung",Stanley Tong
4,627,85,112,3.0,1136306189,Rumble in the Bronx,Action|Adventure|Comedy|Crime,1995,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung",Stanley Tong


In [84]:
# check for nulls

mrd.isnull().sum()

userId       0
rating_ct    0
movieId      0
rating       0
timestamp    0
Title        0
genres       0
released     0
Actors       0
Director     0
dtype: int64

In [85]:
mrd.groupby('Title')['rating'].mean().head()

Title
$5 a Day                            4.000000
$9.99                               3.833333
'71                                 3.500000
'Hellboy': The Seeds of Creation    3.700000
'Til There Was You                  3.410714
Name: rating, dtype: float64

In [86]:
mrd.groupby('Title')['rating'].mean().sort_values(ascending=False).head()

Title
Sister                5.0
Brown's Requiem       5.0
The Wizard of Lies    5.0
Gifted                5.0
Bush's Brain          5.0
Name: rating, dtype: float64

In [87]:
mrd.groupby('Title')['rating'].count().sort_values(ascending=False).head()

Title
Beauty and the Beast          10872
Terminator 2: Judgment Day     9490
Ghost                          9260
Speed                          6556
Forrest Gump                   6526
Name: rating, dtype: int64

In [88]:
ratingsMeanCount = pd.DataFrame(mrd.groupby('Title')['rating'].mean())
ratingsMeanCount.head()

Unnamed: 0_level_0,rating
Title,Unnamed: 1_level_1
$5 a Day,4.0
$9.99,3.833333
'71,3.5
'Hellboy': The Seeds of Creation,3.7
'Til There Was You,3.410714


In [89]:
ratingsMeanCount['ratingsCounts'] = pd.DataFrame(mrd.groupby('Title')['rating'].count())
ratingsMeanCount.head()

Unnamed: 0_level_0,rating,ratingsCounts
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
$5 a Day,4.0,1
$9.99,3.833333,3
'71,3.5,3
'Hellboy': The Seeds of Creation,3.7,5
'Til There Was You,3.410714,28


In [90]:
user_movie_rating = mrd.pivot_table(index='userId', columns='Title', values='rating')  

In [91]:
user_movie_rating.head()

Title,$5 a Day,$9.99,'71,'Hellboy': The Seeds of Creation,'Til There Was You,10 Cloverfield Lane,10 Items or Less,10 Questions for the Dalai Lama,10 Things I Hate About You,10 Years,...,Zoom,Zootopia,Zus & Zo,Zuzu Angel,[REC] 4: Apocalypse,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: Return of Xander Cage,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60,,,,,,,,,,,...,,,,,,,,,,
76,,,,,,,,,,,...,,,,,,,,,,
89,,,,,,,,,,,...,,,,,,,,,,
107,,,,,,,,,,,...,,,,,,,,,,
115,,,,,,,,,,,...,,,,,,,,,,


### Content Recommendation System

In [92]:
movies_only = mrd[['Title','genres','Director','Actors']]

In [93]:
movies_only.shape

(774593, 4)

In [94]:
movies_only.drop_duplicates(keep = 'first',inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [95]:
movies_only.shape

(7003, 4)

In [96]:
movies_only.head()

Unnamed: 0,Title,genres,Director,Actors
0,Rumble in the Bronx,Action|Adventure|Comedy|Crime,Stanley Tong,"Jackie Chan, Anita Mui, Françoise Yip, Bill Tung"
831,Batman Forever,Action|Adventure|Comedy|Crime,Joel Schumacher,"Val Kilmer, Tommy Lee Jones, Jim Carrey, Nicol..."
3098,Desperado,Action|Romance|Western,Robert Rodriguez,"Antonio Banderas, Salma Hayek, Joaquim de Alme..."
4099,Die Hard: With a Vengeance,Action|Crime|Thriller,John McTiernan,"Bruce Willis, Jeremy Irons, Samuel L. Jackson,..."
8967,First Knight,Action|Drama|Romance,Jerry Zucker,"Sean Connery, Richard Gere, Julia Ormond, Ben ..."


In [103]:
# remove spaces from actors and director names

def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
            

In [104]:
movies_df = movies_only.copy()


In [105]:
credits = ['Actors', 'Director']

In [106]:
for credit in credits:
    movies_df[credit] = movies_df[credit].apply(clean_data)

In [107]:
movies_df.head()

Unnamed: 0,Title,genres,Director,Actors
0,Rumble in the Bronx,Action|Adventure|Comedy|Crime,stanleytong,"jackiechan,anitamui,françoiseyip,billtung"
831,Batman Forever,Action|Adventure|Comedy|Crime,joelschumacher,"valkilmer,tommyleejones,jimcarrey,nicolekidman"
3098,Desperado,Action|Romance|Western,robertrodriguez,"antoniobanderas,salmahayek,joaquimdealmeida,ch..."
4099,Die Hard: With a Vengeance,Action|Crime|Thriller,johnmctiernan,"brucewillis,jeremyirons,samuell.jackson,graham..."
8967,First Knight,Action|Drama|Romance,jerryzucker,"seanconnery,richardgere,juliaormond,bencross"


In [108]:
# remove | from genres

movies_df['genres'] = movies_df['genres'].str.replace("|", " ")

In [109]:
movies_df.head()

Unnamed: 0,Title,genres,Director,Actors
0,Rumble in the Bronx,Action Adventure Comedy Crime,stanleytong,"jackiechan,anitamui,françoiseyip,billtung"
831,Batman Forever,Action Adventure Comedy Crime,joelschumacher,"valkilmer,tommyleejones,jimcarrey,nicolekidman"
3098,Desperado,Action Romance Western,robertrodriguez,"antoniobanderas,salmahayek,joaquimdealmeida,ch..."
4099,Die Hard: With a Vengeance,Action Crime Thriller,johnmctiernan,"brucewillis,jeremyirons,samuell.jackson,graham..."
8967,First Knight,Action Drama Romance,jerryzucker,"seanconnery,richardgere,juliaormond,bencross"


In [110]:
#combine all data 

def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])

In [111]:
movies_df['bag_of_words'] = movies_df.apply(create_soup, axis=1)

In [146]:
movies_df.set_index('Title')

Unnamed: 0_level_0,genres,Director,Actors,bag_of_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rumble in the Bronx,Action Adventure Comedy Crime,stanleytong,"jackiechan,anitamui,françoiseyip,billtung",jackiechan anitamui françoiseyip billtung stan...
Batman Forever,Action Adventure Comedy Crime,joelschumacher,"valkilmer,tommyleejones,jimcarrey,nicolekidman",valkilmer tommyleejones jimcarrey nicolekidman...
Desperado,Action Romance Western,robertrodriguez,"antoniobanderas,salmahayek,joaquimdealmeida,ch...",antoniobanderas salmahayek joaquimdealmeida ch...
Die Hard: With a Vengeance,Action Crime Thriller,johnmctiernan,"brucewillis,jeremyirons,samuell.jackson,graham...",brucewillis jeremyirons samuell.jackson graham...
First Knight,Action Drama Romance,jerryzucker,"seanconnery,richardgere,juliaormond,bencross",seanconnery richardgere juliaormond bencross j...
Mallrats,Comedy Romance,kevinsmith,"shannendoherty,jeremylondon,jasonlee,clairefor...",shannendoherty jeremylondon jasonlee clairefor...
Clerks,Comedy,kevinsmith,"briano'halloran,jeffanderson,marilynghigliotti...",briano'halloran jeffanderson marilynghigliotti...
Hot Shots! Part Deux,Action Comedy War,jimabrahams,"charliesheen,lloydbridges,valeriagolino,richar...",charliesheen lloydbridges valeriagolino richar...
Dead Man,Drama Mystery Western,jimjarmusch,"johnnydepp,garyfarmer,crispinglover,lancehenri...",johnnydepp garyfarmer crispinglover lancehenri...
Die Hard 2,Action Adventure Thriller,rennyharlin,"brucewillis,bonniebedelia,williamatherton,regi...",brucewillis bonniebedelia williamatherton regi...


In [148]:
movies_bow = movies_df[['Title', 'bag_of_words']]

movies_bow.set_index('Title')

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
Rumble in the Bronx,jackiechan anitamui françoiseyip billtung stan...
Batman Forever,valkilmer tommyleejones jimcarrey nicolekidman...
Desperado,antoniobanderas salmahayek joaquimdealmeida ch...
Die Hard: With a Vengeance,brucewillis jeremyirons samuell.jackson graham...
First Knight,seanconnery richardgere juliaormond bencross j...
Mallrats,shannendoherty jeremylondon jasonlee clairefor...
Clerks,briano'halloran jeffanderson marilynghigliotti...
Hot Shots! Part Deux,charliesheen lloydbridges valeriagolino richar...
Dead Man,johnnydepp garyfarmer crispinglover lancehenri...
Die Hard 2,brucewillis bonniebedelia williamatherton regi...


In [117]:
movies_bow.set_index('Title')

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
Rumble in the Bronx,jackiechan anitamui françoiseyip billtung stan...
Batman Forever,valkilmer tommyleejones jimcarrey nicolekidman...
Desperado,antoniobanderas salmahayek joaquimdealmeida ch...
Die Hard: With a Vengeance,brucewillis jeremyirons samuell.jackson graham...
First Knight,seanconnery richardgere juliaormond bencross j...
Mallrats,shannendoherty jeremylondon jasonlee clairefor...
Clerks,briano'halloran jeffanderson marilynghigliotti...
Hot Shots! Part Deux,charliesheen lloydbridges valeriagolino richar...
Dead Man,johnnydepp garyfarmer crispinglover lancehenri...
Die Hard 2,brucewillis bonniebedelia williamatherton regi...


In [149]:
# Import sklearn models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [150]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])


# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [168]:
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_bow['Title'].iloc[movie_indices]

In [129]:
# # Convert movie titles to numerical so they are associated to an ordered numerical. store in a series
# # This will be used in the function to match the indexes of the movies

# indices = pd.Series(movies_df.index)

# #  defining the function that takes in movie title 
# # as input and returns the top 10 recommended movies
# def recommendations(title, cosine_sim = cosine_sim):
    
#     # initializing the empty list of recommended movies
#     recommended_movies = []
    
#     # gettin the index of the movie that matches the title
#     idx = indices[indices == title].index[0]

#     # creating a Series with the similarity scores in descending order
#     score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

#     # getting the indexes of the 10 most similar movies
#     top_10_indexes = list(score_series.iloc[1:11].index)
    
#     # populating the list with the titles of the best 10 matching movies
#     for i in top_10_indexes:
#         recommended_movies.append(list(movies_df.index)[i])
        
#     return recommended_movies

In [170]:
get_recommendations('Last Will')

IndexError: index 774572 is out of bounds for axis 0 with size 7003

In [169]:
indices

Title
Rumble in the Bronx                       0
Batman Forever                          831
Desperado                              3098
Die Hard: With a Vengeance             4099
First Knight                           8967
Mallrats                              10088
Clerks                                10493
Hot Shots! Part Deux                  12244
Dead Man                              13290
Die Hard 2                            13465
Batman Returns                        14097
Donnie Brasco                         14502
Liar Liar                             14937
Grosse Pointe Blank                   15716
Face/Off                              16300
City of Angels                        17175
Cube                                  17371
Happiness                             17613
American History X                    17957
Enemy of the State                    19928
Office Space                          20625
Wing Commander                        21996
Ravenous                  