In [1]:
import pandas as pd
import numpy as np

#### Prepare Movie Data

In [2]:
# import dataset from movie_lens and omdb
#movie_lens data contains the columns movieId, genre, movie_nm, released
#omdb data contains title, actors, directors, year

movies_ml = pd.read_csv('data/movies_clean.csv')
movies_omdb = pd.read_csv('data/all_movies_info.csv')

In [3]:
movies_ml.head()

Unnamed: 0,movieId,genres,movie_nm,released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995


In [4]:
movies_omdb.head()

Unnamed: 0,Title,Actors,Director,Year
0,Home Alone,"Macaulay Culkin, Joe Pesci, Daniel Stern, John...",Chris Columbus,1990
1,Ghost,"Patrick Swayze, Demi Moore, Tony Goldwyn, Stan...",Jerry Zucker,1990
2,Dances with Wolves,"Kevin Costner, Mary McDonnell, Graham Greene, ...",Kevin Costner,1990
3,Pretty Woman,"Richard Gere, Julia Roberts, Ralph Bellamy, Ja...",Garry Marshall,1990
4,"I, the Worst of All","Assumpta Serna, Dominique Sanda, Héctor Alteri...",María Luisa Bemberg,1990


In [5]:
# prepare both datasets to be joined on the Title column

# rename movie_nm to Title in movies_ml dataframe

movies_ml = movies_ml.rename(columns = {'movie_nm': 'Title'})

# apply strip method on the title columns of both datasets 

movies_ml['Title'] = movies_ml['Title'].str.strip()
movies_omdb['Title'] = movies_omdb['Title'].str.strip()

In [6]:
# complete join of dataframes

movies = pd.merge(movies_ml, movies_omdb, how = 'inner',on = 'Title' )

movies.shape

(31779, 7)

In [7]:
all_movies = movies.copy()

In [8]:
all_movies = all_movies[['movieId', 'Title', 'genres', 'released', 'Actors', 'Director']]

all_movies.head()

Unnamed: 0,movieId,Title,genres,released,Actors,Director
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
1,2,Jumanji,Adventure|Children|Fantasy,1995,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston
2,3,Grumpier Old Men,Comedy|Romance,1995,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker
4,5,Father of the Bride Part II,Comedy,1995,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer


#### Prepare Ratings Data

In [9]:
ratings = pd.read_csv('data/ratings.csv')

In [10]:
print(ratings.shape)
ratings.head()

(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


#### Reduce the size of ratings dataframe

To reduce the size of the ratings dataframe we needed to decide on a meaningful way to shrink the data. We conducted exploratory analysis and decided to split the data based on users who rated within a range of the mean number of ratings per user.


In [11]:
# count number of ratings per user
user_ratings_grp = pd.DataFrame(ratings.groupby('userId')['rating'].count())

In [12]:
user_ratings_grp.head()

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
1,27
2,22
3,10
4,62
5,26


In [13]:
user_ratings_grp.describe()

Unnamed: 0,rating
count,270896.0
mean,96.067454
std,205.719606
min,1.0
25%,15.0
50%,30.0
75%,93.0
max,18276.0


In [14]:
# find mean of number of ratings per user
avg_nratings = user_ratings_grp['rating'].mean()

avg_nratings

96.06745393065974

In [15]:
# find mean number of ratings per movie. This will be used to limit the results when the similarity scores 
# are calculated
movie_ratings_grp = pd.DataFrame(ratings.groupby('movieId')['rating'].count())

In [16]:
movie_ratings_grp.describe()

Unnamed: 0,rating
count,45115.0
mean,576.843378
std,3037.380582
min,1.0
25%,2.0
50%,8.0
75%,69.0
max,91921.0


In [17]:
movie_ratings_grp.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,66008
2,26060
3,15497
4,2981
5,15258


In [18]:
# find users who rated within +10 or - 10 of the mean

avg_activity = user_ratings_grp[(user_ratings_grp['rating'] >= 85) & (user_ratings_grp['rating'] <= 150)]
print(avg_activity.shape)

# rename column

avg_activity = avg_activity.rename(columns = {'rating': 'rating_ct'})

avg_activity.head()

(28711, 1)


Unnamed: 0_level_0,rating_ct
userId,Unnamed: 1_level_1
8,113
30,120
53,145
55,133
60,105


In [17]:
# # find users who rated over 

# avg_activity = user_ratings_grp[user_ratings_grp['rating'] >= 500]
# print(avg_activity.shape)

# # rename column

# avg_activity = avg_activity.rename(columns = {'rating': 'rating_ct'})

# avg_activity.head()

(9516, 1)


Unnamed: 0_level_0,rating_ct
userId,Unnamed: 1_level_1
24,634
46,766
120,516
132,572
150,585


In [19]:
# combine avg_activity and ratings to only return the ratings for the people that are 
#in the avg_activity dataframe

new_ratings = pd.merge(avg_activity, ratings, how = 'inner', on = 'userId')

print(new_ratings.shape)
new_ratings.head()

(3245584, 5)


Unnamed: 0,userId,rating_ct,movieId,rating,timestamp
0,8,113,1,4.0,1013443596
1,8,113,7,2.0,1013442976
2,8,113,44,3.0,1013442518
3,8,113,47,4.0,1013443770
4,8,113,170,3.0,1013442544


Combine Ratings and Movies Dataset

In [20]:
mrd = pd.merge(new_ratings, all_movies,how = 'inner', on = 'movieId') 

In [21]:
print(mrd.shape)
mrd.head()

(2186279, 10)


Unnamed: 0,userId,rating_ct,movieId,rating,timestamp,Title,genres,released,Actors,Director
0,8,113,1,4.0,1013443596,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
1,55,133,1,5.0,1037743445,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
2,63,113,1,3.5,1198546021,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
3,74,143,1,5.0,862777274,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
4,76,105,1,3.5,1322344481,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter


In [22]:
# check for nulls

mrd.isnull().sum()

userId       0
rating_ct    0
movieId      0
rating       0
timestamp    0
Title        0
genres       0
released     0
Actors       0
Director     0
dtype: int64

In [23]:
mrd.groupby('Title')['rating'].mean().head()

Title
$5 a Day                            4.000000
$9.99                               3.166667
'71                                 3.900000
'Hellboy': The Seeds of Creation    3.447368
'Til There Was You                  3.201754
Name: rating, dtype: float64

In [24]:
mrd.groupby('Title')['rating'].mean().sort_values(ascending=False).head()

Title
Unconditional Love      5.0
Born Romantic           5.0
Shaitan                 5.0
Jack Goes Boating       5.0
It's a Free World...    5.0
Name: rating, dtype: float64

In [25]:
mrd.groupby('Title')['rating'].count().sort_values(ascending=False).head()

Title
Beauty and the Beast          26960
Terminator 2: Judgment Day    24842
Ghost                         23216
Gladiator                     17894
Iron Man                      17268
Name: rating, dtype: int64

In [26]:
ratingsMeanCount = pd.DataFrame(mrd.groupby('Title')['rating'].mean())
ratingsMeanCount.head()

Unnamed: 0_level_0,rating
Title,Unnamed: 1_level_1
$5 a Day,4.0
$9.99,3.166667
'71,3.9
'Hellboy': The Seeds of Creation,3.447368
'Til There Was You,3.201754


In [27]:
ratingsMeanCount['ratingsCounts'] = pd.DataFrame(mrd.groupby('Title')['rating'].count())
ratingsMeanCount.head()

Unnamed: 0_level_0,rating,ratingsCounts
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
$5 a Day,4.0,1
$9.99,3.166667,6
'71,3.9,10
'Hellboy': The Seeds of Creation,3.447368,19
'Til There Was You,3.201754,57


In [28]:
user_movie_rating = mrd.pivot_table(index='userId', columns='Title', values='rating')  

In [29]:
user_movie_rating.head()

Title,$5 a Day,$9.99,'71,'Hellboy': The Seeds of Creation,'Til There Was You,00 Schneider - Jagd auf Nihil Baxter,009 Re: Cyborg,10 Cloverfield Lane,10 Items or Less,10 Questions for the Dalai Lama,...,Zuzu Angel,Zygote,[REC] 4: Apocalypse,eXistenZ,iBoy,iMurders,loudQUIETloud: A Film About the Pixies,xXx,xXx: Return of Xander Cage,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,2.0,,,,,,
30,,,,,,,,,,,...,,,,2.0,,,,,,
53,,,,,,,,,,,...,,,,,,,,,,
55,,,,,,,,,,,...,,,,,,,,,,
60,,,,,,,,,,,...,,,,,,,,,,


####  Perform Collaborative Filtering

In this section we will find the similarity between movies based on user ratings. First we will run the series of steps on one movie to ensure that the code works at every level. Then, the code will be compiled into a function. 

In [30]:
# find all the users who rated a movie. 

testing_ratings = user_movie_rating['Toy Story']
testing_ratings.head()

userId
8     4.0
30    NaN
53    NaN
55    5.0
60    NaN
Name: Toy Story, dtype: float64

In [31]:
movies_like_toyStory = user_movie_rating.corrwith(testing_ratings)

corr_ts = pd.DataFrame(movies_like_toyStory, columns=['Correlation'])  
corr_ts.dropna(inplace=True)  
corr_ts.head(10)  

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlation
Title,Unnamed: 1_level_1
'Hellboy': The Seeds of Creation,0.208623
'Til There Was You,-0.033811
10 Cloverfield Lane,0.332993
10 Items or Less,0.053483
10 Things I Hate About You,0.087516
"10,000 BC",-0.009693
100 Feet,-1.0
100 Girls,0.029156
101 Dalmatians,0.322301
102 Dalmatians,0.175695


In [32]:
# Rearrenge the correlational outcome we computed above in descending order

corr_ts.sort_values('Correlation', ascending=False).head(10)  

Unnamed: 0_level_0,Correlation
Title,Unnamed: 1_level_1
Tube Tales,1.0
All the Queen's Men,1.0
Mind Game,1.0
Raw,1.0
American Me,1.0
We Are the Best!,1.0
Ratcatcher,1.0
A Ghost Story,1.0
King Arthur: Legend of the Sword,1.0
Love the Beast,1.0


In [33]:

corr_ts.reset_index(inplace=True)
corr_ts.head()

Unnamed: 0,Title,Correlation
0,'Hellboy': The Seeds of Creation,0.208623
1,'Til There Was You,-0.033811
2,10 Cloverfield Lane,0.332993
3,10 Items or Less,0.053483
4,10 Things I Hate About You,0.087516


In [34]:
corr_ts.count()

Title          4584
Correlation    4584
dtype: int64

In [35]:
# Improve results by eliminating movies that have a low number of ratings.

corr_ts = corr_ts.merge(ratingsMeanCount, on='Title', how='inner') 
corr_ts.head()

Unnamed: 0,Title,Correlation,rating,ratingsCounts
0,'Hellboy': The Seeds of Creation,0.208623,3.447368,19
1,'Til There Was You,-0.033811,3.201754,57
2,10 Cloverfield Lane,0.332993,3.742775,173
3,10 Items or Less,0.053483,3.014286,35
4,10 Things I Hate About You,0.087516,3.598327,3946


In [36]:
corr_ts = corr_ts.drop(['rating'], axis=1)
corr_ts.head()

Unnamed: 0,Title,Correlation,ratingsCounts
0,'Hellboy': The Seeds of Creation,0.208623,19
1,'Til There Was You,-0.033811,57
2,10 Cloverfield Lane,0.332993,173
3,10 Items or Less,0.053483,35
4,10 Things I Hate About You,0.087516,3946


In [37]:
corr_ts[corr_ts ['ratingsCounts']>50].sort_values('Correlation', ascending=False).head()

Unnamed: 0,Title,Correlation,ratingsCounts
4172,Toy Story,1.0,11055
2574,Max,1.0,80
4222,Trumbo,1.0,54
2024,Inferno,0.970143,54
313,August,0.884652,60


In [38]:
# function compiled from all the previous steps

def rec_movies(movie):
    top_10 = []
    curr_movie = user_movie_rating[movie]
    curr_movie_results = user_movie_rating.corrwith(curr_movie)
    corr_df = pd.DataFrame(curr_movie_results, columns=['Correlation'])  
    corr_df.dropna(inplace=True)  
    corr_df = corr_df.sort_values('Correlation', ascending=False) 
    corr_df.reset_index(inplace=True)
    corr_df = corr_df.merge(ratingsMeanCount, on='Title', how='inner') 
    rec_results = corr_df[corr_df ['ratingsCounts']>50].sort_values('Correlation', ascending=False)
    top_10 = rec_results[:10]
    return top_10

In [39]:
rec_movies('Toy Story')

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0,Title,Correlation,rating,ratingsCounts
50,Toy Story,1.0,3.919313,11055
230,Max,1.0,3.5625,80
235,Trumbo,1.0,3.796296,54
259,Inferno,0.970143,3.722222,54
320,August,0.884652,3.0,60
353,Lee Daniels' The Butler,0.852803,3.87931,58
366,Cargo,0.827144,3.382353,85
372,The Island,0.814862,3.75,52
384,Head Over Heels,0.792458,3.397436,117
395,Inherent Vice,0.778784,3.362745,51


### Content Recommendation System

In [40]:
movies_only = mrd[['Title','genres','Director','Actors']]

In [41]:
movies_only.shape

(2186279, 4)

In [42]:
movies_only.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
movies_only.isnull().sum()

Title       0
genres      0
Director    0
Actors      0
dtype: int64

In [44]:
movies_only.shape

(9264, 4)

In [45]:
movies_only.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney"
11055,Sabrina,Comedy|Romance,Billy Wilder,"Humphrey Bogart, Audrey Hepburn, William Holde..."
13181,Mortal Kombat,Action|Adventure|Fantasy,Paul W.S. Anderson,"Christopher Lambert, Robin Shou, Linden Ashby,..."
15309,Hackers,Action|Adventure|Crime|Thriller,Iain Softley,"Jonny Lee Miller, Angelina Jolie, Jesse Bradfo..."
16798,Waterworld,Action|Adventure|Sci-Fi,Kevin Reynolds,"Kevin Costner, Chaim Jeraffi, Rick Aviles, R.D..."


In [123]:
movies_only.to_csv('data/movies_only.csv', index = False)

Content System based on Cosine Similarity

In [46]:
movies_df = movies_only.copy()

In [47]:
# Clean data

# remove spaces from actors and director names so that the system will not mix up actors or directors
# where the first name or last name matches

def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [48]:
credits = ['Actors', 'Director']

# apply function to each column

for credit in credits:
    movies_df[credit] = movies_df[credit].apply(clean_data)
    
movies_df.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney"
11055,Sabrina,Comedy|Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal..."
13181,Mortal Kombat,Action|Adventure|Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-..."
15309,Hackers,Action|Adventure|Crime|Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat..."
16798,Waterworld,Action|Adventure|Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call"


In [49]:
genres = movies_df['genres'].str.get_dummies(sep ='|')

In [51]:
actors = movies_df['Actors'].str.get_dummies(sep = ',')

In [52]:
actors.head()

Unnamed: 0,'snub'pollard,50cent,7yearbitch,a.j.cook,a.j.langer,a.michaelbaldwin,aaliyah,aamirkhan,aaranthomas,aarneaksila,...,óscarjaenada,özgeözberk,özgünamal,özkanugur,öznurkula,ørjangamst,øyvinbangberven,úlfurægisson,þrösturleógunnarsson,þrúðurkristjánsdóttir
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15309,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# join genres the main dataframe

movies_w_gdummies = pd.merge(movies_df, genres, how = 'inner', left_index = True, right_index = True)

In [58]:
# join actors to the movies_w_gdummies_df

movies_w_dummies = pd.merge(movies_w_gdummies, actors, how = 'inner', left_index = True, right_index = True)

In [59]:
movies_w_dummies.head()

Unnamed: 0,Title,genres,Director,Actors,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,óscarjaenada,özgeözberk,özgünamal,özkanugur,öznurkula,ørjangamst,øyvinbangberven,úlfurægisson,þrösturleógunnarsson,þrúðurkristjánsdóttir
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney",0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11055,Sabrina,Comedy|Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13181,Mortal Kombat,Action|Adventure|Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-...",0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15309,Hackers,Action|Adventure|Crime|Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat...",0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,Waterworld,Action|Adventure|Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call",0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
movie_dummies = movies_w_dummies.copy()

In [72]:
movie_dummies.drop(['genres', 'Actors', '(no genres listed)'], axis=1, inplace =True)

In [73]:
movie_dummies.head()

Unnamed: 0,Title,Director,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,émilegaudreault,éricrohmer,étiennechatiliez,étiennefaure,óskarjónasson,óskarthóraxelsson,ömerfaruksorak,ömervargi,özerkiziltan,"özhaneren,muratsaraçoglu"
0,Toy Story,johnlasseter,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11055,Sabrina,billywilder,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13181,Mortal Kombat,paulw.s.anderson,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15309,Hackers,iainsoftley,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,Waterworld,kevinreynolds,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# get dummies for director

In [74]:
directors = movie_dummies['Director'].str.get_dummies()

In [75]:
directors.head()

Unnamed: 0,50cent,a.b.stone,a.j.edwards,a.m.lukas,a.r.murugadoss,"aamirkhan,amolegupte",aanandl.rai,"aaronaites,audreyewell","aaronblaise,robertwalker",aaroncassara,...,émilegaudreault,éricrohmer,étiennechatiliez,étiennefaure,óskarjónasson,óskarthóraxelsson,ömerfaruksorak,ömervargi,özerkiziltan,"özhaneren,muratsaraçoglu"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15309,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# join directors to movie_dummies

movie_dummies = pd.merge(movie_dummies, directors, how = 'inner', left_index = True, right_index = True)

In [79]:
movie_dummies.head()

Unnamed: 0,Title,Director,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,émilegaudreault_y,éricrohmer_y,étiennechatiliez_y,étiennefaure_y,óskarjónasson_y,óskarthóraxelsson_y,ömerfaruksorak_y,ömervargi_y,özerkiziltan_y,"özhaneren,muratsaraçoglu_y"
0,Toy Story,johnlasseter,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11055,Sabrina,billywilder,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13181,Mortal Kombat,paulw.s.anderson,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15309,Hackers,iainsoftley,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,Waterworld,kevinreynolds,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
movie_dummies.drop(['Director'], axis=1, inplace =True)

In [82]:
movie_dummies.head()

Unnamed: 0,Title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,émilegaudreault_y,éricrohmer_y,étiennechatiliez_y,étiennefaure_y,óskarjónasson_y,óskarthóraxelsson_y,ömerfaruksorak_y,ömervargi_y,özerkiziltan_y,"özhaneren,muratsaraçoglu_y"
0,Toy Story,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11055,Sabrina,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13181,Mortal Kombat,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
15309,Hackers,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16798,Waterworld,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
movie_dummies.to_csv('data/movies_w_dummies.csv', index = True)

In [84]:
from sklearn.metrics.pairwise import cosine_similarity



In [85]:
test_movie = movie_dummies.iloc[1]

test_movie

Title                         Sabrina
Action                              0
Adventure                           0
Animation                           0
Children                            0
Comedy                              1
Crime                               0
Documentary                         0
Drama                               0
Fantasy                             0
Film-Noir                           0
Horror                              0
IMAX                                0
Musical                             0
Mystery                             0
Romance                             1
Sci-Fi                              0
Thriller                            0
War                                 0
Western                             0
'snub'pollard                       0
50cent_x                            0
7yearbitch                          0
a.j.cook                            0
a.j.langer                          0
a.michaelbaldwin                    0
aaliyah     

Content System based on Bag of Words Method

In [89]:
# remove spaces from actors and director names

def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
            

In [91]:
movies_alt = movies_only.copy()

movies_alt.shape

(9264, 4)

In [92]:
credits = ['Actors', 'Director']

for credit in credits:
    movies_alt[credit] = movies_alt[credit].apply(clean_data)

In [94]:
movies_alt.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney"
11055,Sabrina,Comedy|Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal..."
13181,Mortal Kombat,Action|Adventure|Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-..."
15309,Hackers,Action|Adventure|Crime|Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat..."
16798,Waterworld,Action|Adventure|Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call"


In [99]:
# remove | from genres

movies_alt['genres'] = movies_alt['genres'].str.replace("|", " ")

In [100]:
movies_alt.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure Animation Children Comedy Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney"
11055,Sabrina,Comedy Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal..."
13181,Mortal Kombat,Action Adventure Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-..."
15309,Hackers,Action Adventure Crime Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat..."
16798,Waterworld,Action Adventure Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call"


In [101]:
#combine all data 

def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])

In [102]:
movies_alt['bag_of_words'] = movies_alt.apply(create_soup, axis=1)

In [103]:
movies_alt.set_index('Title')

Unnamed: 0_level_0,genres,Director,Actors,bag_of_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story,Adventure Animation Children Comedy Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney",tomhanks timallen donrickles jimvarney johnlas...
Sabrina,Comedy Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal...",humphreybogart audreyhepburn williamholden wal...
Mortal Kombat,Action Adventure Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-...",christopherlambert robinshou lindenashby cary-...
Hackers,Action Adventure Crime Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat...",jonnyleemiller angelinajolie jessebradford mat...
Waterworld,Action Adventure Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call",kevincostner chaimjeraffi rickaviles r.d.call ...
Beverly Hills Cop III,Action Comedy Crime Thriller,johnlandis,"eddiemurphy,jontenney,joeytravolta,eugenecollier",eddiemurphy jontenney joeytravolta eugenecolli...
Tombstone,Action Drama Western,"georgep.cosmatos,kevinjarre","kurtrussell,valkilmer,samelliott,billpaxton",kurtrussell valkilmer samelliott billpaxton ge...
Courage Under Fire,Action Crime Drama War,edwardzwick,"denzelwashington,megryan,loudiamondphillips,mi...",denzelwashington megryan loudiamondphillips mi...
Ransom,Crime Thriller,ronhoward,"melgibson,renerusso,brawleynolte,garysinise",melgibson renerusso brawleynolte garysinise ro...
Tin Cup,Comedy Drama Romance,ronshelton,"kevincostner,renerusso,donjohnson,cheechmarin",kevincostner renerusso donjohnson cheechmarin ...


In [109]:
movies_bow = movies_alt[['Title', 'bag_of_words']]

movies_bow.set_index('Title', inplace=True)

In [110]:
# Import sklearn models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [111]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])


# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [116]:
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_bow['Title'].iloc[movie_indices]

KeyError: 'Title'

In [129]:
# # Convert movie titles to numerical so they are associated to an ordered numerical. store in a series
# # This will be used in the function to match the indexes of the movies

# indices = pd.Series(movies_df.index)

# #  defining the function that takes in movie title 
# # as input and returns the top 10 recommended movies
# def recommendations(title, cosine_sim = cosine_sim):
    
#     # initializing the empty list of recommended movies
#     recommended_movies = []
    
#     # gettin the index of the movie that matches the title
#     idx = indices[indices == title].index[0]

#     # creating a Series with the similarity scores in descending order
#     score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

#     # getting the indexes of the 10 most similar movies
#     top_10_indexes = list(score_series.iloc[1:11].index)
    
#     # populating the list with the titles of the best 10 matching movies
#     for i in top_10_indexes:
#         recommended_movies.append(list(movies_df.index)[i])
        
#     return recommended_movies

In [117]:
get_recommendations('Toy Story')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [169]:
indices

Title
Rumble in the Bronx                       0
Batman Forever                          831
Desperado                              3098
Die Hard: With a Vengeance             4099
First Knight                           8967
Mallrats                              10088
Clerks                                10493
Hot Shots! Part Deux                  12244
Dead Man                              13290
Die Hard 2                            13465
Batman Returns                        14097
Donnie Brasco                         14502
Liar Liar                             14937
Grosse Pointe Blank                   15716
Face/Off                              16300
City of Angels                        17175
Cube                                  17371
Happiness                             17613
American History X                    17957
Enemy of the State                    19928
Office Space                          20625
Wing Commander                        21996
Ravenous                  