In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
def get_title_from_index(index):
    """Return the title of the movie whose row label in ``df`` equals ``index``.

    The lookup is done against the module-level DataFrame ``df`` and matches
    on the DataFrame's own index labels (``df.index``), not on the ``"index"``
    column.
    NOTE(review): callers pass positions taken from the similarity matrix, so
    this presumably relies on ``df`` keeping its default 0..n-1 index - confirm.

    Parameters
    ----------
    index : label
        Row label to look up in ``df.index``.

    Returns
    -------
    The ``"title"`` value of the first row whose label equals ``index``.

    Raises
    ------
    IndexError
        If no row of ``df`` carries that label.
    """
    titles = df.loc[df.index == index, "title"]
    if titles.empty:
        # Same exception type as indexing an empty result, but with a message
        # that names the offending lookup value.
        raise IndexError(f"no movie with index {index!r} in df")
    return titles.values[0]

def get_index_from_title(title):
    """Return the ``"index"`` column value of the movie titled ``title``.

    The lookup is done against the module-level DataFrame ``df`` using an
    exact, case-sensitive comparison with the ``"title"`` column. When several
    rows share the same title, the value of the first matching row is returned.

    Parameters
    ----------
    title : str
        Exact movie title to search for.

    Returns
    -------
    The ``"index"`` value of the first row whose title equals ``title``.

    Raises
    ------
    IndexError
        If no row of ``df`` has that title.
    """
    matches = df.loc[df["title"] == title, "index"]
    if matches.empty:
        # Same exception type as indexing an empty result, but with a message
        # that names the title that could not be found.
        raise IndexError(f"no movie titled {title!r} in df")
    return matches.values[0]

In [5]:
## Step 1: Read CSV File
# Location of the movie dataset that the whole notebook works on.
MOVIE_DATASET_URL = "https://raw.githubusercontent.com/codeheroku/Introduction-to-Machine-Learning/master/Building%20a%20Movie%20Recommendation%20Engine/movie_dataset.csv"

df = pd.read_csv(MOVIE_DATASET_URL)
# List the available columns so the features used later can be chosen.
print(df.columns)

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


In [7]:
# Inspect the keywords column. The recorded output shows NaN entries; these
# are replaced with empty strings in Step 3 before the text is vectorized.
df['keywords']

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2              spy based on novel secret agent sequel mi6
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799                                                  NaN
4800    date love at first sight narration investigati...
4801                                                  NaN
4802                 obsession camcorder crush dream girl
Name: keywords, Length: 4803, dtype: object

In [112]:
# Sanity check: is there a movie whose title is exactly 'Alien'?
# (membership test against the raw values of the title column)
'Alien' in df['title'].values

True

In [113]:
## Step 2: Select Features
# Names of the df columns whose text is used to compare movies.
features = ["keywords"]

In [114]:
## Step 3: Create a column in DF which combines all selected features
# Missing values become empty strings so every selected column can be
# concatenated as text.
for feature in features:
    df[feature] = df[feature].fillna('')


def combine_features(row):
    """Join the selected feature values of one row into a single string.

    The columns listed in the module-level ``features`` are read from ``row``
    in order, converted with ``str`` and separated by single spaces. With one
    selected feature the result is simply that column's text.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The space-separated text of the selected features.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the selected feature columns. The error is
        raised here rather than swallowed, because a silently returned
        ``None`` would only fail later inside the vectorizer.
    """
    return " ".join(str(row[feature]) for feature in features)


df["combined_features"] = df.apply(combine_features, axis=1)

print(df["combined_features"].head())

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2           spy based on novel secret agent sequel mi6
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object


In [115]:
# Show the combined text next to several other descriptive columns so the
# result of Step 3 can be checked by eye.
df[['tagline','cast','genres','director','combined_features']]

Unnamed: 0,tagline,cast,genres,director,combined_features
0,Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron,culture clash future space war space colony so...
1,"At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski,ocean drug abuse exotic island east india trad...
2,A Plan No One Escapes,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes,spy based on novel secret agent sequel mi6
3,The Legend Ends,Christian Bale Michael Caine Gary Oldman Anne ...,Action Crime Drama Thriller,Christopher Nolan,dc comics crime fighter terrorist secret ident...
4,"Lost in our world, found in another.",Taylor Kitsch Lynn Collins Samantha Morton Wil...,Action Adventure Science Fiction,Andrew Stanton,based on novel mars medallion space travel pri...
...,...,...,...,...,...
4798,"He didn't come looking for trouble, but troubl...",Carlos Gallardo Jaime de Hoyos Peter Marquardt...,Action Crime Thriller,Robert Rodriguez,united states\u2013mexico barrier legs arms pa...
4799,A newlywed couple's honeymoon is upended by th...,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...,Comedy Romance,Edward Burns,
4800,,Eric Mabius Kristin Booth Crystal Lowe Geoff G...,Comedy Drama Romance TV Movie,Scott Smith,date love at first sight narration investigati...
4801,A New Yorker in Shanghai,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...,,Daniel Hsia,


In [116]:
## Step 4: Create count matrix from this new combined column
# Fit the vectorizer on the combined text and, in the same call, turn every
# movie's text into a row of token counts.
cv = CountVectorizer()
count_matrix = cv.fit_transform(raw_documents=df["combined_features"])

In [117]:
# Dense view of the token-count matrix, for inspection only.
# NOTE(review): toarray() materializes the whole matrix in memory just to show
# a truncated repr; count_matrix.shape or a small row slice may be enough.
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [118]:
## Step 5: Compute the Cosine Similarity based on the count_matrix
# Title of the movie the recommendations are based on.
movie_user_likes = "Cars"

# Similarity of every row of count_matrix to every other row.
cosine_sim = cosine_similarity(X=count_matrix)

In [119]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)

# Pair each similarity score in the liked movie's row with its position,
# giving a list of (position, score) tuples.
similar_movies = [
    (position, score)
    for position, score in enumerate(cosine_sim[movie_index])
]

In [120]:
## Step 7: Get a list of similar movies in descending order of similarity score
# Sort a copy so similar_movies itself keeps its original order; each entry is
# a (position, score) pair and the score is the sort key.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [121]:
## Step 8: Print titles of first 50 movies
# Slicing yields exactly 50 entries (a manual counter that breaks on
# `i > 50` emits 51). Each entry is a (position, score) pair; only the
# position is needed to look up the title.
# NOTE(review): the liked movie itself is part of this list (it is the first
# title in the recorded output) - skip it if only other movies are wanted.
for movie_position, _score in sorted_similar_movies[:50]:
    print(get_title_from_index(movie_position))

Cars
The Fast and the Furious: Tokyo Drift
The Final Destination
2 Fast 2 Furious
Furious 7
Days of Thunder
Herbie Fully Loaded
Back to the Future Part II
The Transporter
Death Race
Gone in Sixty Seconds
Here On Earth
The Fast and the Furious
Turbo
Sorcerer
American Graffiti
Cars 2
Princess Mononoke
Back to the Future
Speed Racer
Dragonfly
Dawn of the Dead
Are We There Yet?
Angel Eyes
The Woman Chaser
Premonition
Need for Speed
Driven
The Siege
Hard Rain
28 Days
Get Carter
Breakdown
The Transporter Refueled
The Bank Job
Carlos
Soul Survivors
Christmas Eve
Horrible Bosses 2
The Animal
Armored
Pretty Woman
If I Stay
Grease
As Above, So Below
Holy Motors
Speedway Junky
Fast Five
Nine
xXx
Back to the Future Part III
