# Recommendation Engine using Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Functions to fetch the index of a movie, given its title, and vice-versa. 

In [2]:
def get_title_from_index(index):
    """Look up a movie title by its row label in the global ``df``.

    Args:
        index: Row label to search for in ``df.index``.

    Returns:
        The ``"title"`` value of the first row whose label equals ``index``.

    Raises:
        IndexError: If no row of ``df`` carries that label.
    """
    label_matches = df.index == index
    matching_titles = df.loc[label_matches, "title"].values
    # Take the first hit; an empty result makes this lookup raise IndexError.
    return matching_titles[0]

def get_index_from_title(title):
    """Return the row label of the first movie in ``df`` titled exactly ``title``.

    The label is taken from ``df.index`` (the same labels that
    ``get_title_from_index`` matches against), so the two helpers are inverses
    of each other and no separate ``"index"`` column is required in the data.

    Args:
        title: Movie title to look up; the comparison is exact and
            case-sensitive.

    Returns:
        The label of the first matching row.

    Raises:
        IndexError: If no movie in ``df`` has that title.
    """
    matching_labels = df.index[df["title"] == title]
    if len(matching_labels) == 0:
        raise IndexError(f"No movie titled {title!r} found in the dataset")
    # NOTE(review): callers use this value to index the similarity matrix, which
    # assumes the default RangeIndex (label == row position) -- confirm.
    return matching_labels[0]

### 1) The features were decided on the basis of external information regarding movie suggestions.
### 2) The function below combines all of the selected features into a single string for vectorization.

In [3]:
# Columns used to describe each movie.
features = [
    'keywords',
    'cast',
    'genres',
    'director',
]

# Load the movie metadata; the path is relative to the working directory.
df = pd.read_csv("movie_dataset.csv")

def combine_features(row, feature_names=None):
    """Join the selected feature values of one movie into a single string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column name.
        feature_names: Column names to combine, in order. Defaults to the
            module-level ``features`` list so that list stays the single
            place where the feature set is defined.

    Returns:
        The feature values converted to text and separated by single spaces.
    """
    if feature_names is None:
        feature_names = features
    return " ".join(str(row[name]) for name in feature_names)

### 1) Removing np.nan values from the movies dataset (A good reminder that we must always check the dataset, no matter how trusted the source may be).
### 2) `DataFrame.apply` with `axis=1` is used to apply a function to every row of a pandas DataFrame.

In [4]:
# Replace missing values in all feature columns with an empty string in a
# single assignment, so no missing value is formatted into the combined text.
df[features] = df[features].fillna('')

# Build the per-movie text by calling combine_features on each row (axis=1).
df["combined features"] = df.apply(combine_features, axis=1)

### The most important part of the program, the algorithm itself:
#### 1) Fit-transform the combined features column to vectorize the string
#### 2) Using the cosine similarity method for getting a table of similarity between all possible pairs of movies

In [5]:
# Turn each movie's combined text into a bag-of-words count vector.
cv = CountVectorizer()
# Keep the counts sparse: cosine_similarity accepts sparse input and still
# returns a dense array, so densifying with .toarray() only costs memory.
count_matrix = cv.fit_transform(df['combined features'])
# cos_sim[i][j] is the cosine similarity between movie i and movie j.
cos_sim = cosine_similarity(count_matrix)
# Title of the movie the recommendations are based on.
movie_user_likes = "Avatar"

### After getting the similarity table, we need to:
#### 1) Get the index of the movie that the user likes
#### 2) Retrieve the row from the cos_sim matrix corresponding to the index value
#### 3) Enumerate the row so that each similarity score is paired with its movie index
#### 4) Sort this list by similarity score, greatest first
#### 5) Print the 50 most similar movies

In [6]:
# Row of the liked movie in the similarity matrix.
index = get_index_from_title(movie_user_likes)

# Pair every similarity score in that row with its movie index.
similar_movies = [(movie_idx, score) for movie_idx, score in enumerate(cos_sim[index])]

# Order a copy by score, highest first; the sort is stable, so tied scores
# keep their original index order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)


In [7]:
TOP_N = 50  # how many recommendations to print

# Look the liked movie up again instead of reusing `index`, so this cell
# neither depends on nor overwrites the variable set by the previous cell.
liked_movie_index = get_index_from_title(movie_user_likes)

# Drop the liked movie by index rather than by position: a movie with an
# identical feature vector can tie with it and sort ahead of it.
recommended_indices = [
    movie_index
    for movie_index, _score in sorted_similar_movies
    if movie_index != liked_movie_index
][:TOP_N]

for movie_index in recommended_indices:
    print(get_title_from_index(movie_index))

Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien
Lockout
Jason X
The Helix... Loaded
Moonraker
Planet of the Apes
Galaxy Quest
Gravity
Alien³
Jupiter Ascending
The Wolverine
Silent Running
Zathura: A Space Adventure
Trekkies
Cargo
Wing Commander
Star Trek
Lost in Space
Babylon A.D.
The Fifth Element
Oblivion
Titan A.E.
AVP: Alien vs. Predator
The Empire Strikes Back
Dragonball Evolution
Superman Returns
Divergent
John Carter
The Black Hole
The Ice Pirates
Memoirs of an Invisible Man
Starship Troopers
The Astronaut's Wife
Machete Kills
Soldier
The Abyss
Damnation Alley
Men in Black
Space Cowboys
Space Dogs
The Time Machine
Sheena
Captain America: Civil War
Star Trek: Insurrection
Oz: The Great and Powerful
