   #                      Content Based Movie Recommendation

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer  #for converting a collection of words to a matrix of token counts
from sklearn.metrics.pairwise import cosine_similarity  #for calculating similarity  between two vectors. (counted word vectors in this case)

First, we load the data frame and look at the columns of it to decide the features we will use 

In [2]:
df=pd.read_csv("movie_recommendation_dataset/movie_dataset.csv")
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [3]:
important_features=["genres","keywords","cast","director"]  #we will recommend movies according to similarity betwenn these features

Preparing and cleaning the data:

In [4]:
df.replace(',',' ', regex=True, inplace=True) #replace commas with white space
df.replace(':','', regex=True, inplace=True)  #deleting colons so we dont need to write them when we enter a name of a movie (Since they are hard to remember all the time)
df.replace("'",'', regex=True, inplace=True)  #deleting apostrophes so we dont need to write them (Since they are hard to remember all the time)

df = df.applymap(lambda s:s.lower() if type(s) == str else s) #lowering all letters
for feature in df:                                            #filling NaNs with empty string
    df[feature]=df[feature].fillna("")
    

In [5]:
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,action adventure fantasy science fiction,http//www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,avatar,in the 22nd century a paraplegic marine is di...,150.437577,...,162,"[{""iso_639_1"" ""en"" ""name"" ""english""} {""iso_6...",released,enter the world of pandora.,avatar,7.2,11800,sam worthington zoe saldana sigourney weaver s...,[{name stephen e. rivkin gender 0 department...,james cameron
1,1,300000000,adventure fantasy action,http//disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,pirates of the caribbean at worlds end,captain barbossa long believed to be dead ha...,139.082615,...,169,"[{""iso_639_1"" ""en"" ""name"" ""english""}]",released,at the end of the world the adventure begins.,pirates of the caribbean at worlds end,6.9,4500,johnny depp orlando bloom keira knightley stel...,[{name dariusz wolski gender 2 department ca...,gore verbinski
2,2,245000000,action adventure crime,http//www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,spectre,a cryptic message from bond’s past sends him o...,107.376788,...,148,"[{""iso_639_1"" ""fr"" ""name"" ""fran\u00e7ais""} {...",released,a plan no one escapes,spectre,6.3,4466,daniel craig christoph waltz l\u00e9a seydoux ...,[{name thomas newman gender 2 department sou...,sam mendes


Combining features in a string, since we want to use CountVectorizer

In [6]:
def combine_features(row):  
    return row['genres']+" "+row['keywords']+" "+row['cast']+" "+row['director']

In [7]:
df["combined_features"] = df.apply(combine_features,axis=1)  #creating new column with the combined features

In [8]:
df["combined_features"][0]  #example

'action adventure fantasy science fiction culture clash future space war space colony society sam worthington zoe saldana sigourney weaver stephen lang michelle rodriguez james cameron'

In [9]:
count_vec=CountVectorizer().fit_transform(df["combined_features"])  #applying CountVectorizer on our combined_features column

In [10]:
cs=cosine_similarity(count_vec)   #appliying cosine_similarity in order to calculate similarity between each movie

Now we can search for the movie we want to find its similar. For that, we need to find movie's id. But first, we need to find whether the movie's name we search is fully in the data frame, if it is not, suggest user a movie that contains a word that he/she/they entered.

![Content Based Movie Recommendation](/images/flowchartt.png)

In [35]:
title="harry potter"   # title contains the name we want to find its similar
pseudo_list=[]
done=False
#find the movie id
for i in df["original_title"]:
    if i==title:
        movie_id=df[df.original_title==title]["index"].values[0]
        scores=list(enumerate(cs[movie_id]))
    elif (title in i):
        pseudo_list.append(i)
for i in range (0,len(pseudo_list)):
    print("Did you mean ", pseudo_list[i])
    ans=input("yes or no? \n")
    if (ans=="yes"):
            movie_id=df[df.original_title==pseudo_list[0]]["index"].values[0]
            break
    else:
            continue
         

Did you mean  harry potter and the half-blood prince
yes or no? 
no
Did you mean  harry potter and the order of the phoenix
yes or no? 
no
Did you mean  harry potter and the goblet of fire
yes or no? 
yes


In [36]:
#create a list of enumerations for the similar score [(movie_id,similarity score),(...)]
scores=list(enumerate(cs[movie_id]))

In [37]:
#sort list
sorted_score=sorted(scores,key= lambda x:x[1],reverse=True)
sorted_score=sorted_score[1:]   #we dont take index 0 since it is the movie itself

In [38]:
j=0

print("the most 5 recommended movies to",df[df.index==movie_id]["original_title"].values[0],"\n")
for item in sorted_score:
    movie_title=df[df.index==item[0]]["original_title"].values[0]
    print(j+1,movie_title)
    j=j+1
    if j>5:
        break

the most 5 recommended movies to harry potter and the half-blood prince 

1 harry potter and the order of the phoenix
2 harry potter and the goblet of fire
3 harry potter and the philosophers stone
4 harry potter and the prisoner of azkaban
5 harry potter and the chamber of secrets
6 treasure planet
