In [16]:
import pandas as pd
from scipy.spatial.distance import cosine

## Reading the IMDb Top 250 movie dataset

In [31]:
# Fields in this file are separated by '#'.
movie_data = pd.read_csv("imdb-top250.csv", sep="#")
movie_data.head(2)

Unnamed: 0,Id,Title,Year,Runtime,Languege,Country,Genre,Director,Writer,Actors,Production,IMDB_Rating
0,111161,The Shawshank Redemption,1994,142,English,USA,Crime|Drama,Frank Darabont,Stephen King|Frank Darabont,Tim Robbins|Morgan Freeman|Bob Gunton|William ...,Castle Rock Entertainment,9.3
1,68646,The Godfather,1972,175,English|Italian|Latin,USA,Crime|Drama,Francis Ford Coppola,Mario Puzo|Francis Ford Coppola|Mario Puzo,Marlon Brando|Al Pacino|James Caan|Richard S. ...,Paramount Pictures|Alfran Productions,9.2


## Generating the binary genre features for the movie dataset

In [22]:
def get_all_genres(genres):
    """Collect the distinct genre names found in a '|'-separated genre column.

    Parameters
    ----------
    genres : pandas.Series
        One string per movie, with genres joined by '|' (e.g. "Crime|Drama").

    Returns
    -------
    list of str
        Every distinct genre, sorted alphabetically so the result (and any
        columns derived from it) comes out in the same order on every run.
    """
    unique_genres = set()
    for entry in genres.tolist():
        unique_genres.update(entry.split('|'))
    # list(set) has a hash-dependent order that can change between kernel
    # restarts; sorting makes the output reproducible.
    return sorted(unique_genres)
    
def generate_featured_dataset(movie_data):
    """Return a copy of ``movie_data`` with one 0/1 indicator column per genre.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must contain a ``Genre`` column whose cells hold genre names joined
        by '|' (e.g. "Crime|Drama").

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns followed by one integer column
        per distinct genre; a cell is 1 when the movie lists that genre and 0
        otherwise. The input frame is not modified.
    """
    # str.get_dummies splits every cell on '|' and builds the whole 0/1 matrix
    # in one vectorised pass. It replaces the per-cell DataFrame.set_value
    # calls, which no longer exist in current pandas releases.
    genre_flags = movie_data['Genre'].str.get_dummies(sep='|')

    # assign() returns a new object, so the caller's frame stays untouched,
    # and the new columns are placed after all existing ones.
    return movie_data.assign(
        **{genre: genre_flags[genre] for genre in genre_flags.columns}
    )

## Some preprocessing and cleaning of the dataset

In [26]:
# Add the genre indicator columns, keep only the last row for each title,
# then lower-case the titles.
movie_data_featured = (
    generate_featured_dataset(movie_data)
    .drop_duplicates(subset='Title', keep="last")
    .assign(Title=lambda frame: frame['Title'].str.lower())
)

## Get subset of movies which have common features with the queried movie

In [28]:
def get_movies_by_common_actors(vec1, data):
    """Select the rows of ``data`` that share at least one actor with ``vec1``.

    Parameters
    ----------
    vec1 : pandas.Series
        Row of the queried movie; its ``Actors`` field is a '|'-joined string.
    data : pandas.DataFrame
        Movies to search; each ``Actors`` cell is a '|'-joined string.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, column order and
        dtypes. Empty (same columns) when nothing matches.
    """
    query_actors = set(vec1.Actors.split('|'))
    # One boolean per row instead of growing a frame with DataFrame.append,
    # which was removed in pandas 2.0 and copied the frame on every hit.
    shares_actor = data['Actors'].map(
        lambda cell: not query_actors.isdisjoint(cell.split('|'))
    )
    # astype(bool) keeps the mask valid when ``data`` has no rows.
    return data[shares_actor.astype(bool)]
        
def get_movies_by_common_directors(vec1, data):
    """Select the rows of ``data`` that share at least one director with ``vec1``.

    Parameters
    ----------
    vec1 : pandas.Series
        Row of the queried movie; its ``Director`` field is a '|'-joined string.
    data : pandas.DataFrame
        Movies to search; each ``Director`` cell is a '|'-joined string.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, column order and
        dtypes. Empty (same columns) when nothing matches.
    """
    query_directors = set(vec1.Director.split('|'))
    # One boolean per row instead of growing a frame with DataFrame.append,
    # which was removed in pandas 2.0 and copied the frame on every hit.
    shares_director = data['Director'].map(
        lambda cell: not query_directors.isdisjoint(cell.split('|'))
    )
    # astype(bool) keeps the mask valid when ``data`` has no rows.
    return data[shares_director.astype(bool)]

def get_movies_by_common_writer(vec1, data):
    """Select the rows of ``data`` that share at least one writer with ``vec1``.

    Parameters
    ----------
    vec1 : pandas.Series
        Row of the queried movie; its ``Writer`` field is a '|'-joined string.
    data : pandas.DataFrame
        Movies to search; each ``Writer`` cell is a '|'-joined string.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, column order and
        dtypes. Empty (same columns) when nothing matches.
    """
    query_writers = set(vec1.Writer.split('|'))
    # One boolean per row instead of growing a frame with DataFrame.append,
    # which was removed in pandas 2.0 and copied the frame on every hit.
    shares_writer = data['Writer'].map(
        lambda cell: not query_writers.isdisjoint(cell.split('|'))
    )
    # astype(bool) keeps the mask valid when ``data`` has no rows.
    return data[shares_writer.astype(bool)]

def get_movies_by_common_production(vec1, data):
    """Select the rows of ``data`` that share a production company with ``vec1``.

    Parameters
    ----------
    vec1 : pandas.Series
        Row of the queried movie; its ``Production`` field is a '|'-joined
        string.
    data : pandas.DataFrame
        Movies to search; each ``Production`` cell is a '|'-joined string.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels, column order and
        dtypes. Empty (same columns) when nothing matches.
    """
    query_companies = set(vec1.Production.split('|'))
    # One boolean per row instead of growing a frame with DataFrame.append,
    # which was removed in pandas 2.0 and copied the frame on every hit.
    shares_company = data['Production'].map(
        lambda cell: not query_companies.isdisjoint(cell.split('|'))
    )
    # astype(bool) keeps the mask valid when ``data`` has no rows.
    return data[shares_company.astype(bool)]

## Finding similar movies based on vector similarity

In [29]:
def similarity(row, vec2):
    """Return the cosine similarity of two vectors.

    scipy's ``cosine`` yields a distance, so the similarity is its
    complement with respect to 1.
    """
    distance = cosine(row, vec2)
    return 1 - distance

def recommand_movies(movie_data, movie):
    """Rank movies that share cast, crew or production with ``movie``.

    Candidates are the rows of ``movie_data`` that have at least one actor,
    director, writer or production company in common with the queried movie.
    Each candidate is scored by the cosine similarity between its feature
    vector and the queried movie's feature vector.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Featured movie table. ``Title`` is compared against the lower-cased
        query, and the columns from position 11 onward are used as the
        numeric feature vector.
    movie : str
        Title to look up; it is lower-cased before matching.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``similarity``, sorted in descending order by
        similarity and then title. The queried movie itself is excluded.
        Empty when no other movie shares any of the compared fields.

    Raises
    ------
    IndexError
        If no row of ``movie_data`` has the requested title.
    """
    # Position of the first column used as a similarity feature.
    # NOTE(review): assumes the 11 leading columns are non-feature metadata;
    # confirm against the CSV layout if columns are added or reordered.
    first_feature_col = 11

    movie = movie.lower()
    title_hits = movie_data[movie_data.Title == movie]
    if title_hits.empty:
        # Same exception type the bare .iloc[0] lookup raised, with a message
        # that names the missing title.
        raise IndexError("movie %r not found in movie_data" % movie)
    vec1 = title_hits.iloc[0]

    # The fourth lookup uses the production matcher; it previously repeated
    # the writer matcher, so production companies were never considered.
    movie_matched = pd.concat([
        get_movies_by_common_actors(vec1, movie_data),
        get_movies_by_common_directors(vec1, movie_data),
        get_movies_by_common_writer(vec1, movie_data),
        get_movies_by_common_production(vec1, movie_data),
    ])

    ### A movie can match on several fields; keep one row per title
    movie_matched = movie_matched.drop_duplicates(subset='Title', keep="last")

    ### The queried movie always matches itself; drop it from the results
    movie_matched = movie_matched[movie_matched['Title'] != movie]

    if movie_matched.empty:
        # Nothing to score: return the documented shape without calling cosine.
        return pd.DataFrame({
            "Title": pd.Series(dtype=object),
            "similarity": pd.Series(dtype=float),
        })

    ### Features used to calculate similarity; cast to float because a row
    ### pulled out of a mixed-type frame has object dtype
    features = movie_matched.iloc[:, first_feature_col:].astype(float)
    vec2 = vec1.iloc[first_feature_col:].astype(float)

    sim = features.apply(similarity, args=(vec2,), axis=1)

    data = pd.DataFrame({"Title": movie_matched.Title, "similarity": sim})
    ### Sort according to similarity, ties broken by title (both descending)
    data = data.sort_values(['similarity', 'Title'], ascending=False)

    return data

## Run a few examples to see how it works

In [30]:
### Queried movie: uncomment one of the titles below to try a different query
# movie = "Pulp Fiction"
# movie = "Shichinin No Samurai"
# movie = "The Godfather"
# movie = "Interstellar"
movie = "3 idiots"
# movie = "Dangal"

data = recommand_movies(movie_data_featured, movie)
data.head(5)

Unnamed: 0,Title,similarity
249,pk,0.992848
234,munna bhai m.b.b.s.,0.992848
79,taare zameen par,0.986374
175,rang de basanti,0.986183
70,dangal,0.973787
