In [563]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [564]:
def get_title_from_index(i):
    """Return the title of the movie stored at row position ``i`` of ``df``."""
    # Positional (not label-based) lookup on the Title column of the
    # module-level frame.
    return df["Title"].iloc[i]


def get_index_from_title(title):
    """Return the row position in ``df`` of the movie called ``title``.

    The result is a 0-based position (not an index label), so it can be
    passed to ``get_title_from_index`` and used to select a row of a matrix
    that was built from ``df`` in row order.

    An exact match on the Title column is tried first. If there is none, the
    comparison is repeated ignoring case and surrounding whitespace. When
    several rows match, the first one wins.

    If nothing matches, 0 is still returned (the historical fallback), but a
    ``UserWarning`` is emitted: without it the caller silently ends up
    working with the first movie in the table instead of the requested one.
    """
    import warnings

    titles = df["Title"]

    # Positions of the rows whose title equals the query exactly.
    hits = (titles == title).to_numpy().nonzero()[0]

    if hits.size == 0:
        # Second chance: tolerate differences in case / stray whitespace.
        wanted = str(title).strip().casefold()
        normalised = titles.astype(str).str.strip().str.casefold()
        hits = (normalised == wanted).to_numpy().nonzero()[0]

    if hits.size == 0:
        warnings.warn(
            f"Title {title!r} not found; falling back to row 0",
            UserWarning,
            stacklevel=2,
        )
        return 0  # if not found

    return int(hits[0])
    
    
# Actors is included twice so that a shared cast weighs more than a shared
# genre or similar description words.
def concat_columns(row):
    """Build one movie's feature text: Actors, Actors, Description, Genre, separated by single spaces."""
    pieces = (row['Actors'], row['Actors'], row['Description'], row['Genre'])
    return " ".join(pieces)

**Step 1: Import data set file**

In [565]:
# Load the movie table from a CSV file given by a relative path, i.e. it is
# looked up in the notebook's working directory.
df = pd.read_csv('IMDBMovies.csv')
# Preview the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0


**Step 2: Defining the FEATURES**

_We select the columns which may be relevant to finding a similar movie,
then concatenate all of those columns into one new column that holds all the data as one long string._

In [566]:
# Only the text columns read by concat_columns() need '' in place of NaN
# (string concatenation fails on NaN). Filling the whole frame would also
# write '' into numeric columns and turn them into mixed object columns.
# Assigning the result (instead of inplace=True) keeps the cell safe to re-run.
text_columns = ['Actors', 'Description', 'Genre']
df[text_columns] = df[text_columns].fillna('')

# df.apply will run the concat_columns() func on each row (axis=1)
df['features'] = df.apply(concat_columns, axis=1)

**Step 3: Creating a count matrix model from this new combined column**

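_the count matrix has one row per movie and one column per word found in the FEATURES column; each cell holds how many times that word appears in that movie's features. It is the input for the similarity table built in the next step, where each Movie x Movie combination gets a score based on the text similarity of their FEATURES._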

In [567]:
# CountVectorizer with default settings. fit_transform receives the combined
# 'features' text of every movie; its result is kept as count_matrix and is
# the input of the cosine-similarity step.
model = CountVectorizer()
count_matrix = model.fit_transform(df['features'])


**Step 4: Computing the Cosine Similarity based on the count_matrix**

In [568]:
# Pairwise cosine similarity computed from count_matrix alone, i.e. its rows
# are compared against each other. As the displayed output shows, the result
# is symmetric with 1.0 on the diagonal (every movie compared with itself).
cosine_sim = cosine_similarity(count_matrix)
# Display the matrix (numpy abbreviates large arrays).
cosine_sim

array([[1.        , 0.14762035, 0.09615088, ..., 0.08858233, 0.14729194,
        0.05579525],
       [0.14762035, 1.        , 0.09304842, ..., 0.05143445, 0.1069045 ,
        0.05399492],
       [0.09615088, 0.09304842, 1.        , ..., 0.05025189, 0.08703883,
        0.035169  ],
       ...,
       [0.08858233, 0.05143445, 0.05025189, ..., 1.        , 0.13471506,
        0.05832118],
       [0.14729194, 0.1069045 , 0.08703883, ..., 0.13471506, 1.        ,
        0.06060915],
       [0.05579525, 0.05399492, 0.035169  , ..., 0.05832118, 0.06060915,
        1.        ]])

**Step 5: Using the model to get movie recommendations**

In [569]:
# Movie we want recommendations for, and its row position in the data.
# NOTE: get_index_from_title() returns 0 when the title is not found, so a
# first pair of (0, 1.0) in the preview below can mean the lookup failed.
similar_to_name = "Mr. Brooks"
similar_to_i = get_index_from_title(similar_to_name)

# Row `similar_to_i` of cosine_sim holds this movie's similarity to every
# movie, one column per movie. Each score is paired with its column position
# as a (position, similarity) tuple so the movie can still be identified once
# the list is re-ordered.
similar_movies = [
    (movie_i, score) for movie_i, score in enumerate(cosine_sim[similar_to_i])
]
similar_movies[:6]

[(0, 1.0),
 (1, 0.14762034939153684),
 (2, 0.0961508829696314),
 (3, 0.07725290582613706),
 (4, 0.15905106915879738),
 (5, 0.16147947019964579)]

In [570]:
# Rank the (position, similarity) pairs from most to least similar.
# Every pair keeps its row position, which is what the title lookup needs.
sorted_similar_movies = sorted(
    similar_movies,
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_similar_movies[:6]

[(0, 1.0),
 (362, 0.42477450887985435),
 (48, 0.35607368025665226),
 (728, 0.306810993379327),
 (384, 0.2983002696067465),
 (541, 0.28241940061982396)]

In [571]:
# printing out top 10 recommendations
# The queried movie is dropped by its row position instead of assuming it is
# ranked first: with ties at the top score (e.g. another movie with identical
# features) or an all-empty feature text, it is not guaranteed to be the
# first entry, and a plain [1:11] slice would then skip the wrong movie.
top_matches = [
    movie_i for movie_i, _score in sorted_similar_movies if movie_i != similar_to_i
][:10]
for rank, movie_i in enumerate(top_matches, start=1):
    print(f"{rank}: {get_title_from_index(movie_i)}")
 

1: Star Trek Into Darkness
2: Star Trek Beyond
3: The A-Team
4: The Lego Movie
5: Limitless
6: Avengers: Age of Ultron
7: American Sniper
8: Joy
9: Star Trek
10: The Magnificent Seven
