In [26]:
# Import pandas library and the data set
import pandas as pd
# Load the titles table from a CSV file resolved relative to the current
# working directory. No index_col is passed, so pandas assigns its default
# integer row index.
df = pd.read_csv('netflix_titles.csv')

In [27]:
# Preview the first five rows of the loaded table
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [28]:
# Names of the columns treated as features by the cells below
featured_columns = [
    'type',
    'title',
    'director',
    'cast',
    'country',
    'release_year',
    'rating',
    'listed_in',
    'description',
]

In [29]:
# Replace missing values in the featured columns with a single space so that
# every text column can be concatenated later without hitting NaN.
# The null check guards the fill, so the frame is only rewritten when at least
# one featured value is actually missing.
if df[featured_columns].isnull().values.any():
    # One vectorized fill across all featured columns instead of a Python loop
    df[featured_columns] = df[featured_columns].fillna(' ')

In [30]:
# Show the dtype of each featured column (in featured_columns order)
df.dtypes.loc[featured_columns]

type            object
title           object
director        object
cast            object
country         object
release_year     int64
rating          object
listed_in       object
description     object
dtype: object

In [31]:
# Create a new column combining all the featured columns
def combine_features(row):
    """Concatenate one row's feature values into a single space-separated string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'type', 'title', 'director', 'cast', 'country',
        'release_year', 'rating', 'listed_in' and 'description'. Every value
        except 'release_year' has to be a string already.

    Returns
    -------
    str
        The nine values joined by single spaces, in the order listed above.

    Raises
    ------
    TypeError
        If any value other than 'release_year' is not a string.
    """
    pieces = [
        row['type'],
        row['title'],
        row['director'],
        row['cast'],
        row['country'],
        str(row['release_year']),  # the only value converted to text here
        row['rating'],
        row['listed_in'],
        row['description'],
    ]
    return ' '.join(pieces)
# Apply combine_features to every row and store the result as a new column
df['combined_features'] = df.apply(combine_features, axis='columns')

In [32]:
# Get the cosine similarity matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Turn each combined text into token counts, one row per title
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

# Pairwise cosine similarity between all rows of the count matrix;
# row i holds the similarity of title i to every title (itself included)
cosine_sim = cosine_similarity(count_matrix)

In [33]:
# Write a function to return the index of the movie from its title
def get_index_from_title(title, data=None):
    """Return the row position of the first entry whose title equals `title`.

    The result is a 0-based *position*, not an index label, because it is used
    to index the similarity matrix and with `.iloc`. For a frame with the
    default integer index the two are the same.

    Parameters
    ----------
    title : str
        Exact title to look up (case-sensitive equality match).
    data : pandas.DataFrame, optional
        Frame with a 'title' column to search. Defaults to the notebook-level
        `df` when omitted.

    Returns
    -------
    numpy integer
        Position of the first matching row.

    Raises
    ------
    IndexError
        If no row has exactly this title.
    """
    frame = df if data is None else data
    # Boolean mask -> positions of the True entries
    positions = (frame['title'] == title).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError('No entry with title {!r} was found'.format(title))
    return positions[0]
# Title to base the recommendations on, and its row index in df
my_movie = 'Apaches'
my_movie_index = get_index_from_title(title=my_movie)

In [34]:
# Create an enumerated list of (row position, cosine similarity) pairs for
# the chosen title
scores = list(enumerate(cosine_sim[my_movie_index]))

# Rank all other titles by similarity, highest first. The chosen title is
# excluded by its position rather than by dropping the top entry: sorted() is
# stable, so an earlier row that ties with the self-similarity would sort
# first and the chosen title would remain in its own recommendations.
sorted_scores = sorted(
    (pair for pair in scores if pair[0] != my_movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

In [35]:
# Print the titles of the ten most similar entries, best match first.
# The title is looked up by column name so the output does not depend on
# the position of the 'title' column in the frame.
for row_position, _similarity in sorted_scores[:10]:
    print(df['title'].iloc[row_position])

Fatal Destiny
Killer Ratings
Mexico Diseña
Melodies of Life - Born This Way
Rica, Famosa, Latina
Apache: The Life of Carlos Tevez
Jack Taylor
Four Seasons in Havana
Dancing Angels
Shadow of Truth
