## Implementing a simple content-based recommendation system

# Importing all the necessary libraries

In [286]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import chain

## Reading the dataset

In [47]:
# Load the Netflix titles CSV (path is relative to the notebook's working directory).
df = pd.read_csv('netflix_titles.csv')

## Exploring the data 

In [321]:
# Preview the last rows of the frame.
# NOTE(review): the saved output already contains 'combined_features', a column
# that is only created further down, so this cell was last run out of order.
df.tail()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,combined_features
6229,80000063,TV Show,Red vs. Blue,,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","[parody, firstperson, shooter, games, military...","TV Show Red vs. Blue Burnie Burns, Jason Sald..."
6230,70286564,TV Show,Maron,,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"[Marc, Maron, stars, Marc, Maron, interviews, ...","TV Show Maron Marc Maron, Judd Hirsch, Josh B..."
6231,80116008,Movie,Little Baby Bum: Nursery Rhyme Friends,,,,,2016,,60 min,Movies,"[Nursery, rhymes, original, music, children, a...",Movie Little Baby Bum: Nursery Rhyme Friends ...
6232,70281022,TV Show,A Young Doctor's Notebook and Other Stories,,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","[Set, Russian, Revolution, comic, miniseries, ...",TV Show A Young Doctor's Notebook and Other St...
6233,70153404,TV Show,Friends,,"Jennifer Aniston, Courteney Cox, Lisa Kudrow, ...",United States,,2003,TV-14,10 Seasons,"Classic & Cult TV, TV Comedies","[hit, sitcom, follows, merry, misadventures, s...","TV Show Friends Jennifer Aniston, Courteney C..."


In [49]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


In [50]:
# Count the missing values in each column.
df.isnull().sum()

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64

###### There appear to be null values in the columns 'director', 'cast', 'country', 'date_added' and 'rating'. We will replace them with an empty string.

In [51]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later. Reassigning the result instead of using inplace=True
# follows the recommended pandas style and keeps the statement chainable.
df = df.fillna('')

In [52]:
# Re-check the per-column missing-value counts after filling.
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

##### The features that I'm choosing are ['type', 'title', 'director', 'cast', 'listed_in', 'description'].

###### Removing punctuation and unnecessary 'stopwords' from the column 'description'

In [98]:
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

# language name -> frozenset of stop words, filled on first use so the NLTK
# corpus is read once instead of once per word.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return NLTK's stop-word list for `language` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def cleanup(text):
    """Strip punctuation and English stop words from `text`.

    Parameters
    ----------
    text : str or list/tuple of str
        Raw text. A list/tuple of tokens (for example the output of a
        previous `cleanup` call) is joined with spaces first, so applying
        the function twice does not glue the tokens into a single word.

    Returns
    -------
    list of str
        The remaining words, in their original order and casing.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(text)
    # removing punctuation
    no_punc = text.translate(_STRIP_PUNCTUATION)
    # removing the stopwords (the comparison is case-insensitive)
    stop_words = _stopword_set()
    return [word for word in no_punc.split() if word.lower() not in stop_words]

In [104]:
# Replace each description with the token list returned by cleanup().
# NOTE: this overwrites the raw text, so re-running this cell passes the token
# lists (not the original strings) to cleanup().
df['description'] = df['description'].apply(cleanup)

###### Now we're combining all the relevant features and storing them in a single column.

In [111]:
def features(row):
    """Combine the selected text fields of `row` into one space-separated string.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'type', 'title', 'director', 'cast', 'listed_in' as
        strings and 'description' as either a string or a list/tuple of tokens.

    Returns
    -------
    str
        The fields in the order type, title, director, cast, listed_in,
        description, separated by single spaces.
    """
    description = row['description']
    if isinstance(description, (list, tuple)):
        # Join the tokens instead of embedding the list repr
        # ("['a', 'b']") with its brackets, quotes and commas.
        description = ' '.join(map(str, description))
    else:
        description = str(description)
    return ' '.join([
        row['type'],
        row['title'],
        row['director'],
        row['cast'],
        row['listed_in'],
        description,
    ])

In [112]:
# Apply features() to every row (axis=1) and store the combined text in a new column.
df['combined_features'] = df.apply(features, axis=1)

###### Using the CountVectorizer function on the combined_features column to get a 'count_matrix' that contains the word counts of each row

In [116]:
# Fit a CountVectorizer with its default settings on the combined text and
# build the word-count matrix: one row per title, in the same order as df.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

###### Now calculating the cosine similarity of all the rows.
The resulting matrix of cosine similarities is stored in the 'cos_sim' variable.

In [312]:
# Pairwise cosine similarity between all rows of count_matrix:
# cos_sim[i][j] is the similarity between title i and title j.
cos_sim = cosine_similarity(count_matrix)

###### Functions to get the movie name from its index value (get_title_from_index) and to get the index from the title (get_index_from_title)

In [260]:
def get_index_from_title(name):
    """Return the index labels of every row of `df` whose title equals `name`.

    The result is an array: empty when no title matches exactly, and holding
    several labels when more than one row carries the same title.
    """
    title_matches = df['title'] == name
    return df.index[title_matches].values
def get_title_from_index(m_index):
    """Return, as an array, the title(s) of the rows of `df` whose index label is `m_index`.

    The array is empty when no row has that label.
    """
    row_selector = df.index == m_index
    return df.loc[row_selector, 'title'].values

In [338]:
# Title on which the recommendation will be calculated and displayed.
movie_name = 'Raees'

# Index label(s) of the rows carrying that title (an array).
movie_index = get_index_from_title(movie_name)

# cos_sim[movie_index] holds one row of scores per matching label; flatten it
# and pair every score with its position: (position, similarity score).
similar_movies = list(enumerate(chain.from_iterable(cos_sim[movie_index])))

# Highest similarity first.
sorted_movies = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

###### Displaying the recommendation using the index from each tuple associated with the similarity score and fetching the movie name using that index as a reference in the original df

In [339]:
#displaying the top 10 suggestions
# Print the first 11 entries of the ranking: in the saved output this is the
# query title itself followed by 10 suggestions.
for movie in sorted_movies[:11]:
    print(get_title_from_index(movie[0]))

['Raees']
['My Next Guest with David Letterman and Shah Rukh Khan']
['Zero']
['Phantom']
['Mumbai Cha Raja']
['Chaahat']
['Don 2']
['Kisaan']
['Dil Chahta Hai']
['Talaash']
['Asoka']


In [343]:
def suggest(name, top_n=10):
    """Print the title `name` followed by its `top_n` most similar titles.

    Parameters
    ----------
    name : str
        Title to base the recommendation on. It must match a title in `df`
        exactly, because the lookup uses an equality comparison.
    top_n : int, optional
        Number of suggestions printed after the best-ranked entry
        (normally the title itself). Defaults to 10.

    Returns
    -------
    None
        The titles are printed, one per line. When no title matches, a short
        message is printed instead.
    """
    matches = get_index_from_title(name)
    if len(matches) == 0:
        print(f"No title named {name!r} was found.")
        return

    # Several rows can share one title. Indexing cos_sim with all of them gives
    # one score row per match, and flattening those rows would produce
    # positions beyond the catalogue, so only the first match is used.
    scores = np.asarray(cos_sim[matches[0]]).ravel()

    # Stable sort on the negated scores: highest similarity first, ties kept
    # in their original catalogue order.
    ranked = np.argsort(-scores, kind='stable')

    # Displaying the best-ranked entry plus `top_n` suggestions.
    for position in ranked[:top_n + 1]:
        print(get_title_from_index(position))
# Ask for a title and print its recommendations. The title has to match exactly.
# Titles to try: Narcos, Stranger Things, Friends, Little Things, Special 26, Naam Shabana, Raees
m_name = input("What was the last movie or TV Show you watched?: ")
suggest(m_name)

What was the last movie or TV Show you watched?: Narcos
['Narcos']
['Narcos: Mexico']
['Wild District']
['Luis Miguel - The Series']
['El Cartel']
['La Reina del Sur']
['Sin senos no hay paraíso']
['La Viuda Negra']
['Rosario Tijeras']
['Dueños del paraíso']
['El Clon']
