<a href="https://colab.research.google.com/github/sunnylistax25/Netflix_Recommendation_System/blob/main/RecommendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv('netflixData.csv')
print(data.head())


                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [2]:
# Check Null values
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [3]:
data = data[['Title', 'Description', 'Content Type', 'Genres']]
print(data.head())

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [4]:
data = data.dropna()

In [5]:
# Cleaning the Title column

In [6]:
# stopwords are words that does not mean much to a sentense.
# stemmer is an algorithm that can be used to remove suffixes from a word to
# obtain root/stem word.
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer('english')
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words('english'))

# cleaning the strings to remove any unwanted symbols, url, html tag, punctuation, newline, digits.
 
def clean(text):
  text = str(text).lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www.\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  # removing stopwords Using list comprehension.
  text = [word for word in text.split(' ') if word not in stopword]
  text = ' '.join(text)
  # applying stemming to the words using list comprehension.
  text = [stemmer.stem(word) for word in text.split(' ')]
  text = ' '.join(text)
  return text
data['Title'] = data['Title'].apply(clean)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
print(data.Title.sample(10))

4992                     letdown
5492    trevor noah son patricia
4961                last bastion
5326                 worst witch
3241         mumbai delhi mumbai
1781                    generat 
4016                rowdi rathor
2999                      mascot
5731                      legend
1967                  haraamkhor
Name: Title, dtype: object


In [8]:
# Using Genres column as feature to recommend similar content to the user.
# cosine similarity is used here to find the similarities in two document

feature = data['Genres'].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words='english')
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

In [9]:
indices = pd.Series(data.index, index=data['Title']).drop_duplicates()

In [10]:
# Recommendation Function

def netflix_recommendation(title, similarity = similarity):
  index = indices[title]
  similarity_scores = list(enumerate(similarity[index]))
  similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
  similarity_scores = similarity_scores[0:10]
  movieindices = [i[0] for i in similarity_scores]
  return data['Title'].iloc[movieindices]



In [29]:
print(netflix_recommendation('godzilla'))

120                                  lion hous
816                         bright music video
1649                 five came back refer film
1852                                  godzilla
3053                    metallica kind monster
3706               pioneer first women filmmak
3758    pretti guardian sailor moon etern movi
4289                         six window desert
4878                 hate eight extend version
2523                                    khaani
Name: Title, dtype: object


In [39]:
print(netflix_recommendation('letdown'))

89                          boy name flora
532                          bangkok buddi
866                             call agent
873                                   hear
948     check store next door next chapter
1566                           famili busi
1763                          game win hit
1827                           girl hostel
2075                               hjørdis
2107             hong kong west side stori
Name: Title, dtype: object


In [40]:
print(netflix_recommendation('mascot'))

7                          realityhigh
106                futil stupid gestur
191                             accept
331            american pie  girl rule
456    austin power intern man mysteri
513                       bad grandpa 
517                           bad trip
577                           bebe kid
629                      two fern movi
999                    christma surviv
Name: Title, dtype: object
