<a href="https://colab.research.google.com/github/vaishnavi52thade/Netflix-Recommendation-system/blob/main/Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # **Netflix Recommendation System**

---



---



# Import Python libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity


# Import Datasets

In [2]:
data = pd.read_csv("/content/netflixData.csv")
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

# Check whether the data contains null values or not.

In [3]:
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


# The dataset contain null values , but before removing the null values, select the column that use to build a netflix recommendation system.

In [4]:
data = data[["Title", "Description", "Content Type", "Genres"]]
print(data.head())

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [5]:
data = data.dropna()

# Data preparation

In [6]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["Title"] = data["Title"].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
print(data.Title.sample(10))

2365    jerri seinfeld im tell last time
1876                           goosebump
2169                               kalam
4335                         someon like
3669             penguin madagascar movi
536     bangkok love stori object affect
3813                         queen south
5779                           keep aliv
4090                            sarajevo
2650                                léa 
Name: Title, dtype: object


In [11]:
feature = data["Genres"].tolist()

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
indices = pd.Series(data.index,
                    index=data['Title']).drop_duplicates()

# This is how Movies and TV shows on netflix.

In [21]:
# Calculate the TF-IDF matrix and cosine similarity
# Initialize TfidfVectorizer with explicit input='content'
tfidf = text.TfidfVectorizer(stop_words="english", input='content')
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

def netFlix_recommendation(title, similarity=similarity):
    # Stem the input title
    stemmed_title = clean(title)
    # Check if the stemmed title exists
    if stemmed_title not in indices:
        return f"Title '{title}' (stemmed: '{stemmed_title}') not found in the dataset."

    index = indices[stemmed_title]
    similarity_scores = list(enumerate(similarity[index]))
    # Sort by similarity score in descending order
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Skip the first one (the movie itself)
    similarity_scores = similarity_scores[1:11]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices].reset_index(drop=True)

# Example usage
print(netFlix_recommendation("girlfriend"))

0                     washington
1                 arrest develop
2     astronomi club sketch show
3    aunti donna big ol hous fun
4                      big mouth
5                bojack horseman
6                   brew brother
7                       champion
8                  chappell show
9                      communiti
Name: Title, dtype: object
