In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import string
from nltk.corpus import stopwords

In [8]:
# Load the dataset
data = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\netflix_recommendation\netflixData.csv")
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [9]:
# Select relevant columns
data = data[["Title", "Description", "Content Type", "Genres"]]
print(data.head())


                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [10]:
# Drop rows with missing values
data = data.dropna()
print("Data after dropping NaNs:")
print(data.head())

Data after dropping NaNs:
                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [11]:
# Download NLTK stopwords
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Function to clean text
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [39]:
# Clean the Title and Genres columns
data["Title"] = data["Title"].apply(clean)
data["Genres"] = data["Genres"].apply(clean)
print("Data after cleaning:")
print(data.head())

Data after cleaning:
                       Title  \
0                      unwel   
1                       aliv   
2  annefrank  parallel stori   
3                    blackaf   
4               catsthemewvi   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                             Genres  
0                        realiti tv  
1  horror movi intern movi thriller  
2           documentari intern movi  
3                         tv comedi  
4           documentari intern movi  


In [18]:
# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words="english")

In [19]:
# Fit and transform the vectorizer on the Genres column
tfidf_matrix = tfidf.fit_transform(data["Genres"].tolist())
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (5967, 39)


In [20]:
# Compute the cosine similarity matrix
similarity = cosine_similarity(tfidf_matrix)
print("Cosine similarity matrix shape:", similarity.shape)

Cosine similarity matrix shape: (5967, 5967)


In [21]:
# Create a Series to map titles to their indices
indices = pd.Series(data.index, index=data['Title']).drop_duplicates()
print("Indices Series:")
print(indices.head())

Indices Series:
Title
unwel                        0
aliv                         1
annefrank  parallel stori    2
blackaf                      3
catsthemewvi                 4
dtype: int64


In [22]:
# Function to get Netflix recommendations
def netFlix_recommendation(title, similarity=similarity):
    if title not in indices:
        return f"Title '{title}' not found in the dataset."
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[:10]
    movie_indices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movie_indices]

In [40]:
# Example: Get recommendations for a specific title
recommendations = netFlix_recommendation("girlfriend")
print("Recommendations for 'girlfriend':")
print(recommendations)

Recommendations for 'girlfriend':
3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                   chappel show
Name: Title, dtype: object
