In [None]:
!unzip /content/netflix_titles.csv.zip

Archive:  /content/netflix_titles.csv.zip
  inflating: netflix_titles.csv      


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
import re

In [None]:
# Read the extracted Netflix catalogue into a DataFrame.
# Replace the path below with your actual dataset file path.
df = pd.read_csv(
    "/content/netflix_titles.csv",
)

In [None]:
# Load the raw catalogue (replace the path with your actual dataset file path).
df = pd.read_csv("/content/netflix_titles.csv")

# Data Cleaning
# Drop exact duplicate rows, then rows missing any of the fields the feature
# steps read. reset_index(drop=True) restores a contiguous 0..n-1 index: the
# matrices built from this frame are addressed by row position, so gaps left
# by dropped rows would make index labels disagree with matrix row numbers.
# Chaining (instead of inplace=True) also keeps the cell safe to re-run.
df = (
    df.drop_duplicates()
      .dropna(subset=['title', 'description', 'listed_in'])
      .reset_index(drop=True)
)

# Text Preprocessing
# Built once and reused by preprocess_text instead of once per processed row.
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalize free text into a space-separated string of stemmed keywords.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    split on whitespace, drop scikit-learn's English stop words, Porter-stem
    each remaining token, and join the tokens with single spaces.

    Args:
        text: The raw string to clean. Non-string values (for example None,
            or the float NaN pandas uses for a missing cell) are treated as
            empty text instead of raising AttributeError.

    Returns:
        The cleaned string; '' when no token survives or the input is not a
        string.
    """
    if not isinstance(text, str):
        return ''

    # Characters are deleted, not replaced by spaces, so "sci-fi" -> "scifi".
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Tokenize on whitespace, filter stop words, stem, and re-join.
    return ' '.join(
        _STEMMER.stem(token)
        for token in letters_only.split()
        if token not in ENGLISH_STOP_WORDS
    )

# Apply text preprocessing to the description column. The result is kept in
# its own Series so `df` is left unmodified and this cell can be re-run.
processed_description = df['description'].apply(preprocess_text)

# Feature Engineering
# Genre one-hot encoding: 'listed_in' is split on ', ' into one list per title.
mlb = MultiLabelBinarizer()
genre_lists = df['listed_in'].apply(lambda x: x.split(', '))
# index=df.index keeps every encoded row attached to its title. Without it the
# new frame gets a fresh 0..n-1 index and pd.concat(axis=1), which aligns on
# index labels, pairs features with the wrong rows (and adds NaN rows)
# whenever the cleaning step removed anything.
genre_encoded = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=df.index,
)

# TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_description)

# Convert TF-IDF matrix to DataFrame. The 'tfidf_' prefix prevents a vocabulary
# term that happens to equal a metadata column name (a description containing
# the word "cast", say) from producing duplicate column labels after concat.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df.index,
).add_prefix('tfidf_')

# Assemble the processed frame under its own name instead of overwriting `df`:
# the recommender cells further down still read df['title'] and
# df['description'], which the original in-place drop removed.
features_df = pd.concat(
    [df.drop(columns=['description', 'listed_in']), genre_encoded, tfidf_df],
    axis=1,
)

# Display the first few rows of the processed DataFrame
print(features_df.head())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  ...  zoom  zordon  \
0  September 25, 2021          2020  PG-13     90 min  ...   0.0     0.0   
1  September 24, 2021          2021  TV-MA  2 Se

In [None]:
# Turn each raw description into a TF-IDF vector (English stop words removed).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['description'])

# Similarity of every title against every other title. With a single argument
# cosine_similarity compares the matrix with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Build the Recommendation Function
def recommend_movies(title, cosine_sim=cosine_sim, df=df, top_n=5):
    """Recommend the titles whose descriptions are most similar to `title`.

    Args:
        title: Exact value to look up in df['title'] (compared with ==).
        cosine_sim: Square similarity matrix; row/column i must correspond to
            the i-th row of `df`.
        df: Catalogue DataFrame with a 'title' column.
        top_n: Maximum number of recommendations; values below 0 are treated
            as 0.

    Returns:
        A pandas Series (taken from df['title']) with up to `top_n` titles,
        most similar first. The queried row itself is never included.

    Raises:
        IndexError: If `title` does not occur in df['title'].
    """
    # Use the row *position*, not the index label: cosine_sim is addressed by
    # position, and labels only coincide with positions on a 0..n-1 index.
    positions = (df['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Title {title!r} not found in df['title']")
    idx = int(positions[0])

    # Pair every other row with its similarity to the query. The query row is
    # removed explicitly instead of assuming it sorts first: another row that
    # ties its score could otherwise displace it and leave the query itself
    # in the results.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top-n most similar rows.
    movie_indices = [i for i, _ in sim_scores[:max(top_n, 0)]]

    return df['title'].iloc[movie_indices]


In [None]:
# Cosine-similarity recommendations for one sample title.
recommend_movies(title='Blood & Water')

5344    Message from the King
1884      Walk Away from Love
4285                    Lilli
4271               Lion Pride
4209               Next Enti?
Name: title, dtype: object

In [None]:
from sklearn.neighbors import NearestNeighbors

# Vectorise the raw descriptions with TF-IDF, ignoring English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['description'])

# Brute-force nearest-neighbour index over the TF-IDF rows, using cosine distance.
nn_model = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
nn_model.fit(tfidf_matrix)

# Build the Recommendation Function using KNN
def recommend_movies_knn(title, nn_model=nn_model, df=df, top_n=5):
    """Recommend titles via a fitted nearest-neighbours model.

    Reads the module-level `tfidf_matrix` to obtain the query vector, so
    `nn_model`, `tfidf_matrix` and `df` must all describe the same rows in the
    same order.

    Args:
        title: Exact value to look up in df['title'] (compared with ==).
        nn_model: Fitted model exposing `kneighbors` and `n_samples_fit_`.
        df: Catalogue DataFrame with a 'title' column.
        top_n: Maximum number of recommendations; values below 0 are treated
            as 0.

    Returns:
        A pandas Series (taken from df['title']) with up to `top_n` titles in
        the order returned by `kneighbors`. The queried row itself is never
        included.

    Raises:
        IndexError: If `title` does not occur in df['title'].
    """
    # Use the row *position*, not the index label: tfidf_matrix is addressed by
    # position, and labels only coincide with positions on a 0..n-1 index.
    positions = (df['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Title {title!r} not found in df['title']")
    idx = int(positions[0])
    wanted = max(top_n, 0)

    # Get the TF-IDF vector for the user-provided movie
    movie_tfidf = tfidf_matrix[idx]

    # Request one extra neighbour because the query is expected among its own
    # neighbours, but never more than the model was fitted on (kneighbors
    # raises ValueError in that case).
    n_neighbors = min(wanted + 1, nn_model.n_samples_fit_)
    _, neighbor_idx = nn_model.kneighbors(movie_tfidf, n_neighbors=n_neighbors)

    # Drop the query by position instead of assuming it is the first hit: a row
    # at the same distance can be returned ahead of it. Indexing [0] (rather
    # than squeeze()) also keeps a single-neighbour result one-dimensional.
    movie_indices = [int(i) for i in neighbor_idx[0] if i != idx][:wanted]

    # Return the top-n most similar movies
    return df['title'].iloc[movie_indices]


In [None]:
# Exercise the KNN recommender defined in the previous cell; this cell used to
# repeat the cosine-similarity call, so recommend_movies_knn was never run.
recommend_movies_knn('Blood & Water')

5344    Message from the King
1884      Walk Away from Love
4285                    Lilli
4271               Lion Pride
4209               Next Enti?
Name: title, dtype: object

In [None]:
# Cosine-similarity recommendations for a second sample title.
recommend_movies(title='Lion Pride')

1884    Walk Away from Love
1941             Ishq Vishk
2323         The Mirror Boy
3843         Away From Home
1             Blood & Water
Name: title, dtype: object