## Import all necessary libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as tf_idf
import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [None]:
#read in the netflix file; the relative path means the CSV must sit in the notebook's working directory
movies=pd.read_csv('netflix_titles.csv')

In [None]:
#(rows, columns) of the raw data; the original note reported 8807 movies (rows).
#NOTE(review): the original note also said seven columns, but seven is the number kept after the
#five drops further down, so the raw file should show twelve here -- confirm against the output.
movies.shape

In [None]:
movies.head() #preview the first rows of the raw data

## Preprocessing Begins

In [None]:
#list of columns that will be needed
#show_id
#type
#title
#cast
#country
#listed_in
#description

In [None]:
#drop the columns the recommender does not use, in one call that returns the trimmed frame
movies=movies.drop(columns=["date_added","release_year","duration","rating","director"])

In [None]:
movies.isnull().sum() #count of null values per remaining column

In [None]:
#dropping missing values: any row with a null in one of the remaining columns is removed,
#so the later string operations never receive a missing value
movies=movies.dropna()

In [None]:
movies.isnull().sum() #checking to make sure there are no null values anymore

In [None]:
movies.shape #(rows, columns) left after the drops and dropna above

In [None]:
#show the first row's value for each text column before it is split into a list
for column in ('cast','listed_in','country','description','type'):
    print(movies.iloc[0][column])

Since these columns are stored as strings but are needed as lists, I'll apply split to each column so that it returns a list.

In [None]:
#comma-separated columns become lists of entries
for column in ('cast','listed_in','country','type'):
    movies[column]=movies[column].apply(lambda entry: entry.split(','))
#the description is split on whitespace into individual words
movies['description']=movies['description'].apply(str.split)

In [None]:
#show the first row again: each of these columns now holds a list
for column in ('cast','listed_in','country','description','type'):
    print(movies.iloc[0][column])

In [None]:
#create a function to get the first three names of the cast members
def get_three_people(obj):
    """Return a list holding at most the first three items of ``obj``.

    ``obj`` is iterated in order; iteration stops as soon as three items
    have been collected. Shorter inputs are returned in full as a new list.
    """
    selected=[]
    for member in obj:
        if len(selected) == 3:
            break
        selected.append(member)
    return selected

In [None]:
 #keep only the first three cast members of every title
 movies['cast']=movies['cast'].apply(get_three_people) 

In [None]:
#renaming the columns to look better
movies=movies.rename(columns={
    'type':'Type',
    'title':'Title',
    'cast':'Cast',
    'country':'Country',
    'listed_in':'Genre',
    'description':'Overview',
})

In [None]:
movies.head() #the columns are now renamed lists, but there is still some work to do

Since each value is now a list, I need to remove the spaces inside each entry. So, 'Mary Berry' becomes 'MaryBerry'.
This creates uniformity and keeps each name together as a single term.

In [None]:
#remove the spaces inside every entry of the list-valued columns
for column in ('Genre','Cast','Country','Type'):
    movies[column]=movies[column].apply(lambda entries: [entry.replace(" ","") for entry in entries])

In [None]:
#Now add all the useful features under a new column called Tags.
#Each column holds a list per row, so + joins the five lists of a row into one list.
movies['Tags']=movies['Cast']+movies['Country']+movies['Genre']+movies['Overview']+movies['Type']

In [None]:
#creating a new dataframe with the new important columns; .copy() makes it independent of `movies`,
#so the later assignment to Netflix_df['Tags'] does not trigger pandas' SettingWithCopyWarning
Netflix_df=movies[['show_id','Title','Tags']].copy()

In [None]:
Netflix_df.head() #Tags is still a list of strings per row at this point

In [None]:
#converting back to string: the entries of each row's list are joined with single spaces
Netflix_df['Tags']=Netflix_df['Tags'].apply(" ".join)

In [None]:
#resets index to 0..n-1 (the old labels are kept in a new 'index' column); the recommendation
#function uses a row's index label as its position in the similarity matrix, so the two must line up
Netflix_df=Netflix_df.reset_index()

In [None]:
Netflix_df.head() #Tags is now one string per row and the index has been reset

Create a function that extracts the part of speech of a word and converts it into the format the lemmatizer expects.

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the resulting tag selects the constant: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial both resolve to noun
    return wordnet.NOUN

Now to lemmatize each of the words to the root word while also considering context.

In [None]:
def lemmatizer(word):
    """Lemmatize every whitespace-separated token in ``word``.

    Despite its name, ``word`` is the full Tags string of a row when this
    function is used with ``Series.apply``. Handing that whole string to
    ``WordNetLemmatizer.lemmatize`` treats it as one word instead of
    lemmatizing the words inside it. The text is therefore split into
    tokens, the tokens are tagged together in one ``nltk.pos_tag`` call so
    the tagger sees them in context, and each token is lemmatized with its
    own part of speech.

    Parameters
    ----------
    word : str
        A single word or a whitespace-separated string of words.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces. Input that contains
        no tokens (empty or whitespace only) is returned unchanged.
    """
    tokens = word.split()
    if not tokens:
        return word
    # same tag-initial -> WordNet POS mapping as get_wordnet_pos, with noun as the fallback
    pos_by_initial = {"J": wordnet.ADJ,
                      "N": wordnet.NOUN,
                      "V": wordnet.VERB,
                      "R": wordnet.ADV}
    # named so that it no longer shadows this function's own name
    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join(
        wordnet_lemmatizer.lemmatize(token, pos_by_initial.get(tag[0].upper(), wordnet.NOUN))
        for token, tag in nltk.pos_tag(tokens)
    )

In [None]:
#run lemmatizer on every row; each row's whole Tags string is passed as its `word` argument
Netflix_df['Tags']=Netflix_df['Tags'].apply(lemmatizer) 

A TF-IDF vectorizer converts the text into numeric vectors that weight each word by its frequency and importance. `fit_transform` learns the vocabulary from the Tags column and turns every row into a vector, and `toarray` returns the result as a dense array.

In [None]:
#TF-IDF vectorizer limited to 4000 features, with English stop words removed and text lower-cased
vectorizer=tf_idf(max_features=4000,stop_words='english',lowercase=True) 

In [None]:
#fit the vectorizer on the Tags text and convert the resulting matrix to a dense array (one row per title)
vectors=vectorizer.fit_transform(Netflix_df['Tags']).toarray() 

In [None]:
vectors.shape #(rows, features): the original run reported 7305 movies with 4000 features (words) each

In [None]:
#this shows each of the features. get_feature_names() was deprecated in scikit-learn 1.0 and
#removed in 1.2; get_feature_names_out() is its replacement and returns the names as an array
vectorizer.get_feature_names_out()

## and that's all with preprocessing. 

Cosine similarity measures how similar each pair of movies is, based on their TF-IDF feature vectors.

In [None]:
#pairwise cosine similarity between the rows of `vectors`: similarity[i][j] compares title i with title j
similarity=cosine_similarity(vectors) 

In [None]:
similarity #display the similarity matrix

In [None]:
sorted(similarity[3],reverse=True) #similarity scores of the 4th movie against every movie, highest first

Now it is time to write the recommendation function. The function takes a Title argument and looks up the index of that title in the dataset. It uses the index to fetch the movie's similarity scores against every movie. The scores are sorted in descending order, the top entry (normally the movie itself) is skipped, and the next nine are kept. Their titles are then retrieved from the dataset and printed.

In [None]:
def recommendation(Title, top_n=9):
    """Print the titles most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Exact title to look up in ``Netflix_df['Title']``. When several rows
        share the title, the first one is used.
    top_n : int, optional
        Number of recommendations to print. Defaults to 9, the number the
        original ``[1:10]`` slice produced.

    Raises
    ------
    IndexError
        If ``Title`` is not present in ``Netflix_df``. The exception type is
        the one the unguarded ``.index[0]`` lookup raised before; only the
        message is new.

    Notes
    -----
    Reads the module-level ``Netflix_df`` and ``similarity``. The row's index
    label is used as its position in ``similarity``, which relies on
    ``Netflix_df`` having a 0..n-1 index.
    """
    matching_rows = Netflix_df[Netflix_df['Title'] == Title].index
    if len(matching_rows) == 0:
        raise IndexError("Title {!r} was not found in Netflix_df".format(Title))
    movie_index = matching_rows[0]
    distances = similarity[movie_index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])
    # Drop the queried movie by position instead of assuming it sorts first:
    # another row with an equal top score could otherwise push it into the results.
    picks = [position for position, _ in ranked if position != movie_index][:top_n]
    for position in picks:
        print(Netflix_df.iloc[position].Title)

In [None]:
recommendation("Khoobsurat") #example call: prints the titles most similar to "Khoobsurat"

In [None]:
#Now to deploy locally. Use pickle to dump files so that you can load them later.
#The with-block closes the file handle as soon as the dump has finished.
with open('Netflix_dict.pkl','wb') as dict_file:
    pickle.dump(Netflix_df.to_dict(),dict_file)

In [None]:
#Persist the similarity matrix. pickle.dump returns None, so its result is not stored,
#and the with-block closes the file handle as soon as the dump has finished.
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similarity,similarity_file)

## The END