## Import all Necessary libraries

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [6]:
movies=pd.read_csv('netflix_titles.csv') #read in the netflix file

In [7]:
movies.shape #dataset has seven columns, and 8807 movies(rows)

(8807, 12)

In [8]:
movies.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


## Preprocessing Begins

In [9]:
#list of columns that will be needed
#show_id
#type
#title
#cast
#country
#listed_in
#description

In [10]:
#drop unnecessary columns
movies.drop("date_added", axis=1, inplace=True)
movies.drop("release_year", axis=1, inplace=True)
movies.drop("duration", axis=1, inplace=True)
movies.drop("rating", axis=1, inplace=True)
movies.drop('director',axis=1, inplace=True)

In [11]:
movies.isnull().sum() #checking for null values in needed columns

show_id          0
type             0
title            0
cast           825
country        831
listed_in        0
description      0
dtype: int64

In [12]:
movies=movies.dropna() #since there are lots of movies, dropping these null values shouldn't hurt. Note this is because country
#and cast are important columns for the method i'm about to use if not I can just drop the whole column. 

In [13]:
movies.isnull().sum()#checking to make sure there are no null values anymore

show_id        0
type           0
title          0
cast           0
country        0
listed_in      0
description    0
dtype: int64

In [14]:
movies.shape #Now left with 7305 movies. The dataset is updated till the last 6 months so it is recent and 7000+ is a fair number

(7305, 7)

In [15]:
movies.iloc[0]['cast'] #returns a string list of names

'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng'

In [16]:
#The split function returns a list which is the required dataset for this method i'm about to use. 
movies['cast']=movies['cast'].apply(lambda x:x.split(','))

In [17]:
movies.iloc[0]['cast'] #returns them in a list form 

['Ama Qamata',
 ' Khosi Ngema',
 ' Gail Mabalane',
 ' Thabang Molaba',
 ' Dillon Windvogel',
 ' Natasha Thahane',
 ' Arno Greeff',
 ' Xolile Tshabalala',
 ' Getmore Sithole',
 ' Cindy Mahlangu',
 ' Ryle De Morny',
 ' Greteli Fincham',
 ' Sello Maake Ka-Ncube',
 ' Odwa Gwanya',
 ' Mekaila Mathys',
 ' Sandi Schultz',
 ' Duane Williams',
 ' Shamilla Miller',
 ' Patrick Mofokeng']

In [18]:
#create a function to get the first three names of the cast members
def get_three_people(obj):
    l=[]
    counter=0
    for i in (obj):
        if counter != 3:
            l.append(i)
            counter+=1
        else:
            break
    return l

In [19]:
 movies['cast']=movies['cast'].apply(get_three_people) #applying the function to the column. This leaves the column with three
#names per movie cast. 

In [20]:
movies.iloc[0]['cast']

['Ama Qamata', ' Khosi Ngema', ' Gail Mabalane']

In [21]:
movies['listed_in']=movies['listed_in'].apply(lambda x:x.split(',')) #The split function returns a list which is the required 
#dataset for this method i'm about to use. 

In [22]:
movies['country']=movies['country'].apply(lambda x:x.split(',')) #The split function returns a list which is the required 
#dataset for this method i'm about to use. 

In [23]:
movies.iloc[0]['description'] #returns a stringnbut it needs to be in a list

'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.'

In [24]:
movies['description']=movies['description'].apply(lambda x:x.split())

In [25]:
movies['description'].iloc[0] #returns a list of the words

['After',
 'crossing',
 'paths',
 'at',
 'a',
 'party,',
 'a',
 'Cape',
 'Town',
 'teen',
 'sets',
 'out',
 'to',
 'prove',
 'whether',
 'a',
 'private-school',
 'swimming',
 'star',
 'is',
 'her',
 'sister',
 'who',
 'was',
 'abducted',
 'at',
 'birth.']

In [26]:
movies['type']=movies['type'].apply(lambda x:x.split(','))#The split function returns a list which is the required
#dataset for this method i'm about to use. 

In [27]:
#renaming the columns to look better
movies=movies.rename(columns={'type':'Type','title':'Title','cast':'Cast','country':'Country','listed_in':'Genre','description':'Overview'})

In [28]:
movies.head() #Now we have it how we want but there are still some works to do

Unnamed: 0,show_id,Type,Title,Cast,Country,Genre,Overview
1,s2,[TV Show],Blood & Water,"[Ama Qamata, Khosi Ngema, Gail Mabalane]",[South Africa],"[International TV Shows, TV Dramas, TV Myste...","[After, crossing, paths, at, a, party,, a, Cap..."
4,s5,[TV Show],Kota Factory,"[Mayur More, Jitendra Kumar, Ranjan Raj]",[India],"[International TV Shows, Romantic TV Shows, ...","[In, a, city, of, coaching, centers, known, to..."
7,s8,[Movie],Sankofa,"[Kofi Ghanaba, Oyafunmike Ogunlano, Alexandr...","[United States, Ghana, Burkina Faso, United...","[Dramas, Independent Movies, International M...","[On, a, photo, shoot, in, Ghana,, an, American..."
8,s9,[TV Show],The Great British Baking Show,"[Mel Giedroyc, Sue Perkins, Mary Berry]",[United Kingdom],"[British TV Shows, Reality TV]","[A, talented, batch, of, amateur, bakers, face..."
9,s10,[Movie],The Starling,"[Melissa McCarthy, Chris O'Dowd, Kevin Kline]",[United States],"[Comedies, Dramas]","[A, woman, adjusting, to, life, after, a, loss..."


In [29]:
#remember every of the necessary row is now in list format. Now for each of the items under each column, I'll clean up spaces
#between each words. So that a list of names ['Alimi Qudirah', 'Qudirah Omotayo'] becomes  ['AlimiQudirah', 'QudirahOmotayo']. 
#this is done for all columns

In [30]:

movies['Genre']=movies['Genre'].apply(lambda x: [i.replace(" ","") for i in x]) 

In [31]:
movies['Cast']=movies['Cast'].apply(lambda x: [i.replace(" ","") for i in x])

In [32]:
movies['Country']=movies['Country'].apply(lambda x: [i.replace(" ","") for i in x])

In [33]:
movies['Type']=movies['Type'].apply(lambda x: [i.replace(" ","") for i in x])

In [34]:
#Now add all the useful features under a new column called Tag
movies['Tags']=movies['Cast']+movies['Country']+movies['Genre']+movies['Overview']

In [35]:
movies['Tags']=movies['Tags']+movies['Type'] #I forgot Tag

In [36]:
Netflix_df=movies[['show_id','Title','Tags']] #creating a new dataframe with the new important columns

In [37]:
Netflix_df.head()

Unnamed: 0,show_id,Title,Tags
1,s2,Blood & Water,"[AmaQamata, KhosiNgema, GailMabalane, SouthAfr..."
4,s5,Kota Factory,"[MayurMore, JitendraKumar, RanjanRaj, India, I..."
7,s8,Sankofa,"[KofiGhanaba, OyafunmikeOgunlano, AlexandraDua..."
8,s9,The Great British Baking Show,"[MelGiedroyc, SuePerkins, MaryBerry, UnitedKin..."
9,s10,The Starling,"[MelissaMcCarthy, ChrisO'Dowd, KevinKline, Uni..."


In [38]:
Netflix_df.shape

(7305, 3)

In [51]:
Netflix_df['Tags']=Netflix_df['Tags'].apply(lambda x:" ".join(x)) #returns a string datatype. this way we can apply lower() to make it 
#case insensitive. 

In [52]:
Netflix_df['Tags'].iloc[0]

'AmaQamata KhosiNgema GailMabalane SouthAfrica InternationalTVShows TVDramas TVMysteries After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth. TVShow'

In [53]:
Netflix_df['Tags']=Netflix_df['Tags'].apply(lambda x:x.lower()) 

In [None]:
Netflix_df=Netflix_df.reset_index() #resets index

In [56]:
cv=CountVectorizer(max_features=5000,stop_words='english') #using count vectorizer we perform vectorization i.e converting words
#into vectors

In [57]:
vectors=cv.fit_transform(Netflix_df['Tags']).toarray() #fit transform will basically fit the words into what we've set for count
#vectorizer and the toarray returns an array of vectors

In [60]:
vectors.shape #this shows for 7305 movies, there are 5000 features each.

(7305, 5000)

In [59]:
cv.get_feature_names() #this shows each of the features. it takes 5000 features as specified by the count vectorizer.

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '14th',
 '15',
 '16',
 '16th',
 '17',
 '17th',
 '18',
 '18th',
 '19',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1971',
 '1980',
 '1980s',
 '1989',
 '1990s',
 '19th',
 '20',
 '2011',
 '2015',
 '20th',
 '21st',
 '24',
 '25',
 '28',
 '30',
 '40',
 '40s',
 '40th',
 '50',
 '500',
 '60',
 '60s',
 '80s',
 '90s',
 'aamirkhan',
 'aaroneckhart',
 'aaronpaul',
 'abandoned',
 'abandons',
 'abduct',
 'abducted',
 'abduction',
 'abhaydeol',
 'abhishekbachchan',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abroad',
 'abruptly',
 'absence',
 'absurd',
 'abuse',
 'abusive',
 'academic',
 'academy',
 'accept',
 'accepting',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'acclaimed',
 'accompanied',
 'accompanies',
 'account',
 'accountant',
 'accounts',
 'accused',
 'accuses',
 'achieve',
 'act',
 'acting',
 'action',
 'actions',
 'activist',
 'activists',
 'activities',
 'activity

In [61]:
ps=PorterStemmer() #porterstemmer delivers roots of each words so that love,loving, loved are counted as same

In [62]:

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

        

In [63]:
Netflix_df['Tags']=Netflix_df['Tags'].apply(stem) 

## and that's all with preprocessing. 

In [64]:
similarity=cosine_similarity(vectors) #cosine similarity measures the similarity beween each feature words and movies

In [68]:
similarity

array([[1.        , 0.10526316, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.10526316, 1.        , 0.        , ..., 0.05407381, 0.11846978,
        0.0978232 ],
       [0.        , 0.        , 1.        , ..., 0.10540926, 0.11547005,
        0.19069252],
       ...,
       [0.        , 0.05407381, 0.10540926, ..., 1.        , 0.18257419,
        0.05025189],
       [0.        , 0.11846978, 0.11547005, ..., 0.18257419, 1.        ,
        0.05504819],
       [0.        , 0.0978232 , 0.19069252, ..., 0.05025189, 0.05504819,
        1.        ]])

In [67]:
sorted(similarity[3],reverse=True) #this shows the cosine similarity of the 4th movie with index 3 in decreasing order. 

[1.0000000000000002,
 0.40016336533252067,
 0.3535533905932738,
 0.3268602252303068,
 0.314970394174356,
 0.3142696805273545,
 0.3042903097250923,
 0.3042903097250923,
 0.3042903097250923,
 0.3042903097250923,
 0.3042903097250923,
 0.298142396999972,
 0.2946278254943948,
 0.2858309752375148,
 0.2842676218074806,
 0.2842676218074806,
 0.27216552697590873,
 0.27216552697590873,
 0.27216552697590873,
 0.26352313834736496,
 0.25717224993681986,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25197631533948484,
 0.25,
 0.24343224778007383,
 0.24343224778007383,
 0.24343224778007383,
 0.24343224778007383,
 0.24343224778007383,
 0.23570226039551587,
 0.23570226039551587,
 0.23570226039551587,
 0.23570226039551587,
 0.23570226039551587,
 0.2286647801900118,
 0.22360679774997902,
 0.22360679774997902,
 0.22360679774997902,
 0.22222222222222227,
 0.22222222222222227,
 0.21629522817435004,

In [None]:
#Now to write a recommendation function

In [69]:
def recommendation(Title): #takes in the argument Title
    movie_index=Netflix_df[Netflix_df['Title']==Title].index[0] #if the title correlates with a title column return index and 
    #store in movie index
    distances=similarity[movie_index] #return the similarity of the index in distance
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:21] #return first 20 movie index with correlation 
    #near to the specified movie
    for i in movies_list:
        print (Netflix_df.iloc[i[0]].Title) #return the title

In [70]:
recommendation("The Karate Kid") #up and working!!

Labyrinth
The Karate Kid Part II
Indiana Jones and the Last Crusade
Crocodile Dundee in Los Angeles
Indiana Jones and the Raiders of the Lost Ark
Enter the Dragon
The Spy Next Door
6 Bullets
Popeye
The Sleepover
Indiana Jones and the Temple of Doom
National Treasure
The Pirates! Band of Misfits
The Forbidden Kingdom
Double Dad
Carmen Sandiego: To Steal or Not to Steal
Star Wars: Episode VIII: The Last Jedi
Superfly
Never Back Down 2: The Beatdown
The Next Karate Kid


In [74]:
pickle.dump(Netflix_df.to_dict(),open('Netflix_dict.pkl','wb')) #NOw to deploy locally. Use pickpe to dump your files so that you
#can load them later. 

In [75]:
similarities=pickle.dump(similarity,open('similarity.pkl','wb'))

## The END