In [1]:
import pandas as pd
import numpy as np
import copy
import re
import math
from scipy import spatial
from sklearn.neighbors import NearestNeighbors


In [2]:
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


There are a lot of Nan values, which can affect our predictions.
Here, we have 2 options
1. Remove Rows with Nan values altogether
2. replace some other value

We will replace the values with String --> 'missing', as other features might come in handy for predictions for a particular item in the dataset.

In [4]:
df.fillna('missing', inplace = True)

**The columns considered for predictions are as follows:**
1. country
2. release_year
3. rating 
4. duration
5. listed_in


In [5]:
recommendation_cols = ['country', 'release_year', 'rating', 'duration', 'listed_in']
df_new = copy.deepcopy(df[recommendation_cols])

In [6]:
df_new.head()

Unnamed: 0,country,release_year,rating,duration,listed_in
0,United States,2020,PG-13,90 min,Documentaries
1,South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries"
2,missing,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
3,missing,2021,TV-MA,1 Season,"Docuseries, Reality TV"
4,India,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ..."


In [7]:
country = []
release_year = [] 
rating = []
duration = [] 
genres = []

In [8]:
def split_by_delimeters(target_list):
    """
    this method splits a target list by some delimeters
    """
    result = []
    for i in target_list:
        delimiters = ",", "&"
        regex_pattern = '|'.join(map(re.escape, delimiters))
        result.extend(re.split(regex_pattern, i))
    result = [i.strip() if i not in ['', 'missing'] else i for i in result]
    return result

In [9]:
# preparing all columns for the dataset
country = list(set(split_by_delimeters(df_new['country'])))
release_year = list(set(df_new['release_year']))
release_year = [str(i) for i in release_year]
ratings = list(set(df_new['rating']))
seasons_durations = ['1_season', '2_season', '3_season', '4_season','5+_season']
movies_durations = ['0_25_min', '26_50_min', '51_75_min', '76_100_min', 
                    '101_125_min', '126_150_min', '151+_min' ]
durations = seasons_durations + movies_durations
genres = list(set(split_by_delimeters(df_new['listed_in'])))

In [10]:
# combining all columns for the one hot encoded vector form
all_columns = country + release_year + ratings + durations + genres
all_columns.remove('missing')

In [11]:
# initializes a df with '0' values for the one-hot-encoded vector
ohe_df = pd.DataFrame(0, index = np.arange(len(df_new)), columns = all_columns)

In [12]:
def duration_adjustment(duration: str) -> str:
    try:
        dur_list = []
        if 'Season' in duration:
            temp_res = duration.split()
            no_of_seasons = int(temp_res[0])
            if no_of_seasons <5:
                return seasons_durations[no_of_seasons - 1]
            return seasons_durations[-1]

        else:
            temp_res = duration.split()
            runtime_mins = int(temp_res[0])
            if runtime_mins <= 150:
                index = math.ceil((runtime_mins/25) - 1.0)
                return movies_durations[index]
            return movies_durations[-1]
    except:
        return 'missing'


In [13]:
def return_columns(row):
    """
    recieves a df row and returns the respective columns/features
    that the item i.e. movie falls in
    """
    result_cols = []
    result_cols.extend(split_by_delimeters([row['country']]))
    result_cols.extend(split_by_delimeters([row['listed_in']]))
    result_cols.append(str(row['release_year']))
    result_cols.append(row['rating'])
    result_cols.append(duration_adjustment(str(row['duration'])))
    if 'missing' in result_cols:
        result_cols.remove('missing')
    return result_cols
    

In [14]:
# preparing the one hot encoded df of all items i.e. movies as vectors
for ind,row in df_new.iterrows():
    ohe_df.loc[ind, return_columns(row)] = 1

In [15]:
ohe_df.head()

Unnamed: 0,Unnamed: 1,Burkina Faso,Pakistan,Bulgaria,Bangladesh,Botswana,Ethiopia,Ghana,Ukraine,Soviet Union,...,Adventure,British TV Shows,TV Thrillers,Docuseries,Romantic TV Shows,Science,TV Mysteries,TV Shows,Movies,TV Comedies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [16]:
def recommend_by_cosine(movie, top_items):
    """
    recommends top_similar movies based on cosine similarity
    """
    movie_index = df[df['title'] == movie].index[0]
    vector = ohe_df.iloc[movie_index]
    distance = []
    for ind, row in ohe_df.iterrows():
        distance.append(spatial.distance.cosine(vector, row))
    
    indexes = sorted(range(len(distance)), key=lambda i: distance[i])[:top_items + 1]
    
    return list(df.iloc[indexes]['title'])[1:]
    

In [17]:
def recommend_by_knn(movie, top_items):
    """
    recommends top_similar movies based on knn algorithm
    """
    movie_index = df[df['title'] == movie].index[0]
    vector = ohe_df.iloc[movie_index]
    knn = NearestNeighbors(n_neighbors= top_items + 1, algorithm='auto')
    knn.fit(ohe_df.values)
    indexes = list(knn.kneighbors([vector], top_items + 1, return_distance=False)[0])
    return list(df.iloc[indexes]['title'])[1:]

**Recommendations by Cosine Similarity**

In [18]:
# the first index is the movie itself and the rest are recommendations
recommend_by_cosine('Pulp Fiction',15)

["Schindler's List",
 'Once Upon a Time in America',
 'Zodiac',
 'The Irishman',
 'A Clockwork Orange',
 'Lolita',
 'Magnolia',
 'Rain Man',
 'There Will Be Blood',
 'Gangs of New York',
 'GoodFellas',
 'Jackie Brown',
 'Do the Right Thing',
 'Django Unchained',
 'Boogie Nights']

**Movies Recommended by KNN**

In [19]:
# the first row is the movie itself and the rest are recommendations
recommend_by_knn('Friends', 10)

['Frasier',
 'Portlandia',
 'Community',
 'The Parkers',
 'Cheers',
 'Girlfriends',
 'Parks and Recreation',
 'Young & Hungry',
 '30 Rock',
 'The Office (U.S.)']