In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Movies_and_tv_shows_data.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### In order to use the recommendation system algorithm, I removed a few columns and created a new dataset. This was done to ensure that if anything goes wrong during the process, I would still have access to the original dataset.

In [3]:
df1 = df.drop(["date_added", "country", "release_year", "rating", "duration"], axis = 1)
df1.head()

### The "show id" column was in a string format, such as "s1," "s2," "s3," etc. I converted it into an integer format, such as 1, 2, 3, etc.

In [4]:
df1 = df.drop(["date_added", "country", "release_year", "rating", "duration"], axis = 1)
df1["show_id"] =  df1["show_id"].apply(lambda x : int((str(x))[1:]))

### Since a recommendation system is going to be used, any null values in the dataset were replaced with blank spaces to ensure smooth functioning of the system.

In [5]:
df1.fillna(" ",inplace = True)
df1.head()

Unnamed: 0,show_id,type,title,director,cast,listed_in,description
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,Documentaries,"As her father nears the end of his life, filmm..."
1,2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...","Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,4,TV Show,Jailbirds New Orleans,,,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...","International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### In this step, we determined the number of movies and TV shows present in the given dataset.

In [6]:
type_of = df1.groupby("type")["type"].agg("count").sort_values(ascending = False)
type_of

type
Movie      6131
TV Show    2676
Name: type, dtype: int64

### In order to use the count vectorizer and improve accuracy, we combined the necessary columns and converted all the text to lowercase.

In [7]:
df2 = df1["director"] + " " + df1["cast"] + " " + df1["listed_in"] + " " + df1["description"]
df2 = df2.apply(lambda x: x.lower())
df2.head()

0    kirsten johnson   documentaries as her father ...
1      ama qamata, khosi ngema, gail mabalane, thab...
2    julien leclercq sami bouajila, tracy gotoas, s...
3        docuseries, reality tv feuds, flirtations ...
4      mayur more, jitendra kumar, ranjan raj, alam...
dtype: object

### We applied the count vectorizer on the dataset and printed the resulting output in an array format.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
vector = CountVectorizer()
df_cv = vector.fit_transform(df2)
df_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### In this step, we used the cosine similarity algorithm to find the relation between movies and TV shows in the dataset. The results were presented in a float format.

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
similar = cosine_similarity(df_cv)
similar

array([[1.        , 0.0360492 , 0.15854801, ..., 0.10133892, 0.0946497 ,
        0.21873932],
       [0.0360492 , 1.        , 0.17646698, ..., 0.01735264, 0.        ,
        0.02996444],
       [0.15854801, 0.17646698, 1.        , ..., 0.07631865, 0.05346073,
        0.19767989],
       ...,
       [0.10133892, 0.01735264, 0.07631865, ..., 1.        , 0.13668171,
        0.0631754 ],
       [0.0946497 , 0.        , 0.05346073, ..., 0.13668171, 1.        ,
        0.07867373],
       [0.21873932, 0.02996444, 0.19767989, ..., 0.0631754 , 0.07867373,
        1.        ]])

### In this step, we created two lists of names, one for movies and one for TV shows, based on the values present in the "type" column of the given dataset.

In [12]:
tv_show_0 = df[(df.type == "TV Show")]
tv_show = tv_show_0.title
tv_show_list = tv_show.to_list()
tv_show_list

['Blood & Water',
 'Ganglands',
 'Jailbirds New Orleans',
 'Kota Factory',
 'Midnight Mass',
 'The Great British Baking Show',
 'Vendetta: Truth, Lies and The Mafia',
 'Bangkok Breaking',
 'Crime Stories: India Detectives',
 'Dear White People',
 'Falsa identidad',
 'Jaguar',
 'Monsters Inside: The 24 Faces of Billy Milligan',
 'Resurrection: Ertugrul',
 'Love on the Spectrum',
 'Chicago Party Aunt',
 'Sex Education',
 'Squid Game',
 'Tayo and Little Wizards',
 'Angry Birds',
 'Chhota Bheem',
 'He-Man and the Masters of the Universe',
 'The Smart Money Woman',
 'Castle and Castle',
 'Dharmakshetra',
 'Nailed It',
 'Numberblocks',
 'Raja Rasoi Aur Anya Kahaniyan',
 'Saved by the Bell',
 'Stories by Rabindranath Tagore',
 'Too Hot To Handle: Latino',
 'Jack Whitehall: Travels with My Father',
 "The World's Most Amazing Vacation Rentals",
 'Yowamushi Pedal',
 'Lucifer',
 'Metal Shop Masters',
 'Pokémon Master Journeys: The Series',
 'Titipo Titipo',
 'Mighty Raju',
 'Into the Night',
 'Th

In [13]:
movies_0 = df[~(df.type == "TV Show")]
movies = movies_0.title
movies_list = movies.to_list()
movies_list

['Dick Johnson Is Dead',
 'My Little Pony: A New Generation',
 'Sankofa',
 'The Starling',
 'Je Suis Karl',
 'Confessions of an Invisible Girl',
 "Europe's Most Dangerous Man: Otto Skorzeny in Spain",
 'Intrusion',
 'Avvai Shanmughi',
 'Go! Go! Cory Carson: Chrissy Takes the Wheel',
 'Jeans',
 'Minsara Kanavu',
 'Grown Ups',
 'Dark Skies',
 'Paranoia',
 'Ankahi Kahaniya',
 'The Father Who Moves Mountains',
 'The Stronghold',
 'Birth of the Dragon',
 'Jaws',
 'Jaws 2',
 'Jaws 3',
 'Jaws: The Revenge',
 'My Heroes Were Cowboys',
 'Safe House',
 'Training Day',
 'InuYasha the Movie 2: The Castle Beyond the Looking Glass',
 'InuYasha the Movie 3: Swords of an Honorable Ruler',
 'InuYasha the Movie 4: Fire on the Mystic Island',
 'InuYasha the Movie: Affections Touching Across Time',
 'Naruto Shippuden the Movie: Blood Prison',
 'Naruto Shippûden the Movie: Bonds',
 'Naruto Shippûden the Movie: The Will of Fire',
 'Naruto Shippuden: The Movie',
 'Naruto Shippuden: The Movie: The Lost Tower'

### Here is the final code for the recommendation system. I used the itemgetter and difflib libraries to find the movie or TV show that was searched for and to get 5 suggestions. If the movie you searched for is in the data set, it will show you movies with similar names. 

In [14]:
from operator import itemgetter as ig
import difflib as dl

In [15]:
search = input("What do you want to watch? Enter 'Movie' or 'TV Show': ")

if search.lower() == "movie":
    movie = input("Enter the movie name : ")
    close_match = (dl.get_close_matches(movie.capitalize(), movies_list))[0]
    print(close_match)
    l = list(enumerate(similar[int(df1[df1["title"]==close_match].values[0][0])]))
    match = sorted(l, key = ig(1,0), reverse = True)
    final_match = match[1:6]
    print("Movies suggested for you : \n")
    for i in final_match:
            print(df1.iloc[i[0]]["title"])
elif search.lower() == "tv show":
    tv_show = input("Enter the tv show name : ")
    close_match = (dl.get_close_matches(tv_show.capitalize(), tv_show_list))[0]
    print(close_match)
    l = list(enumerate(similar[int(df1[df1["title"]==close_match].values[0][0])]))
    match = sorted(l, key = ig(1,0), reverse = True)
    final_match = match[1:6]
    print("Tv Shows suggested for you : \n")
    for i in final_match:
         print(df1.iloc[i[0]]["title"])
else:
    print("Sorry, I didn't understand your choice.")

What do you want to watch? Enter 'Movie' or 'TV Show': Movie
Enter the movie name : Jaw
Jaws
Movies suggested for you : 

Jaws
Jaws: The Revenge
Exes Baggage
The River Runner
Psycho
