# This is a project about recommendation system using based content

In [1]:
#Import library
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sphinx.addnodes import index

# Data collection and Data pre-processing

In [2]:
# loading data 
data = pd.read_csv('./movies.csv')
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
col_data = data.columns.to_list()
print(col_data)

['index', 'budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew', 'director']


In [4]:
#Number of rows and columns in dataset
data.shape

(4803, 24)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

### Summary 
- Categorical: original_language, status, director
- Numerical(continuous): index, budget, id
- Numerical(discrete): popularity, vote, runtimes, vote_average, vote_count
- String: genres, keywords, original_title
- Object: spoken_language, production_companies, production_countries, crew
- Date: release_date

In [6]:
missed_col = []
for col in col_data:
    missing_count = data[col].isnull().sum()
    
    if missing_count > 0:
        missed_col.append(col)
        print(f'Colum {col} has {missing_count*100 / data.shape[0]:.2f}% value has been lost')

Colum genres has 0.58% value has been lost
Colum homepage has 64.36% value has been lost
Colum keywords has 8.58% value has been lost
Colum overview has 0.06% value has been lost
Colum release_date has 0.02% value has been lost
Colum runtime has 0.04% value has been lost
Colum tagline has 17.57% value has been lost
Colum cast has 0.90% value has been lost
Colum director has 0.62% value has been lost


#### Because type of missed columns is `string`, so we will fill missing value by empty string 

In [7]:
for col in missed_col:
    data[col].fillna("" , inplace=True)

In [8]:
key_args = {
    'original_language' : 'category',
    'status' : 'category',
    'director' : 'category',
}
data['release_date'] = pd.to_datetime(data['release_date'])
data = data.astype(key_args)
print(data.dtypes)

index                            int64
budget                           int64
genres                          object
homepage                        object
id                               int64
keywords                        object
original_language             category
original_title                  object
overview                        object
popularity                     float64
production_companies            object
production_countries            object
release_date            datetime64[ns]
revenue                          int64
runtime                         object
spoken_languages                object
status                        category
tagline                         object
title                           object
vote_average                   float64
vote_count                       int64
cast                            object
crew                            object
director                      category
dtype: object


### Distribution of Numerical features

In [9]:
data.describe()

Unnamed: 0,index,budget,id,popularity,release_date,revenue,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4802,4803.0,4803.0,4803.0
mean,2401.0,29045040.0,57165.484281,21.492301,2002-12-27 23:45:54.352353280,82260640.0,6.092172,690.217989
min,0.0,0.0,5.0,0.0,1916-09-04 00:00:00,0.0,0.0,0.0
25%,1200.5,790000.0,9014.5,4.66807,1999-07-14 00:00:00,0.0,5.6,54.0
50%,2401.0,15000000.0,14629.0,12.921594,2005-10-03 00:00:00,19170000.0,6.2,235.0
75%,3601.5,40000000.0,58610.5,28.313505,2011-02-16 00:00:00,92917190.0,6.8,737.0
max,4802.0,380000000.0,459488.0,875.581305,2017-02-03 00:00:00,2787965000.0,10.0,13752.0
std,1386.651002,40722390.0,88694.614033,31.81665,,162857100.0,1.194612,1234.585891


### Distribution of category features

In [10]:
data.describe(include='category')

Unnamed: 0,original_language,status,director
count,4803,4803,4803.0
unique,37,3,2350.0
top,en,Released,
freq,4505,4795,30.0


In [12]:
#Combining all the five selected features 
combined_features = data['genres'] + ' '+data['keywords'] + ' '+ data['tagline'] +' ' + data['cast'] + ' ' + data['director']
combined_features

TypeError: Object with dtype category cannot perform the numpy op add

In [13]:
#Coverting the text data to features vectors
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

NameError: name 'combined_features' is not defined

In [None]:
print(feature_vectors[1])

# Cosine Similarity

In [None]:
similarity = cosine_similarity(feature_vectors)
similarity

In [None]:
# getting the movie name from user:
movie_name = input("Enter your favourite movie name:")

In [None]:
#Creating a list with all the movie names given in the dataset
list_title = data['title'].to_list()

In [None]:
find_close_match = difflib.get_close_matches(movie_name, list_title)
find_close_match

In [None]:
close_match = find_close_match[0]

In [None]:
#Finding the index movies with title
index_of_movie = data[data['title'] == close_match]['index'].values[0]
index_of_movie

In [None]:
# getting a list of similar movies  
similarity_of_movie = list(enumerate(similarity[index_of_movie]))
similarity_of_movie

In [None]:
# Sorting the movies based on their similarity score
sorted_similarity = sorted(similarity_of_movie, key= lambda x: x[1], reverse=True)
# sorted_similarity = list(filter(lambda x : x[1] > 0, sorted_similarity))
sorted_similarity

# Movie Recommendation

In [None]:


movie_name = input(' Enter your favourite movie name : ')

list_titles = data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_titles)

if find_close_match == []:
    print("Can't find your movie please try again...!")
else:
    close_match = find_close_match[0]
    
    index_of_the_movie = data[data.title == close_match]['index'].values[0]
    
    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    
    sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
    
    print('Movies suggest for you:\n')
    i = 0
    
    for movie in sorted_similar_movies:
        index = movie[0]
        title_from_index = data[data.index==index]['title'].values[0]
        if i<30:
            print(i, ':',title_from_index)
            i+=1