In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw movie metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning pandas
# can emit when a column contains both numeric and string values.
df = pd.read_csv('movies_metadata.csv', low_memory=False)
df.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

A knowledge-based recommender system queries the user for their specifications and then filters the results according to those specifications. In this case the specifications are the 
1. Genres of movies 
2. Duration
3. Timeline  
The system then recommends the movies that meet the above criteria, ranked by their weighted rating.

In [4]:
# Keep only the columns the recommender needs.
# .copy() makes the result an independent frame, so adding or overwriting
# columns on it later does not raise SettingWithCopyWarning.
df = df[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [5]:
#convert release date into pandas datetime format
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce') #coerce: invalid dates become NaT

#extracting year of release
# A `!= np.nan` test cannot detect missing values (NaN never compares equal to
# anything, so the test is always True). The .dt accessor handles them
# directly: rows whose date is NaT get a NaN year, all others get the calendar
# year. The column is float at this point because it may contain NaN.
df['year'] = df['release_date'].dt.year

In [6]:
def convert_int(x):
    """Convert ``x`` to an ``int``, falling back to 0 when that is impossible.

    Parameters
    ----------
    x : object
        Any value; typically a number or a numeric string.

    Returns
    -------
    int
        ``int(x)``, or ``0`` if ``x`` cannot be converted (e.g. ``None``,
        ``NaN``, infinity, or a non-numeric string such as ``'NaT'``).
    """
    try:
        return int(x)
    # Catch only the errors int() raises for unconvertible values; a bare
    # ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, OverflowError):
        return 0

In [7]:
# Coerce every year to a plain integer (convert_int yields 0 for values it
# cannot convert), then discard the release date column now that the year
# has been extracted from it.
df['year'] = df['year'].map(convert_int)
df = df.drop(columns=['release_date'])

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [8]:
# Peek at the raw genres value of the first movie.
df['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [10]:
#import literal eval as function
from ast import literal_eval


def _genre_names(raw):
    """Return the lower-cased genre names stored in one ``genres`` cell.

    ``raw`` may be:
    * a string holding a Python-literal list of ``{'id': ..., 'name': ...}``
      dicts (the form stored in the CSV),
    * an already parsed list (so re-running this cell does not fail), or
    * a missing / non-list value, which yields an empty list.
    """
    if isinstance(raw, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        return []
    names = []
    for entry in raw:
        if isinstance(entry, dict):
            names.append(entry['name'].lower())
        else:
            # Entry is already a plain genre name from a previous run of this cell.
            names.append(str(entry).lower())
    return names


df['genres'] = df['genres'].apply(_genre_names)

In [11]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,1995
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[comedy],106.0,5.7,173.0,1995


In [13]:
#Create new feature by exploding genres: one row per (movie, genre) pair
# Series.explode (pandas >= 0.25) repeats the index label once per list element,
# which avoids building a separate pd.Series for every row. Movies with an empty
# genre list explode to NaN; dropping those keeps `s` limited to real genres,
# and the left join below still retains such movies with a NaN genre.
s = df['genres'].explode().dropna()
s.name = 'genre'
gen_df = df.drop('genres', axis=1).join(s)
gen_df.head()

  


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,animation
0,Toy Story,81.0,7.7,5415.0,1995,comedy
0,Toy Story,81.0,7.7,5415.0,1995,family
1,Jumanji,104.0,6.9,2413.0,1995,adventure
1,Jumanji,104.0,6.9,2413.0,1995,fantasy


In [16]:
def build_chart(gen_df, percentile=0.8, genre=None, low_time=None, high_time=None,
                low_year=None, high_year=None):
    """Rank the movies matching the user's genre, runtime and year preferences.

    Each filter can be passed as an argument; any filter left as ``None`` is
    requested interactively via ``input()``, in the order genre, shortest
    duration, longest duration, earliest year, latest year.

    Parameters
    ----------
    gen_df : pandas.DataFrame
        Frame with one row per (movie, genre) pair and the columns ``genre``,
        ``runtime``, ``year``, ``vote_average`` and ``vote_count``.
    percentile : float, default 0.8
        Quantile of ``vote_count`` (among the filtered movies) used as the
        minimum number of votes ``m`` a movie needs to be ranked.
    genre : str, optional
        Wanted genre; matched case-insensitively after trimming whitespace.
    low_time, high_time : int, optional
        Inclusive runtime bounds.
    low_year, high_year : int, optional
        Inclusive release-year bounds.

    Returns
    -------
    pandas.DataFrame
        The qualifying movies with an extra ``score`` column, sorted by score
        in descending order. Empty (but still carrying a ``score`` column) when
        nothing matches the filters.

    Raises
    ------
    ValueError
        If an interactively entered duration or year is not an integer.
    """
    if genre is None:
        genre = _ask("Input preferred genre")
    if low_time is None:
        low_time = _ask("Input shortest duration", int)
    if high_time is None:
        high_time = _ask("Input longest duration", int)
    if low_year is None:
        low_year = _ask("Input earliest year", int)
    if high_year is None:
        high_year = _ask("Input highest year", int)

    # Normalise the requested genre so stray spaces or capitals still match.
    wanted = str(genre).strip().lower()

    genre_col = gen_df['genre']
    selected = (
        genre_col.notna()
        & (genre_col.astype(str).str.lower() == wanted)
        & gen_df['runtime'].between(low_time, high_time)  # inclusive on both ends
        & gen_df['year'].between(low_year, high_year)
    )
    movies = gen_df[selected]

    # C: mean rating of the filtered movies; m: vote-count cutoff.
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    # Copy so that adding the score column never touches the caller's frame.
    q_movies = movies.loc[movies['vote_count'] >= m].copy()

    # Weighted rating: v/(v+m) * R + m/(m+v) * C, computed column-wise so that
    # it also works when no movie qualifies.
    votes = q_movies['vote_count']
    q_movies['score'] = (votes / (votes + m) * q_movies['vote_average']
                         + m / (m + votes) * C)

    return q_movies.sort_values('score', ascending=False)


def _ask(prompt, cast=str):
    """Print ``prompt``, read one line from the user and return it passed through ``cast``."""
    print(prompt)
    return cast(input())

In [17]:
# Ask for the filters interactively and show the five best-scoring matches.
build_chart(gen_df).head(n=5)

Input preferred genre
action
Input shortest duration
80
Input longest duration
120
Input earliest year
1990
Input highest year
2010


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
9430,Oldboy,120.0,8.0,2000.0,2003,action,7.809534
6725,Kill Bill: Vol. 1,111.0,7.7,5091.0,2003,action,7.630827
723,Ghost in the Shell,83.0,7.8,854.0,1995,action,7.42621
8234,The Incredibles,115.0,7.4,5290.0,2004,action,7.342213
13286,Ip Man,108.0,7.5,1309.0,2008,action,7.274764


In [19]:
# Save the cleaned metadata frame `df` (not the recommendation chart) to CSV;
# index=False leaves the row index out of the file.
df.to_csv('metadata_clean.csv', index=False)