# MOVIE RECOMMENDATIONS: 

# Imports:
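    1. sklearn: for cosine_similarity and CountVectorizer
    2. rake_nltk: to analyze key phrases in text
    3. pandas: to store data from CSV file of around 5000 movies
    4. numpy: for the NaN marker used when filling in missing values
    5. difflib: for SequenceMatcher, used to match user input against movie titles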

In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from difflib import SequenceMatcher

# Reads data from CSV file and stores it into pandas df:
    -- Includes keywords of plot, the movie titles, genre, and director's name -- 

In [2]:
# Load the raw movie metadata, then keep only the four columns the
# recommender works with (title, genres, director, plot keywords).
df = pd.read_csv("movie_metadata.csv", encoding='utf-8')
df = df.loc[:, ['movie_title', 'genres', 'director_name', 'plot_keywords']]
df.head()

Unnamed: 0,movie_title,genres,director_name,plot_keywords
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,James Cameron,avatar|future|marine|native|paraplegic
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,Gore Verbinski,goddess|marriage ceremony|marriage proposal|pi...
2,Spectre,Action|Adventure|Thriller,Sam Mendes,bomb|espionage|sequel|spy|terrorist
3,The Dark Knight Rises,Action|Thriller,Christopher Nolan,deception|imprisonment|lawlessness|police offi...
4,Star Wars: Episode VII - The Force Awakens ...,Documentary,Doug Walker,


# Cleans data:
    - Gets rid of excess characters (e.g. replaces '|' with a space) and replaces any null entries with empty strings
    - Lowercases any capital titles

In [3]:
# Swap missing values for empty strings so every cell holds text.
df = df.replace(np.nan, '', regex = True)

# Both columns use '|' as a separator; turn it into a space so the
# individual keywords / genres become separate words.
for column in ('plot_keywords', 'genres'):
    df[column] = df[column].str.replace("|", " ", regex=False)

In [4]:
# Build one combined text field per movie: plot keywords, genres and director.
df['Key_words'] = df["plot_keywords"].map(str) + ' ' + df['genres'].map(str) + ' ' + df['director_name']

# Strip every run of non-ASCII characters from the object-dtype (text) columns.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace defaults
# to regex=False, which would treat the pattern as a literal string and leave
# the columns unchanged.
c = df.columns[df.dtypes == object]
df[c] = df[c].apply(lambda x: x.str.replace(r'[^\x00-\x7F]+', '', regex=True))
df.head()

Unnamed: 0,movie_title,genres,director_name,plot_keywords,Key_words
0,Avatar,Action Adventure Fantasy Sci-Fi,James Cameron,avatar future marine native paraplegic,avatar future marine native paraplegic Action ...
1,Pirates of the Caribbean: At World's End,Action Adventure Fantasy,Gore Verbinski,goddess marriage ceremony marriage proposal pi...,goddess marriage ceremony marriage proposal pi...
2,Spectre,Action Adventure Thriller,Sam Mendes,bomb espionage sequel spy terrorist,bomb espionage sequel spy terrorist Action Adv...
3,The Dark Knight Rises,Action Thriller,Christopher Nolan,deception imprisonment lawlessness police offi...,deception imprisonment lawlessness police offi...
4,Star Wars: Episode VII - The Force Awakens ...,Documentary,Doug Walker,,Documentary Doug Walker


# Vectorizing Data:
    - Generates similarity matrix with similarity indices based on how similar keywords are to one another 

In [5]:
# Turn each movie's combined keyword text into a word-count vector.
count = CountVectorizer()
count_matrix = count.fit_transform(df['Key_words'])

# Called with a single matrix, cosine_similarity compares its rows with
# each other, giving one similarity score per pair of movies.
sim_matrix = cosine_similarity(count_matrix)
print(sim_matrix)

[[1.         0.23145502 0.18257419 ... 0.         0.         0.        ]
 [0.23145502 1.         0.16903085 ... 0.         0.         0.        ]
 [0.18257419 0.16903085 1.         ... 0.14142136 0.         0.        ]
 ...
 [0.         0.         0.14142136 ... 1.         0.2        0.        ]
 [0.         0.         0.         ... 0.2        1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


# Get Movie Recommendations:
    - Finds the movies that are quite similar to user's request (a.k.a: ones with highest similarity scores)

In [6]:
def recommendations(title,num):
    """Return up to ``num`` titles most similar to ``title``.

    Finds the first row of the global ``df`` whose ``movie_title`` equals
    ``title``, ranks all rows by that row's scores in the global
    ``sim_matrix`` (highest first) and collects titles from the top of the
    ranking.

    Args:
        title: Movie title exactly as stored in ``df['movie_title']``.
        num: Maximum number of distinct titles to return. Values <= 0
            give an empty set.

    Returns:
        A set of movie titles. It never contains ``title`` itself and, being
        a set, carries no ranking order.

    Raises:
        IndexError: If no row of ``df`` has this title. ``main`` relies on
            this to detect movies that are not in the database.
    """
    titles = df['movie_title'].to_numpy()

    # Row positions (not index labels) of the requested title, because
    # sim_matrix is addressed by position.
    hits = np.flatnonzero((df['movie_title'] == title).to_numpy())
    idx = hits[0]  # raises IndexError when the title is absent

    # mergesort is stable, so rows with equal scores keep a deterministic order.
    ranked = pd.Series(sim_matrix[idx]).sort_values(ascending=False, kind='mergesort')

    recs = set()
    for pos in ranked.index:
        if len(recs) >= num:
            break
        candidate = titles[pos]
        # Skip the query itself (and duplicate rows of it) by title rather
        # than assuming it is ranked first; tied scores can put it anywhere.
        if candidate != title:
            recs.add(candidate)

    return recs

# Implements Process:
    - Gets a movie title (one that is in the database) from the user and prints a user-defined number of movie recommendations from the database
    - If the user enters a "movie franchise" (e.g. Star Wars, Harry Potter, etc.), it finds the title with the highest similarity ratio using SequenceMatcher and adjusts the input to match the database of 5000+ films

In [7]:
def similar(a, b):
    """Return the SequenceMatcher similarity ratio of ``a`` and ``b``.

    The ratio is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [8]:
def adjustToDatabase(movie):
    """Map user input onto a title stored in ``df['movie_title']``.

    An exact title is returned unchanged. Otherwise the input is treated as
    a prefix (e.g. a franchise name): of all stored titles that start with
    it, the one with the highest ``similar`` ratio to the input is returned,
    and the first such title wins a tie.

    Args:
        movie: Text entered by the user.

    Returns:
        A title from the database, or ``movie`` unchanged when it is empty
        or no stored title starts with it.
    """
    # An empty string is a prefix of every title; leave it alone so the
    # caller reports it as an unknown movie instead of picking a title.
    if not movie:
        return movie

    titles = df['movie_title']

    # Compare against the values: `x in series` would test the index labels.
    if (titles == movie).any():
        return movie

    best_title = movie
    best_score = -1
    for item in titles:
        if item.startswith(movie):
            score = similar(movie, item)
            if score > best_score:
                best_score = score
                best_title = item
    return best_title

In [9]:
def _read_count():
    """Prompt until the user enters a non-negative whole number and return it."""
    while True:
        print('How many movie recommendations would you like?:')
        try:
            number = int(input())
        except ValueError:
            number = -1
        if number >= 0:
            return number
        print('Please enter a non-negative whole number.')


def main():
    """Run the interactive recommendation prompt until the user declines to continue.

    Each round reads a movie title, resolves it with ``adjustToDatabase``,
    asks how many recommendations are wanted and prints the titles returned
    by ``recommendations``. A title that is not in the database produces an
    error message and a new prompt.
    """
    print('Select a movie from our vast selection of 5000+ movies and receive instant movie recommendations!')
    print()
    while True:
        print('Enter a movie:')
        movie = adjustToDatabase(input())

        # Probe with num=0: recommendations() raises IndexError for an
        # unknown title. The try covers only this call so that unrelated
        # errors are not reported as "movie unavailable".
        try:
            recommendations(movie,0)
        except IndexError:
            print()
            print('*ERROR: THIS MOVIE IS UNAVAILABLE IN OUR DATABASE.*')
            print('Please try another one.')
            print()
            continue

        number = _read_count()
        print()
        print('Here are some movie recommendations:')
        for i,item in enumerate(recommendations(movie,number)):
            print(str(i+1) + ". " + item)
        print()
        print('Would you like to enter another?')
        response = input().strip()
        print()
        # Slicing instead of indexing: an empty answer counts as "no"
        # rather than raising IndexError.
        if response[:1].lower() != 'y':
            print('Thank you for using our MOVIE RECOMMENDATIONS WIDGET!')
            break

In [None]:
# Start the interactive prompt when this is executed directly (script or
# notebook cell), but not when the code is imported as a module.
if __name__ == "__main__":
    main()