In [51]:
import pandas as pd
import numpy as np

In [52]:
movies_only = pd.read_csv('data/movies_only.csv')

In [53]:
movies_only.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney"
1,Sabrina,Comedy|Romance,Billy Wilder,"Humphrey Bogart, Audrey Hepburn, William Holde..."
2,Mortal Kombat,Action|Adventure|Fantasy,Paul W.S. Anderson,"Christopher Lambert, Robin Shou, Linden Ashby,..."
3,Hackers,Action|Adventure|Crime|Thriller,Iain Softley,"Jonny Lee Miller, Angelina Jolie, Jesse Bradfo..."
4,Waterworld,Action|Adventure|Sci-Fi,Kevin Reynolds,"Kevin Costner, Chaim Jeraffi, Rick Aviles, R.D..."


In [54]:
movies_alt = movies_only.copy()

In [36]:
# remove spaces from actors and director names

def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
            

In [55]:
credits = ['Actors', 'Director']

for credit in credits:
    movies_alt[credit] = movies_alt[credit].apply(clean_data)

In [56]:
# remove | from genres

movies_alt['genres'] = movies_alt['genres'].str.replace("|", " ")

In [57]:
movies_alt.head()

Unnamed: 0,Title,genres,Director,Actors
0,Toy Story,Adventure Animation Children Comedy Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney"
1,Sabrina,Comedy Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal..."
2,Mortal Kombat,Action Adventure Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-..."
3,Hackers,Action Adventure Crime Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat..."
4,Waterworld,Action Adventure Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call"


In [58]:
#combine all data 

def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])

In [59]:
movies_alt['bag_of_words'] = movies_alt.apply(create_soup, axis=1)

In [21]:
# movies_alt.set_index('Title', inplace=True)

In [60]:
movies_alt.head()

Unnamed: 0,Title,genres,Director,Actors,bag_of_words
0,Toy Story,Adventure Animation Children Comedy Fantasy,johnlasseter,"tomhanks,timallen,donrickles,jimvarney",tomhanks timallen donrickles jimvarney johnlas...
1,Sabrina,Comedy Romance,billywilder,"humphreybogart,audreyhepburn,williamholden,wal...",humphreybogart audreyhepburn williamholden wal...
2,Mortal Kombat,Action Adventure Fantasy,paulw.s.anderson,"christopherlambert,robinshou,lindenashby,cary-...",christopherlambert robinshou lindenashby cary-...
3,Hackers,Action Adventure Crime Thriller,iainsoftley,"jonnyleemiller,angelinajolie,jessebradford,mat...",jonnyleemiller angelinajolie jessebradford mat...
4,Waterworld,Action Adventure Sci-Fi,kevinreynolds,"kevincostner,chaimjeraffi,rickaviles,r.d.call",kevincostner chaimjeraffi rickaviles r.d.call ...


In [61]:
movies_bow = movies_alt[['Title', 'bag_of_words']]

# movies_bow.set_index('Title', inplace=True)

In [62]:
# Import sklearn models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])


# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [64]:
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_bow['Title'].iloc[movie_indices]

In [65]:
get_recommendations('Rebels of the Neon God')

4233                         Vive L'Amour
2926               What Time Is It There?
4188                  Goodbye, Dragon Inn
6876                      Secret Sunshine
4243                      One Night Stand
8605                  Teddy Bears' Picnic
1544                          Snowpiercer
310                        Street Fighter
227     Final Fantasy: The Spirits Within
1597           Sympathy for Mr. Vengeance
Name: Title, dtype: object

In [49]:
movies_bow['Title']

0                                               Toy Story
1                                                 Sabrina
2                                           Mortal Kombat
3                                                 Hackers
4                                              Waterworld
5                                   Beverly Hills Cop III
6                                               Tombstone
7                                      Courage Under Fire
8                                                  Ransom
9                                                 Tin Cup
10                                          Groundhog Day
11                                             Unforgiven
12                                                Dracula
13                                              Cape Fear
14                               Night Falls on Manhattan
15                        Beavis and Butt-Head Do America
16                                               Anaconda
17            