In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import sqlite3
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt, seaborn as sns

from IPython.display import display

%matplotlib inline

In [2]:
moviedf = pd.read_csv('movies.csv')
ratingdf = pd.read_csv('ratings.csv')

In [3]:
moviedf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
moviedf.shape

(9125, 3)

In [5]:
ratingdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
ratingdf.drop(columns = 'timestamp', axis = 1, inplace = True)
#moviedf.drop(columns = 'genres', axis = 1, inplace = True)

In [7]:
df = pd.merge(moviedf, ratingdf, on = 'movieId')

In [8]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0


In [9]:
genres = ['Crime', 'Drama', 'Adventure', 'Animation', 'Fantasy', 'Children', 'Horror', 'Romance',
          'War', 'Thriller', 'Sci-Fi', 'Action', 'Mystery', 'Musical', 'Documentary']

In [10]:
df['genres'].unique().sum()

'Adventure|Animation|Children|Comedy|FantasyAdventure|Children|FantasyComedy|RomanceComedy|Drama|RomanceComedyAction|Crime|ThrillerAdventure|ChildrenActionAction|Adventure|ThrillerComedy|HorrorAdventure|Animation|ChildrenDramaAction|Adventure|RomanceCrime|DramaDrama|RomanceAction|Comedy|Crime|Drama|ThrillerComedy|Crime|ThrillerCrime|Drama|Horror|Mystery|ThrillerDrama|Sci-FiChildren|DramaAdventure|Drama|Fantasy|Mystery|Sci-FiMystery|Sci-Fi|ThrillerDocumentary|IMAXChildren|ComedyDrama|WarAction|Crime|DramaAction|Adventure|FantasyComedy|Drama|ThrillerMystery|ThrillerAnimation|Children|Drama|Musical|RomanceCrime|Mystery|ThrillerAdventure|DramaDrama|MysteryDrama|ThrillerComedy|CrimeAction|Sci-Fi|ThrillerAction|Comedy|Horror|ThrillerComedy|DramaDocumentaryAction|Crime|Drama|ThrillerCrime|Drama|RomanceAction|Adventure|DramaAction|ThrillerDrama|Horror|ThrillerComedy|Horror|RomanceAdventure|Comedy|Crime|RomanceAdventure|Children|Comedy|MusicalAction|Drama|WarCrime|Drama|ThrillerAction|Adventure

In [11]:
genre_count = moviedf['genres'].value_counts()

In [12]:
genre_list =[]

In [13]:
# for i in genre_count:
#     if genres.contains(i):
#         genres.append(genre_list)
#     else:
#         print('no')

In [14]:
genre_list

[]

In [15]:
(moviedf['genres']=="Thriller").count()

9125

In [16]:
df.tail()

Unnamed: 0,movieId,title,genres,userId,rating
99999,161944,The Last Brickmaker in America (2001),Drama,287,5.0
100000,162376,Stranger Things,Drama,73,4.5
100001,162542,Rustom (2016),Romance|Thriller,611,5.0
100002,162672,Mohenjo Daro (2016),Adventure|Drama|Romance,611,3.0
100003,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary,547,5.0


In [17]:
df.shape

(100004, 5)

In [18]:
df['rating'].mean()

3.543608255669773

In [19]:
df.rating.mean()

3.543608255669773

In [20]:
pivot = pd.pivot_table(df, index = 'userId', columns='title', values= 'rating')

In [21]:
pivot.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [22]:
sparse_pivot = sparse.csr_matrix(pivot.T.fillna(0))

In [23]:
sparse_pivot

<9064x671 sparse matrix of type '<class 'numpy.float64'>'
	with 100003 stored elements in Compressed Sparse Row format>

In [24]:
distances = pairwise_distances(sparse_pivot, metric='cosine')

In [25]:
distance_df = pd.DataFrame(distances, index=pivot.columns, columns=pivot.columns)

In [26]:
distance_df.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,1.0,1.0,0.835601,0.979609,1.0,0.985954,1.0,1.0,0.996834,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
$9.99 (2008),1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.920526,1.0,0.84367,...,1.0,1.0,1.0,1.0,1.0,0.986101,1.0,0.941782,1.0,1.0
'Hellboy': The Seeds of Creation (2004),1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.782643,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Neath the Arizona Skies (1934),0.835601,1.0,1.0,0.0,0.875965,1.0,0.914564,1.0,1.0,0.980741,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Round Midnight (1986),0.979609,1.0,1.0,0.875965,0.0,1.0,0.989403,0.856214,1.0,0.863837,...,1.0,1.0,1.0,0.878433,1.0,1.0,1.0,1.0,1.0,1.0


# Selecting Two Movies to Work With

In [27]:
titles = []
for i in distance_df.columns:
    titles.append(i)

In [28]:
distance = []
for i in distance_df['Jurassic Park (1993)']:
    distance.append(i)

In [29]:
distance2 = []
for i in distance_df['Interstellar (2014)']:
    distance2.append(i)

In [30]:
df_work = pd.DataFrame(data = titles, columns = ['Jurassic Park (1993)'])

In [31]:
df_work['Interstellar (2014)'] = titles

In [32]:
df_work['JP Distance'] = distance
df_work['Int. Distance'] = distance2
df_work.head()

Unnamed: 0,Jurassic Park (1993),Interstellar (2014),JP Distance,Int. Distance
0,"""Great Performances"" Cats (1998)","""Great Performances"" Cats (1998)",1.0,1.0
1,$9.99 (2008),$9.99 (2008),0.985535,1.0
2,'Hellboy': The Seeds of Creation (2004),'Hellboy': The Seeds of Creation (2004),0.920876,1.0
3,'Neath the Arizona Skies (1934),'Neath the Arizona Skies (1934),1.0,1.0
4,'Round Midnight (1986),'Round Midnight (1986),0.952892,0.89253


In [33]:
df_work['Average Distance'] = (df_work['JP Distance'] + df_work['Int. Distance'])/2

In [34]:
df_work.head()

Unnamed: 0,Jurassic Park (1993),Interstellar (2014),JP Distance,Int. Distance,Average Distance
0,"""Great Performances"" Cats (1998)","""Great Performances"" Cats (1998)",1.0,1.0,1.0
1,$9.99 (2008),$9.99 (2008),0.985535,1.0,0.992767
2,'Hellboy': The Seeds of Creation (2004),'Hellboy': The Seeds of Creation (2004),0.920876,1.0,0.960438
3,'Neath the Arizona Skies (1934),'Neath the Arizona Skies (1934),1.0,1.0,1.0
4,'Round Midnight (1986),'Round Midnight (1986),0.952892,0.89253,0.922711


In [35]:
df_work['Average Distance Sq'] = ((df_work['JP Distance']**2) + (df_work['Int. Distance']**2))/2

In [36]:
df_work.head()

Unnamed: 0,Jurassic Park (1993),Interstellar (2014),JP Distance,Int. Distance,Average Distance,Average Distance Sq
0,"""Great Performances"" Cats (1998)","""Great Performances"" Cats (1998)",1.0,1.0,1.0,1.0
1,$9.99 (2008),$9.99 (2008),0.985535,1.0,0.992767,0.985639
2,'Hellboy': The Seeds of Creation (2004),'Hellboy': The Seeds of Creation (2004),0.920876,1.0,0.960438,0.924007
3,'Neath the Arizona Skies (1934),'Neath the Arizona Skies (1934),1.0,1.0,1.0,1.0
4,'Round Midnight (1986),'Round Midnight (1986),0.952892,0.89253,0.922711,0.852307


In [37]:
m = df_work.groupby(['Jurassic Park (1993)', 'Interstellar (2014)'])[['JP Distance','Int. Distance']].apply(np.median)
m.name = 'Median'

df_work.join(m, on=['Jurassic Park (1993)', 'Interstellar (2014)'])
# The median turns out to be exactly the same as the Average Distance

df_work.head()

Unnamed: 0,Jurassic Park (1993),Interstellar (2014),JP Distance,Int. Distance,Average Distance,Average Distance Sq
0,"""Great Performances"" Cats (1998)","""Great Performances"" Cats (1998)",1.0,1.0,1.0,1.0
1,$9.99 (2008),$9.99 (2008),0.985535,1.0,0.992767,0.985639
2,'Hellboy': The Seeds of Creation (2004),'Hellboy': The Seeds of Creation (2004),0.920876,1.0,0.960438,0.924007
3,'Neath the Arizona Skies (1934),'Neath the Arizona Skies (1934),1.0,1.0,1.0,1.0
4,'Round Midnight (1986),'Round Midnight (1986),0.952892,0.89253,0.922711,0.852307


# Writing the Function that will create this data frame

In [38]:
def movie_generator(movie1, movie2):
    
    titles = []
    for i in distance_df.columns:
        titles.append(i)
    
    distance = []
    for i in distance_df[movie1]:
        distance.append(i)
    
    distance2 = []
    for i in distance_df[movie2]:
        distance2.append(i)

    df_generator = pd.DataFrame(data = titles, columns = [movie1])
    df_generator[movie2] = titles

    df_generator[movie1 + ' Distance'] = distance
    df_generator[movie2 + ' Distance'] = distance2

    df_generator['Average Distance'] = (df_generator[movie1 + ' Distance'] + df_generator[movie2 + ' Distance'])/2
    df_generator['Average Distance Sq'] = ((df_generator[movie1 + ' Distance']**2) + (df_generator[movie2 + ' Distance']**2)/2)

    return df_generator.sort_values(by = ['Average Distance Sq'], ascending=True).iloc[2:7, :]

In [39]:
movie_generator('Twilight (2008)', "Interstellar (2014)")

Unnamed: 0,Twilight (2008),Interstellar (2014),Twilight (2008) Distance,Interstellar (2014) Distance,Average Distance,Average Distance Sq
4141,It Follows (2014),It Follows (2014),0.573344,0.732375,0.652859,0.59691
5575,National Treasure: Book of Secrets (2007),National Treasure: Book of Secrets (2007),0.576753,0.775018,0.675886,0.632971
8013,The Spectacular Now (2013),The Spectacular Now (2013),0.557893,0.820002,0.688947,0.647446
3693,"Holiday, The (2006)","Holiday, The (2006)",0.543718,0.842735,0.693227,0.650731
6380,"Proposition, The (2005)","Proposition, The (2005)",0.577987,0.822484,0.700235,0.672309


In [40]:
def movie_generator(movie1, movie2):
    
    
#     search = movie1
#     for title in moviedf.loc[moviedf['title'].str.contains(search), 'title'].values:
#         print(title)
#         print('Average', pivot[title].mean())
#         print('Number of ratings', pivot[title].count())
#         print('')
#         print('')
    
    
    
    titles = []
    for i in distance_df.columns:
        titles.append(i)
    
    distance = []
    for i in distance_df[movie1]:
        distance.append(i)
    
    distance2 = []
    for i in distance_df[movie2]:
        distance2.append(i)

    df_generator = pd.DataFrame(data = titles, columns = [movie1])
    df_generator[movie2] = titles

    df_generator[movie1 + 'Distance'] = distance
    df_generator[movie2 + 'Distance'] = distance2

    df_generator['Average Distance'] = (df_generator[movie1 + 'Distance'] + df_generator[movie2 + 'Distance'])/2
    df_generator['Average Similarity'] = ((1-df_generator[movie1 + 'Distance']) + (1-df_generator[movie2 + 'Distance'])/2)
    df_generator['Weight'] = 1 - (np.abs(df_generator[movie1 + 'Distance']-df_generator[movie2 + 'Distance']))
    df_generator['Score'] = df_generator['Weight'] * df_generator['Average Distance']
    #df_generator[movie1 + ' and ' + movie2] = df_generator.sort_values(by = ['Score'], ascending=True).iloc[2:12, 0:2]
    return df_generator.sort_values(by = ['Score'], ascending=True).iloc[2:7, 0:2]

In [41]:
# My movie DF is from the years 1915 - 2016 inclusive

In [43]:
search = 'Braveheart'
for title in moviedf.loc[moviedf['title'].str.contains(search), 'title'].values:
    print(title)

Braveheart (1995)


In [44]:
movie_generator('Titanic (1997)', 'Braveheart (1995)')

Unnamed: 0,Titanic (1997),Braveheart (1995)
2933,Forrest Gump (1994),Forrest Gump (1994)
7856,Terminator 2: Judgment Day (1991),Terminator 2: Judgment Day (1991)
8332,True Lies (1994),True Lies (1994)
1968,Dances with Wolves (1990),Dances with Wolves (1990)
4307,Jurassic Park (1993),Jurassic Park (1993)


In [None]:
Hit_or_Not = {'Person A pick': 'Exorcist, The (1973)',
              'Person B pick': 'Die Hard (1988)',
              'Person A': 'yes',
              'Person B': 'yes',
             'Person A pick': 'Interstellar (2014)',
              'Person B pick': 'Never Been Kissed (1999)',
              'Person A': 'yes',
              'Person B': 'no',
             'Person A pick': 'Bad Boys (1995)',
              'Person B pick': 'Clerks II (2006)',
              'Person A': 'yes',
              'Person B': 'no',
             'Person A pick': 'Clueless (1995)',
              'Person B pick': 'Harold and Maude (1971)',
              'Person A': 'yes',
              'Person B': 'yes',
             'Person A pick': 'Fight Club (1999)',
              'Person B pick': 'Moulin Rouge (2001)',
              'Person A': 'yes',
              'Person B': 'no',
             'Person A pick': 'Field of Dreams (1989)',
              'Person B pick': 'Bio-Dome (1996)',
              'Person A': 'no',
              'Person B': 'no',
             'Person A pick': 'Shining, The (1980)',
              'Person B pick': 'Casablanca (1942)',
              'Person A': 'yes',
              'Person B': 'yes',
             'Person A pick': 'Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (2002)',
              'Person B pick': 'Ratatouille (2007)',
              'Person A': 'no',
              'Person B': 'no',
             'Person A pick': 'Gladiator (2000)',
              'Person B pick': 'Interstellar (2014)',
              'Person A': 'yes',
              'Person B': 'yes',
             'Person A pick': 'Predator (1987)',
              'Person B pick': 'Team America: World Police (2004)',
              'Person A': 'yes',
              'Person B': 'yes',
             }