In [107]:
#Import the libraries
import requests
from google.colab import files
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [108]:
#initiate data storage
# One list per scraped field; the scraping loop appends one entry per movie
# to each of them, so they stay index-aligned.
titles, years, time = [], [], []
imdb_ratings, metascores, votes = [], [], []
us_gross, genres = [], []
director, cast = [], []

# Sequential ids 0..949.
# NOTE(review): the length is hard-coded, so it must equal the number of
# movies actually scraped or building the DataFrame will fail - confirm.
movie_id = list(range(950))

# start: value substituted into the `start=` query parameter of the first request.
# NOTE(review): starting at 51 presumably skips the first page of results -
# confirm this is intentional.
# count: iteration counter consumed by the scraping loop.
start, count = 51, 1


In [109]:
#Scraping the data
# Fetch one search-result page per iteration and append one entry per movie to
# each of the module-level storage lists. `count` and `start` are initialised
# in an earlier cell; `start` advances by 50 after every page.
while count < 20:
  url = "https://www.imdb.com/search/title/?groups=top_1000&start={}&ref_=adv_prv".format(start)

  headers = {"Accept-Language": "en-US, en;q=0.5"}
  # The timeout keeps a stalled connection from hanging the cell indefinitely;
  # raise_for_status() stops on an HTTP error instead of silently parsing an
  # error page that contains no movies.
  results = requests.get(url, headers=headers, timeout=30)
  results.raise_for_status()

  soup = BeautifulSoup(results.text, "html.parser")

  movie_div = soup.find_all('div', class_='lister-item mode-advanced')

  #our loop through each container
  for container in movie_div:
    # Every field is extracted first and appended only at the end, so a
    # parsing failure part-way through a container cannot leave the parallel
    # lists with different lengths.

    #name
    name = container.h3.a.text

    #year
    year = container.h3.find('span', class_='lister-item-year').text

    # runtime ('-' when the tag is missing or empty; find() returns None for
    # a missing tag, so it must be checked before reading .text)
    runtime_tag = container.p.find('span', class_='runtime')
    runtime = runtime_tag.text if runtime_tag is not None and runtime_tag.text else '-'

    #IMDb rating
    imdb = float(container.strong.text)

    #director
    # NOTE(review): the director and cast are picked by fixed positions among
    # all <a> tags of the container - confirm these indexes against the
    # current page markup.
    dum = container.find_all('a')
    director_name = dum[13].text

    #cast
    stri = " ,".join(dum[i].text for i in range(14, 18))

    #genre ('-' when the tag is missing or empty)
    genre_tag = container.p.find('span', class_='genre')
    genre = genre_tag.text if genre_tag is not None and genre_tag.text else '-'

    #metascore ('-' when the movie has none)
    metascore_tag = container.find('span', class_='metascore')
    m_score = metascore_tag.text if metascore_tag is not None else '-'

    #there are two NV containers, grab both of them as they hold both the votes and the grosses
    nv = container.find_all('span', attrs={'name': 'nv'})

    #filter nv for votes ('-' when no nv span is present)
    vote = nv[0].text if nv else '-'

    #filter nv for gross ('-' when only the votes span is present)
    grosses = nv[1].text if len(nv) > 1 else '-'

    titles.append(name)
    years.append(year)
    time.append(runtime)
    imdb_ratings.append(imdb)
    director.append(director_name)
    cast.append(stri)
    genres.append(genre)
    metascores.append(m_score)
    votes.append(vote)
    us_gross.append(grosses)
  count+=1
  start += 50



In [110]:
#pandas dataframe
# Assemble the scraped lists into a single table, one column per list.
# All lists (including movie_id) must have the same length.
movies = pd.DataFrame(dict(
  movie=titles,
  year=years,
  director=director,
  genre=genres,
  cast=cast,
  timeMin=time,
  imdb=imdb_ratings,
  metascore=metascores,
  votes=votes,
  us_gross=us_gross,
  movie_id=movie_id,
))


In [111]:
#write the dataframe to a csv file named 'movies.csv' in the working directory
movies.to_csv(path_or_buf='movies.csv')

In [112]:
#clean the data
# Remove the newline characters from the scraped genre text.
movies['genre'] = movies['genre'].str.replace('\n','')
# Optional download step, left disabled: files.download('movies.csv')
movies.iloc[:5]

Unnamed: 0,movie,year,director,genre,cast,timeMin,imdb,metascore,votes,us_gross,movie_id
0,Gone Girl,(2014),David Fincher,"Drama, Mystery, Thriller","Ben Affleck ,Rosamund Pike ,Neil Patrick Harri...",149 min,8.1,79,834408,$167.77M,0
1,Zodiac,(2007),David Fincher,"Crime, Drama, Mystery","Jake Gyllenhaal ,Robert Downey Jr. ,Mark Ruffa...",157 min,7.7,78,444849,$33.08M,1
2,Blade Runner 2049,(2017),Denis Villeneuve,"Action, Drama, Mystery","Harrison Ford ,Ryan Gosling ,Ana de Armas ,Dav...",164 min,8.0,81,441516,$92.05M,2
3,Nightcrawler,(2014),Dan Gilroy,"Crime, Drama, Thriller","Jake Gyllenhaal ,Rene Russo ,Bill Paxton ,Riz ...",117 min,7.9,76,449851,$32.38M,3
4,Portrait of a Lady on Fire,(2019),Céline Sciamma,"Drama, Romance","Noémie Merlant ,Adèle Haenel ,Luàna Bajrami ,V...",122 min,8.1,95,50443,-,4


In [113]:
#Columns whose text feeds the recommendation engine
columns = ['movie','director','cast','genre']
# Preview the first three rows of just those columns.
movies[columns].iloc[:3]

Unnamed: 0,movie,director,cast,genre
0,Gone Girl,David Fincher,"Ben Affleck ,Rosamund Pike ,Neil Patrick Harri...","Drama, Mystery, Thriller"
1,Zodiac,David Fincher,"Jake Gyllenhaal ,Robert Downey Jr. ,Mark Ruffa...","Crime, Drama, Mystery"
2,Blade Runner 2049,Denis Villeneuve,"Harrison Ford ,Ryan Gosling ,Ana de Armas ,Dav...","Action, Drama, Mystery"


In [114]:
#True if any cell in the important columns is null
# Note: the scraper stores '-' for a missing genre, which this null check
# does not detect.
movies[columns].isna().to_numpy().any()

False

In [115]:
#Create a function to combine the columns of the important columns into a single string
def get_important_features(data):
  """Build one combined text string per row of ``data``.

  Each string is the row's ``cast``, ``director``, ``genre`` and ``movie``
  values joined by single spaces, in that order.

  Rows are walked by position rather than looked up by the labels 0..n-1,
  so the function also works when the index is not a default RangeIndex
  (for example after filtering rows).

  Parameters:
    data: DataFrame with string columns 'cast', 'director', 'genre', 'movie'.

  Returns:
    list of str, one entry per row, in row order.

  Raises:
    KeyError: if one of the four columns is missing.
    TypeError: if a value is not a string (for example NaN).
  """
  return [
    cast + ' ' + director + ' ' + genre + ' ' + movie
    for cast, director, genre, movie in zip(
      data['cast'], data['director'], data['genre'], data['movie']
    )
  ]

In [116]:
#Store the combined per-row text in a new 'important features' column
movies['important features'] = get_important_features(data=movies)

#preview the first three rows
movies.iloc[:3]

Unnamed: 0,movie,year,director,genre,cast,timeMin,imdb,metascore,votes,us_gross,movie_id,important features
0,Gone Girl,(2014),David Fincher,"Drama, Mystery, Thriller","Ben Affleck ,Rosamund Pike ,Neil Patrick Harri...",149 min,8.1,79,834408,$167.77M,0,"Ben Affleck ,Rosamund Pike ,Neil Patrick Harri..."
1,Zodiac,(2007),David Fincher,"Crime, Drama, Mystery","Jake Gyllenhaal ,Robert Downey Jr. ,Mark Ruffa...",157 min,7.7,78,444849,$33.08M,1,"Jake Gyllenhaal ,Robert Downey Jr. ,Mark Ruffa..."
2,Blade Runner 2049,(2017),Denis Villeneuve,"Action, Drama, Mystery","Harrison Ford ,Ryan Gosling ,Ana de Armas ,Dav...",164 min,8.0,81,441516,$92.05M,2,"Harrison Ford ,Ryan Gosling ,Ana de Armas ,Dav..."


In [117]:
#Turn the combined text into a matrix of token counts (one row per movie)
cm = CountVectorizer().fit_transform(raw_documents=movies['important features'])

In [118]:
#Pairwise cosine similarity between every pair of rows of the count matrix
cs = cosine_similarity(X=cm)

#show the similarity matrix
print(cs)

[[1.         0.25819889 0.12126781 ... 0.         0.         0.125     ]
 [0.25819889 1.         0.12524486 ... 0.         0.         0.06454972]
 [0.12126781 0.12524486 1.         ... 0.         0.         0.12126781]
 ...
 [0.         0.         0.         ... 1.         0.05263158 0.        ]
 [0.         0.         0.         ... 0.05263158 1.         0.        ]
 [0.125      0.06454972 0.12126781 ... 0.         0.         1.        ]]


In [119]:
#Title of the movie the user likes
title = 'Nightcrawler'

# movie_id of the first row whose 'movie' value equals the title exactly;
# raises IndexError when no row matches.
mov_id = movies.loc[movies['movie'] == title, 'movie_id'].iloc[0]

In [120]:
#Pair every similarity score in the chosen movie's row with its column position
scores = [(position, similarity) for position, similarity in enumerate(cs[mov_id])]

In [121]:
#sort the list
# Rank every other movie by similarity, highest first. The chosen movie is
# removed by its own index instead of by dropping the first sorted entry:
# sorted() is stable, so a movie with a tied top score and a lower index
# would otherwise be dropped in its place while the chosen movie stayed in
# the list of recommendations.
sorted_scores = sorted(
  (entry for entry in scores if entry[0] != mov_id),
  key=lambda entry: entry[1],
  reverse=True,
)

In [122]:
#Print the sorted scores
# Each entry is an (index, similarity score) tuple; the list is ordered from
# the highest score to the lowest. This prints the entire list.
print(sorted_scores)

[(19, 0.28571428571428575), (1, 0.2760262237369417), (383, 0.2672612419124244), (528, 0.25928148942086576), (938, 0.24174688920761409), (225, 0.2142857142857143), (229, 0.2142857142857143), (782, 0.2142857142857143), (18, 0.20701966780270625), (21, 0.20701966780270625), (47, 0.20701966780270625), (155, 0.20701966780270625), (226, 0.20701966780270625), (350, 0.20701966780270625), (355, 0.20701966780270625), (432, 0.20701966780270625), (538, 0.20701966780270625), (797, 0.20701966780270625), (850, 0.20701966780270625), (108, 0.2004459314343183), (142, 0.2004459314343183), (162, 0.2004459314343183), (189, 0.2004459314343183), (347, 0.2004459314343183), (376, 0.2004459314343183), (443, 0.2004459314343183), (503, 0.2004459314343183), (586, 0.2004459314343183), (802, 0.2004459314343183), (895, 0.2004459314343183), (279, 0.19446111706564934), (473, 0.19446111706564934), (553, 0.19446111706564934), (560, 0.19446111706564934), (662, 0.18898223650461363), (543, 0.17928429140015906), (724, 0.17496

In [123]:
#First 5 recommendations
# sorted_scores is ordered from highest to lowest score, so its first five
# entries are the recommendations; each entry's first element is matched
# against the movie_id column to recover the title.
print('The 5 most recommended movies to '+title+' are : ')
for rank, (candidate_id, _similarity) in enumerate(sorted_scores[:5], start=1):
  movie_title = movies.loc[movies['movie_id'] == candidate_id, 'movie'].iloc[0]
  print(rank, movie_title)

The 5 most recommended movies to Nightcrawler are : 
1 Prisoners
2 Zodiac
3 End of Watch
4 Blood Simple
5 Drishyam
