In [1]:
from bs4 import BeautifulSoup as bs 
from requests import get
import pandas as pd

import time
import random

from datetime import datetime

import matplotlib.dates as mdates
import numpy as np 
import matplotlib.pyplot as plt

import csv

# Scrape Movie Reviews

In [2]:
# Fetch the IMDb top-english-movies chart page. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
response = get(
    "https://www.imdb.com/chart/top-english-movies?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=4da9d9a5-d299-43f2-9c53-f0efa18182cd&pf_rd_r=EF4GB4YES4GGPNV86W4A&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_4",
    timeout=30,
)

In [3]:
# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page in the
# cells below, then show the status code as before.
response.raise_for_status()
print(response.status_code)

200


In [4]:
# Parse the downloaded chart page with the lxml parser.
html_soup = bs(markup=response.text, features='lxml')

In [5]:
# Locate the chart table, then the 'lister-list' <tbody> nested inside it.
movie_table = html_soup.find('table', attrs={'class': 'chart full-width'})
movie_list = movie_table.find('tbody', attrs={'class': 'lister-list'})

In [6]:
# find all a-tags with class:None
anchor_tags = movie_list.find_all('a', attrs={'class': None})

# filter the a-tags to get just the title links.
# .get() tolerates anchors that have no href (attrs['href'] would raise
# KeyError), and `and` is the logical operator (the bitwise `&` only worked
# because both operands happened to be booleans).
movie_tags = [href for href in (tag.get('href', '') for tag in anchor_tags)
              if href.startswith('/title') and href.endswith('/')]

# remove duplicate links while keeping their first-seen order
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[50:60]

There are a total of 250 movie titles
Displaying 10 titles


['/title/tt0051201/',
 '/title/tt0043014/',
 '/title/tt0910970/',
 '/title/tt0081505/',
 '/title/tt4633694/',
 '/title/tt0057012/',
 '/title/tt1345836/',
 '/title/tt0087843/',
 '/title/tt4154796/',
 '/title/tt2380307/']

In [7]:
# User-review URL of every title: site root + title path + reviews suffix.
base_url = "https://www.imdb.com"
movie_links = ["{}{}reviews?ref_=tt_ov_rt".format(base_url, title_path)
               for title_path in movie_tags]
print("There are a total of {} movie user reviews".format(len(movie_links)))
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt0111161/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0068646/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0071562/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0468569/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0050083/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0108052/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0167260/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0110912/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0120737/reviews?ref_=tt_ov_rt',
 'https://www.imdb.com/title/tt0137523/reviews?ref_=tt_ov_rt']

In [9]:
def _collect_page_reviews(page_soup):
    """Return (rating, date, text) string triples for each complete review on one page."""
    found = []
    for review in page_soup.find_all(
            'div', class_='lister-item mode-detail imdb-user-review collapsable'):
        try:
            rating_text = review.find('span', class_='rating-other-user-rating').text.strip()
            date_text = review.find('span', class_='review-date').text.strip()
            body_text = review.find('div', class_=['text', 'show-more__control clickable']).text.strip()
        except AttributeError:
            # find() returned None: the review lacks a rating, a date or a body.
            # Skip it so the three output columns stay aligned.
            continue
        found.append((rating_text, date_text, body_text))
    return found


def _load_more_attrs(page_soup):
    """Return (data-key, data-ajaxurl) of the page's '.load-more-data' element, or (None, None)."""
    load_more = page_soup.select('.load-more-data')
    if not load_more:
        return None, None
    return load_more[0].get('data-key'), load_more[0].get('data-ajaxurl')


def scrape_spirited_reviews(spirited_url, timeout=30):
    """Scrape all user reviews reachable from a review page and save them to a CSV file.

    The landing page is parsed first, then every further page is fetched by
    following the pagination key found in the '.load-more-data' element until
    no key is left. Reviews missing a rating, a date or a text body are skipped.

    Args:
        spirited_url: URL of the user-review page. The fifth '/'-separated piece
            of the URL is used as the output file name (plus '.csv').
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        None. Writes '<name>.csv' in the current working directory with one row
        per review (date, integer rating, review text) and prints progress.

    Raises:
        ValueError: if a rating does not start with an integer before '/' or a
            date is not in '%d %B %Y' format.
    """
    # Access the main website for reviews
    first_page = bs(get(spirited_url, timeout=timeout).content, 'html.parser')

    # The landing page already contains the first batch of reviews; collect
    # them too, otherwise they are lost (only later pages come via the AJAX url).
    scraped = _collect_page_reviews(first_page)

    # A page without a 'load more' element (few reviews) simply has no further pages.
    datakey, ajax_link = _load_more_attrs(first_page)
    if datakey and ajax_link:
        print('Scrapping user review page. Utilizing original datakey: ' + datakey)

    while datakey and ajax_link:
        # Combine the url to create the link to the next page of reviews
        combine_url = 'http://www.imdb.com/' + ajax_link + '?paginationKey=' + datakey
        next_page = bs(get(combine_url, timeout=timeout).content, 'html.parser')
        scraped.extend(_collect_page_reviews(next_page))

        # Only the key changes from page to page; the ajax url from the first page is reused.
        datakey, _ = _load_more_attrs(next_page)
        if datakey:
            # Wait a random 1 to 3 seconds before requesting the next page
            time.sleep(random.randint(1, 3))
            print('\nScrapping next review page. Utilizing next datakey: ' + datakey)

    # Keep only the numerator of each 'N/10' rating, as an integer
    edited_ratings_list = [int(rating.split('/')[0]) for rating, _, _ in scraped]
    # Convert each date from a string to a datetime
    datetime_list = [datetime.strptime(date, '%d %B %Y') for _, date, _ in scraped]
    reviews_list = [text for _, _, text in scraped]

    # Export the data to a csv file. newline='' is what the csv module expects
    # (avoids blank rows on Windows); utf-8 keeps non-ASCII review text writable
    # regardless of the platform's default encoding.
    file_name = spirited_url.split('/')[4] + '.csv'
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # Header row. NOTE(review): 'Data of Review' looks like a typo for
        # 'Date of Review'; left unchanged because it is the on-disk column name.
        writer.writerow(['Data of Review', 'Rating (Out of 10)', 'Review'])

        # zip lines up each date with its rating and review text
        writer.writerows(zip(datetime_list, edited_ratings_list, reviews_list))

    print('\nScrapping Completed.')
    print('\nTotal Data Points: ')
    print(len(edited_ratings_list))

In [15]:
# Scrape the reviews of the first 100 movies. Slicing cannot raise IndexError
# when fewer than 100 links were collected, unlike indexing over range(100).
for review_url in movie_links[:100]:
    scrape_spirited_reviews(review_url)

Scrapping user review page. Utilizing original datakey: g4wp7cbiry2dgyyl72xx5nrwqlumwcb5y4hhzo5ziwr26fbyhvrl4ty4oiyvlnbidfr5dtvw4voloefkzvnarimywuglszq3

Scrapping next review page. Utilizing next datakey: g4wp7dzjry5tiyqc7ovhvmztrts42arhzfmxvlnomwklyczuf43o6ss7o42vtpzkd54k4marawpxpj2gqnfdy2njqopxcuy

Scrapping next review page. Utilizing next datakey: g4wp7drfr45t4zak7guhzmzzrpt4sbrhzfmxvlnomwklyczuf43o6ss6o44f7njodz4k5umbi5rz3u6uf66rur2mp6tdoyq

Scrapping next review page. Utilizing next datakey: g4wp7drjqy3de3yk7wxhhmjxqls4ubrhzfmxvlnomwklyczuf43o6ssyoqyfvmjlcr4k4macfunr3bzsjtpzcoiz3ncz57a

Scrapping next review page. Utilizing next datakey: g4wp7drpqqzdczqc7wvxhnjvqpu4mbrhzfmxvlnomwklyczuf43o6sszo44vzpzldf4k5i36bftt3gy6ij7ynvsqbpfurxy

Scrapping next review page. Utilizing next datakey: g4wp7drnqmydmyig7kwhtnbtr7u4yaz5y4hhzo5ziwr26fbyhvrl4ty4oe4f7mzcdfrndtwueiuoupcpgiq53mb7pgl54nfv

Scrapping next review page. Utilizing next datakey: g4wp7drmqe2dczyd72vx3ojrrds4wajyy4hhzo5ziwr26fby


Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oizvtnrndvwndts5ys5rtiz7he4c2prhqvrdg6eg

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oezftmzcdnv5dtxxh7ssefvvt6t4kympbiiydpfx

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oa2v7nbddrv5dtvzbe6bxclgloa7xu6lrodvx5al

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oa2v7nbndnundtsfkq4yxe6p7pkvui5pbbcfznze

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4om2vtmbkdfvndttzsbeys4aza6w4o5cvcbkatzr3

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oa2v7nbddjw5dtsq2moh7zhj2d4256fsk6tiq7oo

Scrapping next review page. Utilizing next datakey: g4wp7dbpqaydkyya72wh5nrzrhr4sbb234acxpnpkkbzqbz


Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4ou3frpzndnr5dtuuec5f7px2n5qr66e4yhqlh43m

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4ou3vjnzidzxndtsrc5ybuscem77scgqg2s27cou3

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4ou3v7nzldjxndttdesysz4qbcwwxsesnv2cksche

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4ou2frprddvu5dtru2zg3kxwi7px6vw53uwcoju5s

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oq3f3prjdfv5dttqzdlzbeho3dpulvaiwahuashq

Scrapping next review page. Utilizing next datakey: g4wp7dbfqi3tizqg66uhhnzrrdt4ucbyy4hhzo5ziwr26fbyhvrl4ty4oq2vvpzpdnv5dtwlwf4kpfcste4rb3xdq6kz4nhh

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fb


Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4ou3v7mjpdjwndtxxvtb3f7tu7fvjjr76t4uevbtx

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oq2f5nzodbx5dtqqa4y3b6c5zhujdee74lzpmdac

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4omzfxnjncvrndtwrzbcam7nsbiegtuiokxrhy5la

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oi3frmridrv5dtrz52tzn2bzdncovekhon2ajmgh

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oe4f7njlcrv5dtweuyc4zus46sywvydesuikhncl

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oazvrmbodbundtvmd5uk5iinuf7bj3djzmhhbjl7

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fb


Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oi3vtmrpdzxndtuv7hafru2lo3kixr7vhxs7fxvz

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oe4f7nbmdnr5dtqtzmt7awifms2qf2qdlslyar2k

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oe2vjnjmdfu5dtxjsocrgrea4qou3d6henvremgo

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oezvxmrmdfw5dtqakclou46q57yw6e4oizsvddmk

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oeyflnbldrvndtvsbduboqapmjsuif32dtf56gx2

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oeyfzmjmdzu5dtso2vn3ej3xa4wsqwbexorrndmr

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fb


Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oe4fznzmdbundtxdl4axvsonbohj4xv57aopl2ft

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oeyf5nrndjv5dtw2deply7u54cu2qekbh5twu5ok

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oeyfzpzkdfw5dtuxdl6opavlfdiqw5u452b4jf4m

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oeyf3pzcdfx5dtsst6dvqts7vgs7mygrl3ljklad

Scrapping next review page. Utilizing next datakey: g4wp7dbkq43d4zid7wvxhobxrps44bbsy4hhzo5ziwr26fbyhvrl4ty4oeyf3mrkcrxndtrk4xfdftsucmqwo3vycuqp5q6v

Scrapping next review page. Utilizing next datakey: g4wp7dbiqm3t6yab62whtnrrrdumubjzy4hhzo5ziwr26fbyhvrl4ty4oeyf5nbodnu5dtvisjoujb55f53e4ipp5f2mhsqw

Scrapping next review page. Utilizing next datakey: g4wp7dbpqm5tkzia7kuh7ojyrdtm2bj23yacxpnpkkbzqbz


Scrapping next review page. Utilizing next datakey: g4wp7drmqe2dczyd72vx3ojrrds4wajyy4hhzo5ziwr26fbyhvrl4ty4om2ftnbmd5r5dtqsxuj3yb75t7xea427lfh7ziei

Scrapping next review page. Utilizing next datakey: g4wp7drmqa3tg3ql66xx3mrrqls42abty4hhzo5ziwr26fbyhvrl4ty4oyyfrnjcdjw5dtvykachueiw6nkm3iqwhitr7ulf

Scrapping next review page. Utilizing next datakey: g4wp7djjryydkzyd7cth3njvrpq4obbyy4hhzo5ziwr26fbyhvrl4ty4oe4vtmbidbwndtv2vc5litdqmhvdsw4h6vwucztp

Scrapping next review page. Utilizing next datakey: g4wp7djjqyzdi3qb7kwxvnrvrhs4wbzzy4hhzo5ziwr26fbyhvrl4ty4oazfzmrndjrndtr7dgg3bsvgeguomwtyrhfpnwl6

Scrapping next review page. Utilizing next datakey: g4wp7djjtiqhejcxxxgs753i36t52q343yndz4hoaxeoqp5sspgj7zcp7vo3nuie7zdh5pgz

Scrapping next review page. Utilizing next datakey: g4wp7djjtiqhejcxxxgs753i36t52q343imtx7xjb7boqp6yip5vvl3nvv4a337fkrvkw44f

Scrapping next review page. Utilizing next datakey: g4wp7djjtiqhejcxxxgs753i36t52q343mptr7hlapb6qp4a7obbwiotcrafypwxvt7ydxd3

Scrapping next review

# Scrape Movie Characteristics

In [16]:
# Main-page URL of every title: site root followed by the title path.
movie_char_link = ["{}{}".format(base_url, title_path) for title_path in movie_tags]
movie_char_link[:10]

['https://www.imdb.com/title/tt0111161/',
 'https://www.imdb.com/title/tt0068646/',
 'https://www.imdb.com/title/tt0071562/',
 'https://www.imdb.com/title/tt0468569/',
 'https://www.imdb.com/title/tt0050083/',
 'https://www.imdb.com/title/tt0108052/',
 'https://www.imdb.com/title/tt0167260/',
 'https://www.imdb.com/title/tt0110912/',
 'https://www.imdb.com/title/tt0120737/',
 'https://www.imdb.com/title/tt0137523/']

In [12]:
# Fetch and parse a single title page (tt0111161). The timeout keeps the cell
# from hanging indefinitely if the server never answers.
response = get("https://www.imdb.com/title/tt0111161/", timeout=30)
html_soup = bs(response.text, 'lxml')

In [17]:
#function that scrapes movie characteristics
def movie_char(url, timeout=30):
    """Scrape rating, title, year, length and genre from one movie's title page.

    Args:
        url: address of the title page to download.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        Tuple ``(rating, title, year, length, genre)`` of plain strings. ``genre``
        is the text of the first link inside the 'subtext' block.

    Raises:
        AttributeError: if one of the expected elements is missing from the page.
    """
    response = get(url, timeout=timeout)
    html_soup = bs(response.text, 'lxml')

    # Look up the two shared containers once instead of once per field.
    title_wrapper = html_soup.find("div", class_='title_wrapper')
    subtext = html_soup.find('div', class_="subtext")

    rating = html_soup.find("div", class_='ratingValue').strong.text
    # contents[0] is a NavigableString that keeps the whole parse tree alive;
    # str() detaches it, and strip() removes any surrounding whitespace so the
    # title is cleaned the same way as the length.
    title = str(title_wrapper.find('h1').contents[0]).strip()
    year = title_wrapper.find('span').a.text
    length = subtext.find("time").text.strip()
    genre = subtext.find('a').text

    return rating, title, year, length, genre
    
    

In [18]:
# One list per characteristic, filled in the order movie_char() returns them.
rating, title, year, length, genre = [], [], [], [], []
columns = (rating, title, year, length, genre)

# Scrape the first 100 title pages and distribute each result over the lists.
for link in movie_char_link[:100]:
    for column, value in zip(columns, movie_char(link)):
        column.append(value)

(100, 100, 100, 100, 100)

In [19]:
import pandas as pd

# Assemble the scraped characteristics into one table, keyed by the title path.
movie100 = pd.DataFrame(dict(
    id=movie_tags[:100],
    title=title,
    year=year,
    length=length,
    genre=genre,
    rating=rating,
))

In [20]:
# Reduce each '/title/tt0111161/' path to the bare id ('tt0111161').
# Extracting by pattern keeps the cell idempotent: re-running it on ids that
# are already clean leaves them unchanged, whereas split('/') followed by
# .str[2] would turn them into NaN.
movie100['id'] = movie100['id'].str.extract(r'(tt\d+)', expand=False)

In [21]:
# Preview the first five rows of the table.
movie100.head(n=5)

Unnamed: 0,id,title,year,length,genre,rating
0,tt0111161,The Shawshank Redemption,1994,2h 22min,Drama,9.3
1,tt0068646,The Godfather,1972,2h 55min,Crime,9.2
2,tt0071562,The Godfather: Part II,1974,3h 22min,Crime,9.0
3,tt0468569,The Dark Knight,2008,2h 32min,Action,9.0
4,tt0050083,12 Angry Men,1957,1h 36min,Crime,8.9


In [22]:
# Save the table without the row index: writing the index produces a nameless
# first column that reappears as 'Unnamed: 0' when the file is read back. This
# also matches the other exports in this notebook, which pass index=False.
movie100.to_csv("movie100.csv", index=False)

In [39]:
import os
import glob
import pandas as pd
#os.chdir('./movie_review_100')

In [32]:
extension = 'csv'
# Select only the per-movie review files, which the scraper names '<tt id>.csv'.
# A bare '*.csv' would also pick up movie100.csv and the combined output files
# once they exist in the same folder. sorted() makes the order reproducible,
# since glob returns files in arbitrary order.
all_filenames = sorted(glob.glob('tt*.{}'.format(extension)))
len(all_filenames)

100

In [33]:
#combine all files in the list
review_frames = []
for csv_name in all_filenames:
    frame = pd.read_csv(csv_name)
    if 'id' not in frame.columns:
        # Tag every row with its source file name so the movie can still be
        # identified after concatenation (the later cleanup of 'id' needs it).
        frame['id'] = csv_name
    review_frames.append(frame)
combined_csv = pd.concat(review_frames)
#export to csv
# errors='ignore': files written without an index column have no 'Unnamed: 0',
# and dropping it unconditionally would raise KeyError.
combined_csv = combined_csv.drop(columns='Unnamed: 0', errors='ignore')
combined_csv.to_csv("Movie_Reviews.csv", index=False, encoding='utf-8-sig')

In [36]:
rev = pd.read_csv("Movie_Reviews.csv")
# Remove the '.csv' file extension from the ids. str.strip(".csv") strips any
# of the characters '.', 'c', 's', 'v' from BOTH ends rather than the suffix,
# so an anchored regex is used instead.
rev['id'] = rev['id'].str.replace(r'\.csv$', '', regex=True)

In [38]:
# Write the cleaned reviews table to disk (UTF-8 with BOM, no index column).
rev.to_csv(path_or_buf="Movie_Reviews_100.csv", encoding='utf-8-sig', index=False)