In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('../ml-latest-small/links.csv')

In [4]:
df.sample(4)

Unnamed: 0,movieId,imdbId,tmdbId
8685,122890,803096,68735.0
3063,4110,93044,4918.0
1450,1973,101917,11284.0
5386,8978,388419,13673.0


In [5]:
df.shape

(9742, 3)

#### Goal: Webscrape webimdb
- gets links from links.csv
- loop through all link values
    - create respective link
    - scrape data 

## Importing tools needed for webscraping

In [6]:
import csv
from bs4 import BeautifulSoup
import requests
import re

---
### Code blocks that will be combined for master webscraper
#### Looking to collect relevent meta data for every movie from imdb wesbite
- title
- cast
- director
- storyline
- Content Rating (like R, PG, etc.)
- Runtime (length of movie)
- genres
- country
- language
- cumulative earnings
- production companies
- color pallete
- number of people who rated the movie on imdb

#### Make request

In [36]:
url = 'https://www.imdb.com/title/tt0110357'
# url = 'https://www.imdb.com/title/tt0113987'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

#### title

In [39]:
# get title
title_ = soup.find('div', class_="title_wrapper")
title = ' '.join(title_.find('h1').text.split()).lower()
title


'the lion king (1994)'

#### cast 

In [38]:
cast = soup.find_all('div', class_="credit_summary_item")[2].text
cast = re.findall(r'\w.*[|]', cast)
cast = ','.join(cast)
cast = ''.join(re.findall(r'\w.*\w' ,cast))
cast = cast.replace(' ', '').replace(',', ' ').lower()
cast

'matthewbroderick jeremyirons jamesearljones'

#### directors

In [37]:
director  = soup.find('div', class_="credit_summary_item").find('a').text
director.replace(' ', '').lower()

'rogerallers'

#### storyline

In [12]:
# retrive storyline paragraph from website
storyline_ = soup.find('div', class_="inline canwrap").text
storyline_ = re.findall(r'\w*\w', storyline_)
storyline = list()
for x in storyline_:
    if x == 'Written':
        break
    storyline.append(x)
storyline = ' '.join(storyline).lower()
storyline

'a little boy named andy loves to be in his room playing with his toys especially his doll named woody but what do the toys do when andy is not with them they come to life woody believes that his life as a toy is good however he must worry about andy s family moving and what woody does not know is about andy s birthday party woody does not realize that andy s mother gave him an action figure known as buzz lightyear who does not believe that he is a toy and quickly becomes andy s new favorite toy woody who is now consumed with jealousy tries to get rid of buzz then both woody and buzz are now lost they must find a way to get back to andy before he moves without them but they will have to pass through a ruthless toy killer sid phillips'

#### rating 

In [14]:
x = soup.find('div', class_="subtext").text
x = re.findall(r'\w', x)[0]
x

'G'

#### Runtime

In [15]:
x = soup.find('div', class_="subtext").text
runtime = re.findall(r'\w*\w', x)[1:3]
runtime = ''.join(runtime)
runtime

'1h21min'

#### genre

In [20]:
x = soup.find_all('div', class_="see-more inline canwrap")
# x = re.findall(r'\w*\w', x)
genre = re.findall(r'\w*\w',x[1].text)
genre = ' '.join(genre)
genre

'Genres Animation Adventure Comedy Family Fantasy'

#### keywords

In [14]:
x = soup.find_all('div', class_="see-more inline canwrap")
# x = re.findall(r'\w*\w', x)
keywords = re.findall(r'\w*\w',x[0].text)
keywords = ' '.join(keywords[2:])
keywords

'american president american politics year 1972 year 1973 alcoholic drink See All 81'

#### country

In [18]:
y = soup.find_all('div', class_="txt-block")
for i in range(len(y)):
    x = re.findall(r'\w*\w', y[i].text)
    if 'Country' in  x:
        country = ' '.join(x[1:])
country

'USA'

#### Language

In [19]:
y = soup.find_all('div', class_="txt-block")
for i in range(len(y)):
    x = re.findall(r'\w*\w', y[i].text)
    if 'Language' in  x:
        language = ' '.join(x[1:])
language

'English'

#### cumulative earnings

In [21]:
y = soup.find_all('div', class_="txt-block")
for i in range(len(y)):
    x = re.findall(r'\w*\w', y[i].text)
    if 'Cumulative' in  x:
        cumulative = ' '.join(x[3:])
cumulative

'394 436 586'

#### company

In [23]:
y = soup.find_all('div', class_="txt-block")
for i in range(len(y)):
    x = re.findall(r'\w*\w', y[i].text)
    if 'Production' in  x:
        production = ' '.join(x[3:])
production

'Disney Pictures Pixar Animation Studios See more'

#### color

In [24]:
y = soup.find_all('div', class_="txt-block")
for i in range(len(y)):
    x = re.findall(r'\w*\w', y[i].text)
    if 'Color' in  x:
        color = ' '.join(x[1:])
color

'Color Technicolor'

#### num of rating

In [27]:
num_rating = soup.find_all('span', class_="small")[0].text
num_rating

'28,922'

---
# Main web scrape script

In [29]:
"""
title, director, cast, storyline, rated, language, gross, company, color, country, genre, runtime , keywords
"""

from nltk.corpus import stopwords


with open('movie_meta.csv', mode='w', encoding='utf-8', newline='') as f:
    # instantiate csv writer
    write = csv.writer(f)
    write.writerow(['imdbid', 'title', 'director', 'cast','storyline', 'keywords', 'genre', 'production', 'color',  'language','country','rated', 'runtime',  'Worldwide_Gross', 'num_rating'])

    # loop through movie id's from link.csv-df using their index
    for i in range(len(df['imdbId'].values)):
        print((i+1) / len(df['imdbId'].values) * 100)       
        id = df['imdbId'].values[i]
        
        # ids' starting with 0 need to have to added back to start in string form
        if len(str(id)) < 6:
            id = '0' + str(id)
        
        # create the url needed for soup, make request, and beautify it
        movie_url = 'https://www.imdb.com/title/tt0' + str(id)
        response = requests.get(movie_url)
        soup = BeautifulSoup(response.text, 'lxml')


        try:
             
            # get title
            title_ = soup.find('div', class_="title_wrapper")
            title = ' '.join(title_.find('h1').text.split()).lower()
            
            
            # get direcot
            director  = soup.find('div', class_="credit_summary_item").find('a').text
            director = director.replace(' ', '')          
                  
               
            # get the cast
            cast = soup.find_all('div', class_="credit_summary_item")[2].text
            cast = re.findall(r'\w.*[|]', cast)
            cast = ','.join(cast)
            cast = ''.join(re.findall(r'\w.*\w' ,cast))
            cast = cast.replace(' ', '').replace(',', ' ').lower()
            
            
            
            # retrive storyline paragraph from website
            storyline_ = soup.find('div', class_="inline canwrap").text
            storyline_ = re.findall(r'\w*\w', storyline_)
            storyline = list()
            for x in storyline_:
                if x == 'Written':
                    break
                storyline.append(x)
            storyline = ' '.join(storyline).lower()            
            
            # get movie letter ratings 
            subtext = soup.find('div', class_="subtext").text
            rated = re.findall(r'\w', subtext)[0]
            
            # get movie runtime
            runtime = re.findall(r'\w*\w', subtext)[1:3]
            runtime = ''.join(runtime)
            
            ### Getting genre, keywords
            genre_key = soup.find_all('div', class_="see-more inline canwrap")
            # get genre
            genre = re.findall(r'\w*\w',genre_key[1].text)
            genre = ' '.join(genre)
            # get keywords
            keywords = re.findall(r'\w*\w',genre_key[0].text)
            keywords = ' '.join(keywords[2:])

            
            # number of people who rated the movie
            num_rating = soup.find_all('span', class_="small")[0].text            
            
            # get movie language
            y = soup.find_all('div', class_="txt-block")
            for i in range(len(y)):
                x = re.findall(r'\w*\w', y[i].text)
                if 'Language' in  x:
                    language = ' '.join(x[1:])
        
            # get movie cumulative earnings
                if 'Cumulative' in  x:
                    gross = ''.join(x[3:])
            
            # get production company
                if 'Production' in  x:
                    production = ' '.join(x[3:])
            
            # get movie color
                if 'Color' in  x:
                    color = ' '.join(x[1:])
            
            # get country 
                if 'Country' in  x:
                    country = ' '.join(x[1:])
            # write all of the info into csv
            write.writerow([id, title, director, cast, storyline,  keywords, genre, production, color, language, country, rated, runtime, gross, num_rating])
        except:
            #         # if the id doesn't work, writ ean empty string to csv file
            write.writerow([id])

0.010264832683227263
0.020529665366454525
0.03079449804968179
0.04105933073290905
0.05132416341613632
0.06158899609936358
0.07185382878259083
0.0821186614658181
0.09238349414904537
0.10264832683227264
0.11291315951549989
0.12317799219872716
0.13344282488195444
0.14370765756518167
0.15397249024840895
0.1642373229316362
0.17450215561486349
0.18476698829809074
0.195031820981318
0.20529665366454528
0.21556148634777253
0.22582631903099978
0.23609115171422707
0.24635598439745432
0.2566208170806816
0.2668856497639089
0.27715048244713614
0.28741531513036334
0.29768014781359065
0.3079449804968179
0.31820981318004515
0.3284746458632724
0.3387394785464997
0.34900431122972697
0.3592691439129542
0.3695339765961815
0.37979880927940873
0.390063641962636
0.4003284746458633
0.41059330732909055
0.4208581400123178
0.43112297269554506
0.4413878053787723
0.45165263806199957
0.4619174707452269
0.47218230342845413
0.4824471361116814
0.49271196879490864
0.5029768014781358
0.5132416341613631
0.5235064668445905

SSLError: HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: /title/tt07738550 (Caused by SSLError(SSLError("bad handshake: SysCallError(10060, 'WSAETIMEDOUT')")))

---