In [13]:
# Dependencies
from bs4 import BeautifulSoup
import requests


In [14]:
# Import SQL Alchemy
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed.
# Since SQLAlchemy 1.4 declarative_base lives in sqlalchemy.orm and the old
# sqlalchemy.ext.declarative location is deprecated, so the legacy path is
# only used as a fallback on older installs.
try:
    from sqlalchemy.orm import declarative_base
except ImportError:
    from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

In [15]:
# Declarative model for one row of the scraped IMDb Top 250 chart
class imdb_movie(Base):
    """ORM mapping for the 'imdb_top_250_movies' table."""
    __tablename__ = 'imdb_top_250_movies'
    # Primary key; the scraping loop stores the IMDb title id (e.g. 'tt0111161') here
    title = Column(String(30), primary_key=True)
    # Display title of the movie as shown on the chart page
    movie_name = Column(String(500))


In [16]:
# Open (or create) the local SQLite database file, then emit CREATE TABLE
# for every model registered on Base that does not exist yet
engine = create_engine('sqlite:///movies_db.sqlite')
Base.metadata.create_all(bind=engine)

In [17]:
# A Session is the handle used to stage objects and send queries to the database
from sqlalchemy.orm import Session
session = Session(engine)

In [18]:
# Address of the IMDb Top 250 chart page to scrape
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

In [19]:
# Retrieve page with the requests module.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the later
# cells parse an error page and silently find no movies.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [20]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [21]:
# Dump the indented markup so the element holding the sought info can be found by eye
page_markup = soup.prettify()
print(page_markup)

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Top 250 Movies - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/chart/top/" rel="canonical"/>
  <meta content="http://w

In [22]:
# Collect every <td> whose CSS class is "titleColumn"; the result is iterable
top_250_results = soup.find_all('td', attrs={'class': 'titleColumn'})

In [23]:
# Loop through the scraped title cells and stage one database row per movie
for result in top_250_results:
    # Error handling: a malformed cell is reported and skipped instead of
    # aborting the whole scrape
    try:
        # The first anchor in the cell carries both the display title and the link
        anchor = result.find('a')
        title_name = anchor.text
        # Links look like '/title/tt0111161/', so element 2 of the split is the IMDb id
        link = anchor['href']
        title_strings = link.split('/')
        imdb_title_id = title_strings[2]

        # Print and store the row only if title, link, and id are all non-empty
        if title_name and link and imdb_title_id:
            print('-------------')
            print(title_name)
            print(link)
            print(imdb_title_id)
            # Create an instance of the imdb movie class
            movie = imdb_movie(title=imdb_title_id, movie_name=title_name)
            # merge() inserts the row or updates the one already stored under
            # this primary key, so re-running the notebook against the existing
            # SQLite file does not fail with a duplicate-key error
            session.merge(movie)

    # AttributeError: no <a> in the cell; KeyError: <a> without href;
    # IndexError: link too short to contain an id
    except (AttributeError, KeyError, IndexError) as e:
        print(e)


-------------
The Shawshank Redemption
/title/tt0111161/
tt0111161
-------------
The Godfather
/title/tt0068646/
tt0068646
-------------
The Godfather: Part II
/title/tt0071562/
tt0071562
-------------
The Dark Knight
/title/tt0468569/
tt0468569
-------------
12 Angry Men
/title/tt0050083/
tt0050083
-------------
Schindler's List
/title/tt0108052/
tt0108052
-------------
The Lord of the Rings: The Return of the King
/title/tt0167260/
tt0167260
-------------
Pulp Fiction
/title/tt0110912/
tt0110912
-------------
The Good, the Bad and the Ugly
/title/tt0060196/
tt0060196
-------------
The Lord of the Rings: The Fellowship of the Ring
/title/tt0120737/
tt0120737
-------------
Fight Club
/title/tt0137523/
tt0137523
-------------
Forrest Gump
/title/tt0109830/
tt0109830
-------------
Inception
/title/tt1375666/
tt1375666
-------------
The Lord of the Rings: The Two Towers
/title/tt0167261/
tt0167261
-------------
Star Wars: Episode V - The Empire Strikes Back
/title/tt0080684/
tt0080684
---

tt0050986
-------------
The Third Man
/title/tt0041959/
tt0041959
-------------
The Truman Show
/title/tt0120382/
tt0120382
-------------
Jurassic Park
/title/tt0107290/
tt0107290
-------------
Memories of Murder
/title/tt0353969/
tt0353969
-------------
V for Vendetta
/title/tt0434409/
tt0434409
-------------
Blade Runner
/title/tt0083658/
tt0083658
-------------
Trainspotting
/title/tt0117951/
tt0117951
-------------
Inside Out
/title/tt2096673/
tt2096673
-------------
The Bridge on the River Kwai
/title/tt0050212/
tt0050212
-------------
Fargo
/title/tt0116282/
tt0116282
-------------
Warrior
/title/tt1291584/
tt1291584
-------------
Finding Nemo
/title/tt0266543/
tt0266543
-------------
Kill Bill: Vol. 1
/title/tt0266697/
tt0266697
-------------
Gone with the Wind
/title/tt0031381/
tt0031381
-------------
Tokyo Story
/title/tt0046438/
tt0046438
-------------
On the Waterfront
/title/tt0047296/
tt0047296
-------------
My Father and My Son
/title/tt0476735/
tt0476735
-------------
Wi

In [24]:
# Persist the staged rows. If the commit fails, roll the transaction back and
# re-raise so the error is visible; the session is closed in every case so the
# database connection is released.
try:
    session.commit()
except Exception:
    session.rollback()
    raise
finally:
    session.close()