In [1]:
# dependencies and setup
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datetime as dt
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, func, ForeignKey
from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Date, Integer, String, Float

In [2]:
# locations of the two raw CSV exports (relative to the notebook's working directory)
file1 = "Raw_Data/IMDB movies.csv"
file2 = "Raw_Data/rotten_tomatoes_movies.csv"

# load each CSV into its own DataFrame; low_memory=False is passed so pandas
# does not process the file in chunks while inferring column dtypes
IMDB_movies, rotten_tomatoes = (
    pd.read_csv(csv_path, low_memory=False) for csv_path in (file1, file2)
)

In [3]:
# preview the first five rows of the raw IMDB dataframe to see which columns are available
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [4]:
# rename avg_vote -> imdb_ratings and correct the misspelled
# worlwide_gross_income column name
IMDB_movies = IMDB_movies.rename(
    columns={
        "avg_vote": "imdb_ratings",
        "worlwide_gross_income": "worldwide_gross_income",
    }
)
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,imdb_ratings,votes,budget,usa_gross_income,worldwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [5]:
# discard the metascore and review-count columns from the IMDB dataframe
IMDB_movies = IMDB_movies.drop(
    labels=['metascore', 'reviews_from_users', 'reviews_from_critics'],
    axis=1,
)
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,imdb_ratings,votes,budget,usa_gross_income,worldwide_gross_income
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,


In [6]:
# Inspect the dtype and non-null count of every column; columns reported as
# 'object' that should be numeric or dates are cleaned up in the cells below
IMDB_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   imdb_title_id           85855 non-null  object 
 1   title                   85855 non-null  object 
 2   original_title          85855 non-null  object 
 3   year                    85855 non-null  object 
 4   date_published          85855 non-null  object 
 5   genre                   85855 non-null  object 
 6   duration                85855 non-null  int64  
 7   country                 85791 non-null  object 
 8   language                85022 non-null  object 
 9   director                85768 non-null  object 
 10  writer                  84283 non-null  object 
 11  production_company      81400 non-null  object 
 12  actors                  85786 non-null  object 
 13  description             83740 non-null  object 
 14  imdb_ratings            85855 non-null

In [7]:
# Print the maximum string length of every object (i.e. string) column
for column_name, column_values in IMDB_movies.items():
    if column_values.dtype != 'object':
        continue
    print(f'{column_name} {column_values.str.len().max()}')

imdb_title_id 10
title 196
original_title 196
year 13
date_published 13
genre 31
country 225.0
language 163.0
director 62.0
writer 64.0
production_company 101.0
actors 415.0
description 402.0
budget 16.0
usa_gross_income 12.0
worldwide_gross_income 14.0


In [8]:
# Year should be an integer; list the distinct values to find the one(s)
# that force pandas to store the column as object (i.e. string)
IMDB_movies['year'].unique()

array(['1894', '1906', '1911', '1912', '1919', '1913', '1914', '1915',
       '1916', '1917', '1918', '1920', '1921', '1924', '1922', '1923',
       '1925', '1926', '1935', '1927', '1928', '1983', '1929', '1930',
       '1932', '1931', '1937', '1938', '1933', '1934', '1936', '1940',
       '1939', '1942', '1943', '1941', '1948', '1944', '2001', '1946',
       '1945', '1947', '1973', '1949', '1950', '1952', '1951', '1962',
       '1953', '1954', '1955', '1961', '1956', '1958', '1957', '1959',
       '1960', '1963', '1965', '1971', '1964', '1966', '1968', '1967',
       '1969', '1976', '1970', '1979', '1972', '1981', '1978', '2000',
       '1989', '1975', '1974', '1986', '1990', '2018', '1977', '1982',
       '1980', '1993', '1984', '1985', '1988', '1987', '2005', '1991',
       '2002', '1994', '1992', '1995', '2017', '1997', '1996', '2006',
       '1999', '1998', '2007', '2008', '2003', '2004', '2010', '2009',
       '2011', '2013', '2012', '2016', '2015', '2014', '2019', '2020',
      

In [9]:
# Replace the malformed 'TV Movie 2019' entry with '2019'.
# The replacement is a string, like every other value in the column, so the
# column stays uniformly typed until it is converted to a numeric dtype.
IMDB_movies['year'] = IMDB_movies['year'].replace('TV Movie 2019', '2019')

In [10]:
# Distinct year values after the replacement, to confirm 'TV Movie 2019' is gone
IMDB_movies['year'].unique()

array(['1894', '1906', '1911', '1912', '1919', '1913', '1914', '1915',
       '1916', '1917', '1918', '1920', '1921', '1924', '1922', '1923',
       '1925', '1926', '1935', '1927', '1928', '1983', '1929', '1930',
       '1932', '1931', '1937', '1938', '1933', '1934', '1936', '1940',
       '1939', '1942', '1943', '1941', '1948', '1944', '2001', '1946',
       '1945', '1947', '1973', '1949', '1950', '1952', '1951', '1962',
       '1953', '1954', '1955', '1961', '1956', '1958', '1957', '1959',
       '1960', '1963', '1965', '1971', '1964', '1966', '1968', '1967',
       '1969', '1976', '1970', '1979', '1972', '1981', '1978', '2000',
       '1989', '1975', '1974', '1986', '1990', '2018', '1977', '1982',
       '1980', '1993', '1984', '1985', '1988', '1987', '2005', '1991',
       '2002', '1994', '1992', '1995', '2017', '1997', '1996', '2006',
       '1999', '1998', '2007', '2008', '2003', '2004', '2010', '2009',
       '2011', '2013', '2012', '2016', '2015', '2014', '2019', '2020',
      

In [11]:
# Convert year from object (i.e. string) to a numeric dtype; pd.to_numeric
# raises by default if any value still cannot be parsed
IMDB_movies['year']=pd.to_numeric(IMDB_movies['year'])

In [12]:
# Verify the dtype of year after the conversion
IMDB_movies['year'].dtype

dtype('int64')

In [13]:
# date_published should be a date; list the distinct values to find what
# forces pandas to store the column as object (i.e. string)
IMDB_movies['date_published'].unique()
# The values shown are 10 characters long (yyyy-mm-dd)

array(['1894-10-09', '1906-12-26', '1911-08-19', ..., '2020-10-22',
       '2019-01-13', '2020-09-04'], dtype=object)

In [14]:
# Collect every date_published value longer than the 10 characters of yyyy-mm-dd
list(filter(lambda value: len(value) > 10, IMDB_movies['date_published']))

['TV Movie 2019']

In [15]:
# Show the row(s) whose published date is the malformed 'TV Movie 2019'
IMDB_movies.loc[IMDB_movies['date_published'].eq('TV Movie 2019')]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,imdb_ratings,votes,budget,usa_gross_income,worldwide_gross_income
83917,tt8206668,Bad Education,Bad Education,2019,TV Movie 2019,"Biography, Comedy, Crime",108,USA,English,Cory Finley,"Mike Makowsky, Robert Kolker",HBO Films,"Hugh Jackman, Ray Romano, Welker White, Alliso...",The beloved superintendent of New York's Rosly...,7.1,23973,,,


In [16]:
# A Google search gives 8 Sept 2019 as the release date of "Bad Education",
# so the malformed entry is replaced with that date in yyyy-mm-dd form
IMDB_movies['date_published'] = IMDB_movies['date_published'].replace(
    'TV Movie 2019', '2019-09-08'
)

In [17]:
# Convert date_published from object (i.e. string) to datetime; the explicit
# yyyy-mm-dd format makes any remaining malformed value raise instead of being guessed
IMDB_movies['date_published']=pd.to_datetime(IMDB_movies['date_published'],format='%Y-%m-%d')

In [18]:
# Verify the dtypes after conversion (year and date_published should no longer be object)
IMDB_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   imdb_title_id           85855 non-null  object        
 1   title                   85855 non-null  object        
 2   original_title          85855 non-null  object        
 3   year                    85855 non-null  int64         
 4   date_published          85855 non-null  datetime64[ns]
 5   genre                   85855 non-null  object        
 6   duration                85855 non-null  int64         
 7   country                 85791 non-null  object        
 8   language                85022 non-null  object        
 9   director                85768 non-null  object        
 10  writer                  84283 non-null  object        
 11  production_company      81400 non-null  object        
 12  actors                  85786 non-null  object

In [19]:
# preview the first five rows of the raw Rotten Tomatoes dataframe
rotten_tomatoes.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2/12/2010,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",4/30/2010,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",10/5/1979,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",4/13/1957,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1/1/1954,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [20]:
# Inspect dtypes and non-null counts of the Rotten Tomatoes columns before cleaning
rotten_tomatoes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17712 entries, 0 to 17711
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rotten_tomatoes_link              17712 non-null  object 
 1   movie_title                       17712 non-null  object 
 2   movie_info                        17391 non-null  object 
 3   critics_consensus                 9134 non-null   object 
 4   content_rating                    17712 non-null  object 
 5   genres                            17693 non-null  object 
 6   directors                         17518 non-null  object 
 7   authors                           16170 non-null  object 
 8   actors                            17360 non-null  object 
 9   original_release_date             16546 non-null  object 
 10  streaming_release_date            17328 non-null  object 
 11  runtime                           17398 non-null  float64
 12  prod

In [21]:
# original_release_date should be a date; list the distinct values to see how
# they are formatted while pandas stores the column as object (i.e. string)
rotten_tomatoes['original_release_date'].unique()
# The values are m/d/yyyy without zero padding (e.g. '2/12/2010'), so lengths vary up to 10

array(['2/12/2010', '4/30/2010', '10/5/1979', ..., '10/2/1981',
       '12/17/1964', '6/17/1964'], dtype=object)

In [22]:
# Convert original_release_date from object (i.e. string) to datetime using the m/d/yyyy format
rotten_tomatoes['original_release_date']=pd.to_datetime(rotten_tomatoes['original_release_date'],format='%m/%d/%Y')

In [23]:
# Convert streaming_release_date from object (i.e. string) to datetime using the m/d/yyyy format
rotten_tomatoes['streaming_release_date']=pd.to_datetime(rotten_tomatoes['streaming_release_date'],format='%m/%d/%Y')

In [24]:
# Verify that both release-date columns are now datetime64
rotten_tomatoes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17712 entries, 0 to 17711
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   rotten_tomatoes_link              17712 non-null  object        
 1   movie_title                       17712 non-null  object        
 2   movie_info                        17391 non-null  object        
 3   critics_consensus                 9134 non-null   object        
 4   content_rating                    17712 non-null  object        
 5   genres                            17693 non-null  object        
 6   directors                         17518 non-null  object        
 7   authors                           16170 non-null  object        
 8   actors                            17360 non-null  object        
 9   original_release_date             16546 non-null  datetime64[ns]
 10  streaming_release_date            17328 non-nu

In [25]:
# Left-join Rotten Tomatoes onto IMDB: a row matches only when both the title
# and the release date are equal, and every IMDB row is kept. Column names
# present in both frames are disambiguated with the _imdb / _rt suffixes.
merged_movies = pd.merge(
    IMDB_movies,
    rotten_tomatoes,
    how='left',
    left_on=['original_title', 'date_published'],
    right_on=['movie_title', 'original_release_date'],
    suffixes=('_imdb', '_rt'),
)
merged_movies.head(20)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,production_company_rt,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,,,,,,,,,,
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,,,,,,,,,,
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,,,,,,,,,,
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,,,,,,,,,,
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,,,,,,,,,,
5,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913-01-01,"Biography, Drama",60,USA,English,Sidney Olcott,...,,,,,,,,,,
6,tt0002423,Madame DuBarry,Madame DuBarry,1919,1919-11-26,"Biography, Drama, Romance",85,Germany,German,Ernst Lubitsch,...,,,,,,,,,,
7,tt0002445,Quo Vadis?,Quo Vadis?,1913,1913-03-01,"Drama, History",120,Italy,Italian,Enrico Guazzoni,...,,,,,,,,,,
8,tt0002452,Independenta Romaniei,Independenta Romaniei,1912,1912-09-01,"History, War",120,Romania,,"Aristide Demetriade, Grigore Brezeanu",...,,,,,,,,,,
9,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,,,,,,,,,,


In [26]:
# number of rows in the IMDB dataframe (left side of the merge)
len(IMDB_movies)

85855

In [27]:
# number of rows in the Rotten Tomatoes dataframe (right side of the merge)
len(rotten_tomatoes)

17712

In [28]:
# number of rows after the left merge; it should equal len(IMDB_movies),
# otherwise some IMDB row matched more than one Rotten Tomatoes row
len(merged_movies)

85855

In [29]:
# discard the audience_status column from the merged dataframe
merged_movies = merged_movies.drop('audience_status', axis='columns')
merged_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,runtime,production_company_rt,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,,,,,,,,,,
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,,,,,,,,,,
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,,,,,,,,,,
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,,,,,,,,,,
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,,,,,,,,,,


In [30]:
# Inspect the final column list and dtypes of the merged dataframe; the Movies
# table class defined below mirrors these columns
merged_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85855 entries, 0 to 85854
Data columns (total 40 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   imdb_title_id                     85855 non-null  object        
 1   title                             85855 non-null  object        
 2   original_title                    85855 non-null  object        
 3   year                              85855 non-null  int64         
 4   date_published                    85855 non-null  datetime64[ns]
 5   genre                             85855 non-null  object        
 6   duration                          85855 non-null  int64         
 7   country                           85791 non-null  object        
 8   language                          85022 non-null  object        
 9   director                          85768 non-null  object        
 10  writer                            84283 non-nu

In [31]:
# Declarative base class; the ORM table classes defined below inherit from it
# and register their tables on Base.metadata
Base = declarative_base()

In [32]:
# Create the engine for the SQLite database file movies_db.sqlite
# (the path is relative to the notebook's working directory)
engine = create_engine("sqlite:///movies_db.sqlite")

In [33]:
# This creates the DB (movies_db.sqlite) and connects to it
conn = engine.connect()

In [34]:
#Define class i.e table to be created in DB
class Movies(Base):
    """ORM mapping for the 'movies' table, one column per merged_movies column.

    The first group of columns comes from the IMDB dataframe and the second
    group from the Rotten Tomatoes dataframe. Names ending in _imdb / _rt are
    the columns that exist in both sources and were suffixed by the merge.
    """
    __tablename__='movies'
    # --- IMDB columns ---
    # IMDb title id (e.g. 'tt0000009'); primary key of the table
    imdb_title_id = Column(String, primary_key=True)
    title = Column(String)
    original_title = Column(String)
    year = Column(Integer)
    date_published = Column(Date)
    genre = Column(String)
    duration = Column(Integer)
    country = Column(String)
    language = Column(String)
    director = Column(String)
    writer = Column(String)
    production_company_imdb = Column(String)
    actors_imdb = Column(String)
    description = Column(String)
    imdb_ratings = Column(Float)
    votes = Column(Integer)
    # monetary columns are kept as text because the raw values carry a
    # currency prefix (e.g. '$ 2250')
    budget = Column(String)
    usa_gross_income = Column(String)
    worldwide_gross_income = Column(String)
    # --- Rotten Tomatoes columns (empty for IMDB rows without a match) ---
    rotten_tomatoes_link = Column(String)
    movie_title = Column(String)
    movie_info = Column(String)
    critics_consensus = Column(String)
    content_rating = Column(String)
    genres = Column(String)
    directors = Column(String)
    authors = Column(String)
    actors_rt = Column(String)
    original_release_date = Column(Date)
    streaming_release_date = Column(Date)
    runtime = Column(Float)
    production_company_rt = Column(String)
    tomatometer_status = Column(String)
    tomatometer_rating = Column(Float)
    # NOTE(review): the count columns are declared Float, presumably because the
    # left merge leaves NaN for unmatched rows - confirm before changing to Integer
    tomatometer_count = Column(Float)
    audience_rating = Column(Float)
    audience_count = Column(Float)
    tomatometer_top_critics_count = Column(Float)
    tomatometer_fresh_critics_count = Column(Float)
    tomatometer_rotten_critics_count = Column(Float)

In [35]:
# Create the imdb_movie class
class imdb_movie(Base):
    """ORM mapping for the 'imdb_top_250_movies' table filled by the scraping loop."""
    __tablename__ = 'imdb_top_250_movies'
    # NOTE: despite its name, this column stores the IMDb title id
    # (e.g. 'tt0111161'), not the movie title; the scraping loop passes the id here
    title = Column(String, primary_key=True)
    # human-readable movie title as shown on the chart page
    movie_name = Column(String)
In [36]:
# Creates the 'movies' and 'imdb_top_250_movies' tables (every table registered on Base) in the DB
Base.metadata.create_all(engine)

In [37]:
# Inserts the rows of the merged_movies dataframe into the existing 'movies'
# table of the movies_db SQLite database.
# The three date columns are written with an explicit SQLAlchemy Date type so
# the stored text is date-only (YYYY-MM-DD), matching the Date columns declared
# on the Movies class. Without it pandas infers a DateTime type for datetime64
# columns and also writes a time component.
# NOTE: if_exists='append' adds to whatever is already in the table, so
# re-running this cell against a populated database inserts the same
# imdb_title_id primary keys again and is expected to fail.
merged_movies.to_sql(
    'movies',
    conn,
    index=False,
    if_exists='append',
    dtype={
        'date_published': Date,
        'original_release_date': Date,
        'streaming_release_date': Date,
    },
)

In [38]:
# Close the connection that was used for the bulk insert
conn.close()

In [39]:
# Open an ORM Session bound to the engine; it is used to stage the scraped
# imdb_movie objects and write them to the database
session = Session(bind=engine)

In [40]:
# URL of the IMDb Top 250 chart page to scrape
url="https://www.imdb.com/chart/top/?ref_=nv_mv_250"

In [41]:
# Retrieve page with the requests module.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# letting the cells below parse an error page and silently find no movies.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [42]:
# Create BeautifulSoup object from the response body; parse with Python's built-in 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [43]:
# Examine the results, then determine element that contains sought info
#print(soup.prettify())

In [44]:
# Collect every <td class="titleColumn"> cell; the results are returned as an iterable list
top_250_results = soup.find_all('td', class_="titleColumn")

In [45]:
# Loop through returned results and stage one imdb_movie row per chart entry
for result in top_250_results:
    # Error handling: a cell without an <a> tag (AttributeError) or an <a>
    # without an href attribute (KeyError) is reported and skipped, so one
    # malformed cell does not abort the whole loop
    try:
        anchor = result.find('a')
        # Identify and return title of listing
        title_name = anchor.text
        # Identify and return link to listing, e.g. '/title/tt0111161/'
        link = anchor['href']
    except (AttributeError, KeyError) as e:
        print(e)
        continue

    # '/title/tt0111161/'.split('/') -> ['', 'title', 'tt0111161', ''],
    # so the IMDb title id is the element at index 2
    title_strings = link.split('/')

    # Stage the row only if title and link are available and the link
    # actually contains a non-empty id segment
    if title_name and link and len(title_strings) > 2 and title_strings[2]:
        imdb_title_id = title_strings[2]
        print('-------------')
        print(title_name)
        print(link)
        print(imdb_title_id)
        # Create an instance of the imdb_movie class; its `title` column
        # stores the IMDb title id, `movie_name` the displayed title
        movie = imdb_movie(title=imdb_title_id, movie_name=title_name)
        # Add the object to the session (written to the DB on commit)
        session.add(movie)

-------------
The Shawshank Redemption
/title/tt0111161/
tt0111161
-------------
The Godfather
/title/tt0068646/
tt0068646
-------------
The Godfather: Part II
/title/tt0071562/
tt0071562
-------------
The Dark Knight
/title/tt0468569/
tt0468569
-------------
12 Angry Men
/title/tt0050083/
tt0050083
-------------
Schindler's List
/title/tt0108052/
tt0108052
-------------
The Lord of the Rings: The Return of the King
/title/tt0167260/
tt0167260
-------------
Pulp Fiction
/title/tt0110912/
tt0110912
-------------
The Good, the Bad and the Ugly
/title/tt0060196/
tt0060196
-------------
The Lord of the Rings: The Fellowship of the Ring
/title/tt0120737/
tt0120737
-------------
Fight Club
/title/tt0137523/
tt0137523
-------------
Forrest Gump
/title/tt0109830/
tt0109830
-------------
Inception
/title/tt1375666/
tt1375666
-------------
The Lord of the Rings: The Two Towers
/title/tt0167261/
tt0167261
-------------
Star Wars: Episode V - The Empire Strikes Back
/title/tt0080684/
tt0080684
---

In [46]:
# push top 250 to imdb_top_250_movies table: commit writes every object
# added to the session in the loop above
session.commit()

In [47]:
# close the session now that the scraped rows have been committed
session.close()