In [8]:
# dependencies and setup
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

In [2]:
# files to load
file1 = "Raw_Data/IMDB movies.csv"
file2 = "Raw_Data/rotten_tomatoes_movies.csv"

# read files and store into pandas dataframe
IMDB_movies = pd.read_csv(file1, low_memory=False)
rotten_tomatoes = pd.read_csv(file2, low_memory=False)

In [3]:
# view IMDB_movies dataframe
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [4]:
# rename IMDB avg_vote to ratings
IMDB_movies = IMDB_movies.rename(columns={"avg_vote":"IMBD Ratings"})
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,IMBD Ratings,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [5]:
# remove metascore, reviews_from_users, and reviews_from_critics from IMDB
IMDB_movies = IMDB_movies.drop(columns=['metascore', 'reviews_from_users', 'reviews_from_critics'])
IMDB_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,IMBD Ratings,votes,budget,usa_gross_income,worlwide_gross_income
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,


In [9]:
#Look at the data types of all the columns
IMDB_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               85022 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  IMBD Ratings           85855 non-null  float64
 15  vo

In [10]:
#Checks max lenghts of all string i.e. object columns
for c in IMDB_movies:
    if IMDB_movies[c].dtype=='object':
        print(f'{c} {IMDB_movies[c].str.len().max()}')

imdb_title_id 10
title 196
original_title 196
year 13
date_published 13
genre 31
country 225.0
language 163.0
director 62.0
writer 64.0
production_company 101.0
actors 415.0
description 402.0
budget 16.0
usa_gross_income 12.0
worlwide_gross_income 14.0


In [11]:
#Year should be integer, checking which values are forcing pandas to make it string i.e. object
IMDB_movies['year'].unique()

array(['1894', '1906', '1911', '1912', '1919', '1913', '1914', '1915',
       '1916', '1917', '1918', '1920', '1921', '1924', '1922', '1923',
       '1925', '1926', '1935', '1927', '1928', '1983', '1929', '1930',
       '1932', '1931', '1937', '1938', '1933', '1934', '1936', '1940',
       '1939', '1942', '1943', '1941', '1948', '1944', '2001', '1946',
       '1945', '1947', '1973', '1949', '1950', '1952', '1951', '1962',
       '1953', '1954', '1955', '1961', '1956', '1958', '1957', '1959',
       '1960', '1963', '1965', '1971', '1964', '1966', '1968', '1967',
       '1969', '1976', '1970', '1979', '1972', '1981', '1978', '2000',
       '1989', '1975', '1974', '1986', '1990', '2018', '1977', '1982',
       '1980', '1993', '1984', '1985', '1988', '1987', '2005', '1991',
       '2002', '1994', '1992', '1995', '2017', '1997', '1996', '2006',
       '1999', '1998', '2007', '2008', '2003', '2004', '2010', '2009',
       '2011', '2013', '2012', '2016', '2015', '2014', '2019', '2020',
      

In [12]:
#Replacing 'TV Movie 2019' to '2019'
IMDB_movies['year']= IMDB_movies['year'].replace(['TV Movie 2019'],2019)

In [13]:
#Data After Replacing
IMDB_movies['year'].unique()

array(['1894', '1906', '1911', '1912', '1919', '1913', '1914', '1915',
       '1916', '1917', '1918', '1920', '1921', '1924', '1922', '1923',
       '1925', '1926', '1935', '1927', '1928', '1983', '1929', '1930',
       '1932', '1931', '1937', '1938', '1933', '1934', '1936', '1940',
       '1939', '1942', '1943', '1941', '1948', '1944', '2001', '1946',
       '1945', '1947', '1973', '1949', '1950', '1952', '1951', '1962',
       '1953', '1954', '1955', '1961', '1956', '1958', '1957', '1959',
       '1960', '1963', '1965', '1971', '1964', '1966', '1968', '1967',
       '1969', '1976', '1970', '1979', '1972', '1981', '1978', '2000',
       '1989', '1975', '1974', '1986', '1990', '2018', '1977', '1982',
       '1980', '1993', '1984', '1985', '1988', '1987', '2005', '1991',
       '2002', '1994', '1992', '1995', '2017', '1997', '1996', '2006',
       '1999', '1998', '2007', '2008', '2003', '2004', '2010', '2009',
       '2011', '2013', '2012', '2016', '2015', '2014', '2019', '2020',
      

In [14]:
#Convert Year to Integer from Object i.e. string
IMDB_movies['year']=pd.to_numeric(IMDB_movies['year'])

In [15]:
#verify after conversion
IMDB_movies['year'].dtype

dtype('int64')

In [16]:
#Date Published should be date, checking which values are forcing pandas to make it string i.e. object
IMDB_movies['date_published'].unique()
#Looking at it typically all values are of lenght 10 (yyyy-mm-dd)

array(['1894-10-09', '1906-12-26', '1911-08-19', ..., '2020-10-22',
       '2019-01-13', '2020-09-04'], dtype=object)

In [17]:
#Finding the value that has length greater than 10
[x for x in IMDB_movies['date_published'] if len(x) > 10]

['TV Movie 2019']

In [18]:
#Finding the movie with publised data of 'TV Movie 2019'
IMDB_movies[IMDB_movies['date_published']=='TV Movie 2019']

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,IMBD Ratings,votes,budget,usa_gross_income,worlwide_gross_income
83917,tt8206668,Bad Education,Bad Education,2019,TV Movie 2019,"Biography, Comedy, Crime",108,USA,English,Cory Finley,"Mike Makowsky, Robert Kolker",HBO Films,"Hugh Jackman, Ray Romano, Welker White, Alliso...",The beloved superintendent of New York's Rosly...,7.1,23973,,,


In [19]:
#By google search release date of "Bad Education" was 8th Sept 2019
#Replacing the date to 8th Sept 2019
IMDB_movies['date_published']= IMDB_movies['date_published'].replace(['TV Movie 2019'],'2019-09-08')

In [20]:
#Convert date_published to Date from Object i.e. string
IMDB_movies['date_published']=pd.to_datetime(IMDB_movies['date_published'],format='%Y-%m-%d')

In [21]:
#Verify datatype after conversion
IMDB_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   imdb_title_id          85855 non-null  object        
 1   title                  85855 non-null  object        
 2   original_title         85855 non-null  object        
 3   year                   85855 non-null  int64         
 4   date_published         85855 non-null  datetime64[ns]
 5   genre                  85855 non-null  object        
 6   duration               85855 non-null  int64         
 7   country                85791 non-null  object        
 8   language               85022 non-null  object        
 9   director               85768 non-null  object        
 10  writer                 84283 non-null  object        
 11  production_company     81400 non-null  object        
 12  actors                 85786 non-null  object        
 13  d

In [6]:
# view Rotten Tomatoes dataframe
rotten_tomatoes.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2/12/2010,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",4/30/2010,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",10/5/1979,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",4/13/1957,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1/1/1954,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [7]:
merged_movies=IMDB_movies.merge(rotten_tomatoes, how='inner', left_on='original_title', right_on='movie_title',suffixes=('_imdb','_rt'))
merged_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,production_company_rt,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,20th Century Fox,Fresh,62.0,39.0,Upright,69.0,20541.0,10,24,15
1,tt0007801,Cleopatra,Cleopatra,1917,1917-10-14,"Biography, Drama, History",125,USA,English,J. Gordon Edwards,...,20th Century Fox,Fresh,62.0,39.0,Upright,69.0,20541.0,10,24,15
2,tt0024991,Cleopatra,Cleopatra,1934,1934-10-05,"Biography, Drama, History",100,USA,English,Cecil B. DeMille,...,20th Century Fox,Fresh,62.0,39.0,Upright,69.0,20541.0,10,24,15
3,tt0056937,Cleopatra,Cleopatra,1963,1964-01-30,"Biography, Drama, History",192,"Switzerland, UK, USA",English,Joseph L. Mankiewicz,...,20th Century Fox,Fresh,62.0,39.0,Upright,69.0,20541.0,10,24,15
4,tt0346765,Cleopatra,Cleopatra,2003,2003-08-14,"Comedy, Drama",104,"Spain, Argentina",Spanish,Eduardo Mignogna,...,20th Century Fox,Fresh,62.0,39.0,Upright,69.0,20541.0,10,24,15


In [None]:
# remove 
#merged_movies = merged_movies.drop(columns=['metascore', 'reviews_from_users', 'reviews_from_critics'])
#merged_movies.head()

In [25]:
#declarative Base
Base = declarative_base()

In [26]:
#Create Engine movies_db
engine = create_engine("sqlite:///movies_db.sqlite")

In [27]:
# This creates the DB (movies_dv.sqllite) and connects to it
conn = engine.connect()

In [None]:
#Define class i.e table to be created in DB
class Movies(Base):
    __tablename__='movies'
    imdb_title_id = Column(String(15),primary_key=True)
    title = Column(String(200))
    original_title = Column(String(200))
    year = Column(Integer)
    date_published = Column(Date)
    genre = Column(String(35))
    duration = Column(Integer)
    country = Column(String(255))
    language = Column(String(165))
    director = Column(String(65))
    writer = Column(String(65))
    production_company = Column(String(105))
    actors = Column(String(420))
    description = Column(String(405))
    avg_vote = Column(Float)
    votes = Column(Integer)
    budget = Column(String(20))
    usa_gross_income = Column(String(15))
    worlwide_gross_income = Column(String(15))

In [None]:
#IMDB_movies.to_csv("Raw_Data/updated_IMDB.csv", index=False, header=True)
#rotten_tomatoes.to_csv("Raw_Data/updated_rotten_tomatoes.csv", index=False, header=True)