# Project 5: Working with Pandas and SQL Databases (Movies Dataset)

## Creating an SQLite Database

In [1]:
import sqlite3 as sq3

In [2]:
con = sq3.connect("movies.db")

In [3]:
#Verifica se a conexão deu certo
con

<sqlite3.Connection at 0x23e6c508a80>

In [4]:
con.execute("Select * FROM sqlite_master").fetchall()

[('table',
  'Movies',
  'Movies',
  2,
  'CREATE TABLE "Movies" (\n"id" INTEGER,\n  "title" TEXT,\n  "revenue" REAL,\n  "budget" REAL,\n  "belongs_to_collection_name" TEXT,\n  "release_date" TIMESTAMP\n)'),
 ('table',
  'Genres',
  'Genres',
  3,
  'CREATE TABLE "Genres" (\n"genre_id" INTEGER,\n  "genre_name" TEXT,\n  "id" INTEGER\n)')]

In [5]:
con.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()

[('Genres',), ('Movies',)]

In [6]:
#Fecha a conexão
con.close()

## Loading Data from DataFrames into an SQLite Database

In [9]:
import pandas as pd
import json
import sqlite3 as sq3

In [10]:
#Abertura do arquivo 
with open("some_movies.json") as f:
    data = json.load(f)

In [11]:
data

[{'adult': False,
  'backdrop_path': '/orjiB3oUIsyz60hoEqkiGpy5CeO.jpg',
  'belongs_to_collection': {'id': 86311,
   'name': 'The Avengers Collection',
   'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
   'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
  'budget': 356000000,
  'genres': [{'id': 12, 'name': 'Adventure'},
   {'id': 878, 'name': 'Science Fiction'},
   {'id': 28, 'name': 'Action'}],
  'homepage': 'https://www.marvel.com/movies/avengers-endgame',
  'id': 299534,
  'imdb_id': 'tt4154796',
  'original_language': 'en',
  'original_title': 'Avengers: Endgame',
  'overview': "After the devastating events of Avengers: Infinity War, the universe is in ruins due to the efforts of the Mad Titan, Thanos. With the help of remaining allies, the Avengers must assemble once more in order to undo Thanos' actions and restore order to the universe once and for all, no matter what consequences may be in store.",
  'popularity': 50.279,
  'poster_path': '/or06FN3Dka5tukK1e9sl16pB3iy

In [12]:
#Armazena o banco de dados na variável df
#Há uma normalização com o separador 
df = pd.json_normalize(data, sep = "_")

In [13]:
df

Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,belongs_to_collection_id,belongs_to_collection_name,belongs_to_collection_poster_path,belongs_to_collection_backdrop_path,belongs_to_collection
0,False,/orjiB3oUIsyz60hoEqkiGpy5CeO.jpg,356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,en,Avengers: Endgame,After the devastating events of Avengers: Infi...,...,Part of the journey is the end.,Avengers: Endgame,False,8.3,12607,86311.0,The Avengers Collection,/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg,/zuW6fOiusv4X9nnW3paHGfXcSll.jpg,
1,False,/wcC7kCICL6x6zHUlUyNp9pWoqW1.jpg,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,19995,tt0499549,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",...,Enter the World of Pandora.,Avatar,False,7.4,21000,87096.0,Avatar Collection,/nslJVsO58Etqkk17oXMuVK4gNOF.jpg,/8nCr9W7sKus2q9PLbYsnT7iCkuT.jpg,
2,False,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,en,Star Wars: The Force Awakens,Thirty years after defeating the Galactic Empi...,...,Every generation has a story.,Star Wars: The Force Awakens,False,7.4,14205,10.0,Star Wars Collection,/r8Ph5MYXL04Qzu4QBbq2KjqwtkQ.jpg,/d8duYyyC9J5T825Hg7grmaabfxQ.jpg,
3,False,/lmZFxXgJE3vgrciwuDib0N8CfQo.jpg,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,en,Avengers: Infinity War,As the Avengers and their allies have continue...,...,An entire universe. Once and for all.,Avengers: Infinity War,False,8.3,17718,86311.0,The Avengers Collection,/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg,/zuW6fOiusv4X9nnW3paHGfXcSll.jpg,
4,False,/6VmFqApQRyZZzmiGOQq2C92jyvH.jpg,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,597,tt0120338,en,Titanic,101-year-old Rose DeWitt Bukater tells the sto...,...,Nothing on Earth could come between them.,Titanic,False,7.8,16661,,,,,
5,False,/9OywzJLgobAbSGfhMXligzTylko.jpg,150000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.jurassicworld.com/,135397,tt0369610,en,Jurassic World,Twenty-two years after the events of Jurassic ...,...,The park is open.,Jurassic World,False,6.6,15399,328.0,Jurassic Park Collection,/qIm2nHXLpBBdMxi8dvfrnDkBUDh.jpg,/njFixYzIxX8jsn6KMSEtAzi4avi.jpg,
6,False,/nRXO2SnOA75OsWhNhXstHB8ZmI3.jpg,260000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",https://movies.disney.com/the-lion-king-2019,420818,tt6105098,en,The Lion King,"Simba idolizes his father, King Mufasa, and ta...",...,The King has Returned.,The Lion King,False,7.2,5425,,,,,
7,False,/v3A0T4fAz8xRugAkfUVkxGLd377.jpg,220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://marvel.com/avengers_movie/,24428,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,...,Some assembly required.,The Avengers,False,7.7,22101,86311.0,The Avengers Collection,/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg,/zuW6fOiusv4X9nnW3paHGfXcSll.jpg,
8,False,/7X6zOOEzXTQJkzDgBHVly1BgyYu.jpg,190000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",http://www.furious7.com/,168259,tt2820852,en,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,...,Vengeance Hits Home,Furious 7,False,7.3,7359,9485.0,The Fast and the Furious Collection,/uv63yAGg1zETAs1XQsOQpava87l.jpg,/z5A5W3WYJc3UVEWljSGwdjDgQ0j.jpg,
9,False,/4zmNJNQkugbalqJLnKp0wnlp8i3.jpg,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://marvel.com/movies/movie/193/avengers_ag...,99861,tt2395427,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,...,A New Age Has Come.,Avengers: Age of Ultron,False,7.3,15548,86311.0,The Avengers Collection,/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg,/zuW6fOiusv4X9nnW3paHGfXcSll.jpg,


In [14]:
#Criação de um banco de dados chamado movies
#Há seleção de colunas
movies = df[["id", "title", "revenue", "budget", "belongs_to_collection_name", "release_date"]].copy()
movies

Unnamed: 0,id,title,revenue,budget,belongs_to_collection_name,release_date
0,299534,Avengers: Endgame,2797800564,356000000,The Avengers Collection,2019-04-24
1,19995,Avatar,2787965087,237000000,Avatar Collection,2009-12-10
2,140607,Star Wars: The Force Awakens,2068223624,245000000,Star Wars Collection,2015-12-15
3,299536,Avengers: Infinity War,2046239637,300000000,The Avengers Collection,2018-04-25
4,597,Titanic,1845034188,200000000,,1997-11-18
5,135397,Jurassic World,1671713208,150000000,Jurassic Park Collection,2015-06-06
6,420818,The Lion King,1656943394,260000000,,2019-07-12
7,24428,The Avengers,1519557910,220000000,The Avengers Collection,2012-04-25
8,168259,Furious 7,1515047671,190000000,The Fast and the Furious Collection,2015-04-01
9,99861,Avengers: Age of Ultron,1405403694,250000000,The Avengers Collection,2015-04-22


In [15]:
#informações do dataset
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id                          18 non-null     int64 
 1   title                       18 non-null     object
 2   revenue                     18 non-null     int64 
 3   budget                      18 non-null     int64 
 4   belongs_to_collection_name  15 non-null     object
 5   release_date                18 non-null     object
dtypes: int64(3), object(3)
memory usage: 992.0+ bytes


In [24]:
#Conversão do texto para data. No programa ele não compreende que 2019-04-24 é uma data
#Não foi feito no df pq é o banco de dados original.
movies.release_date = pd.to_datetime(df.release_date)

In [17]:
#É feito a divisão para ficar um número menor 
movies.revenue = df.revenue/1000000
movies.budget = df.budget/1000000

In [18]:
#Criação de um novo banco de dados chamado votes
votes = df[["id", "vote_count", "vote_average"]].copy()
votes

Unnamed: 0,id,vote_count,vote_average
0,299534,12607,8.3
1,19995,21000,7.4
2,140607,14205,7.4
3,299536,17718,8.3
4,597,16661,7.8
5,135397,15399,6.6
6,420818,5425,7.2
7,24428,22101,7.7
8,168259,7359,7.3
9,99861,15548,7.3


In [19]:
#Criação de um novo banco de dados chamado genres
#É uma outra forma de criar 
genres = pd.json_normalize(data = data, record_path = "genres", meta = "id", record_prefix = "genre_")
genres

Unnamed: 0,genre_id,genre_name,id
0,12,Adventure,299534
1,878,Science Fiction,299534
2,28,Action,299534
3,28,Action,19995
4,12,Adventure,19995
5,14,Fantasy,19995
6,878,Science Fiction,19995
7,28,Action,140607
8,12,Adventure,140607
9,878,Science Fiction,140607


In [20]:
prod = pd.json_normalize(data = data, record_path = "production_companies", meta = "id", record_prefix = "comp_")
prod

Unnamed: 0,comp_id,comp_logo_path,comp_name,comp_origin_country,id
0,420,/hUzeosd33nzE5MCNsZxCGEKTXaQ.png,Marvel Studios,US,299534
1,444,/42UPdZl6B2cFXgNUASR8hSt9mpS.png,Dune Entertainment,US,19995
2,574,/iB6GjNVHs5hOqcEYt2rcjBqIjki.png,Lightstorm Entertainment,US,19995
3,25,/qZCc1lty5FzX30aOCVRBLzaVmcp.png,20th Century Fox,US,19995
4,290,/Q8mw2AOQQc8Qg0uNwLWq86DVZv.png,Ingenious Media,GB,19995
5,1634,,Truenorth Productions,IS,140607
6,1,/o86DbpburjxrqAzEDhXZcyE8pDb.png,Lucasfilm,US,140607
7,11461,/p9FoEt5shEKRWRKVIlvFaEmRnun.png,Bad Robot,US,140607
8,420,/hUzeosd33nzE5MCNsZxCGEKTXaQ.png,Marvel Studios,US,299536
9,4,/fycMZt242LVjagMByZOLUGbCvv3.png,Paramount,US,597


In [21]:
con = sq3.connect("movies.db")

In [22]:
con

<sqlite3.Connection at 0x23e746898a0>

In [23]:
movies.to_sql("Movies", con, index = False)

ValueError: Table 'Movies' already exists.

In [25]:
votes.to_sql("Votes", con, index = False)

In [26]:
genres.to_sql("Genres", con, index = False)

ValueError: Table 'Genres' already exists.

In [29]:
prod.to_sql("Prod", con, index = False)

ProgrammingError: Cannot operate on a closed database.

In [30]:
con.execute("Select * FROM sqlite_master").fetchall()

ProgrammingError: Cannot operate on a closed database.

In [27]:
con.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()

[('Genres',), ('Movies',), ('Votes',)]

In [28]:
con.close()

## Loading Data from SQLite Databases into DataFrames

In [54]:
import pandas as pd
import sqlite3 as sq3

In [55]:
con = sq3.connect("movies.db")

In [56]:
con.execute("Select * FROM sqlite_master").fetchall()

[('table',
  'Movies',
  'Movies',
  2,
  'CREATE TABLE "Movies" (\n"id" INTEGER,\n  "title" TEXT,\n  "revenue" REAL,\n  "budget" REAL,\n  "belongs_to_collection_name" TEXT,\n  "release_date" TIMESTAMP\n)'),
 ('table',
  'Votes',
  'Votes',
  3,
  'CREATE TABLE "Votes" (\n"id" INTEGER,\n  "vote_count" INTEGER,\n  "vote_average" REAL\n)'),
 ('table',
  'Genres',
  'Genres',
  4,
  'CREATE TABLE "Genres" (\n"genre_id" INTEGER,\n  "genre_name" TEXT,\n  "id" INTEGER\n)'),
 ('table',
  'Prod',
  'Prod',
  5,
  'CREATE TABLE "Prod" (\n"comp_id" INTEGER,\n  "comp_logo_path" TEXT,\n  "comp_name" TEXT,\n  "comp_origin_country" TEXT,\n  "id" INTEGER\n)')]

In [57]:
pd.read_sql("SELECT * FROM Movies", con)

Unnamed: 0,id,title,revenue,budget,belongs_to_collection_name,release_date
0,299534,Avengers: Endgame,2797.800564,356.0,The Avengers Collection,2019-04-24 00:00:00
1,19995,Avatar,2787.965087,237.0,Avatar Collection,2009-12-10 00:00:00
2,140607,Star Wars: The Force Awakens,2068.223624,245.0,Star Wars Collection,2015-12-15 00:00:00
3,299536,Avengers: Infinity War,2046.239637,300.0,The Avengers Collection,2018-04-25 00:00:00
4,597,Titanic,1845.034188,200.0,,1997-11-18 00:00:00
5,135397,Jurassic World,1671.713208,150.0,Jurassic Park Collection,2015-06-06 00:00:00
6,420818,The Lion King,1656.943394,260.0,,2019-07-12 00:00:00
7,24428,The Avengers,1519.55791,220.0,The Avengers Collection,2012-04-25 00:00:00
8,168259,Furious 7,1515.047671,190.0,The Fast and the Furious Collection,2015-04-01 00:00:00
9,99861,Avengers: Age of Ultron,1405.403694,250.0,The Avengers Collection,2015-04-22 00:00:00


In [58]:
df = pd.read_sql("SELECT * FROM Movies", con, index_col = "id", parse_dates = "release_date")
df

Unnamed: 0_level_0,title,revenue,budget,belongs_to_collection_name,release_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
299534,Avengers: Endgame,2797.800564,356.0,The Avengers Collection,2019-04-24
19995,Avatar,2787.965087,237.0,Avatar Collection,2009-12-10
140607,Star Wars: The Force Awakens,2068.223624,245.0,Star Wars Collection,2015-12-15
299536,Avengers: Infinity War,2046.239637,300.0,The Avengers Collection,2018-04-25
597,Titanic,1845.034188,200.0,,1997-11-18
135397,Jurassic World,1671.713208,150.0,Jurassic Park Collection,2015-06-06
420818,The Lion King,1656.943394,260.0,,2019-07-12
24428,The Avengers,1519.55791,220.0,The Avengers Collection,2012-04-25
168259,Furious 7,1515.047671,190.0,The Fast and the Furious Collection,2015-04-01
99861,Avengers: Age of Ultron,1405.403694,250.0,The Avengers Collection,2015-04-22


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 299534 to 260513
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   title                       18 non-null     object        
 1   revenue                     18 non-null     float64       
 2   budget                      18 non-null     float64       
 3   belongs_to_collection_name  15 non-null     object        
 4   release_date                18 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 864.0+ bytes


In [60]:
genres = pd.read_sql("SELECT * FROM Genres", con, index_col = "id")
genres

Unnamed: 0_level_0,genre_id,genre_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
299534,12,Adventure
299534,878,Science Fiction
299534,28,Action
19995,28,Action
19995,12,Adventure
19995,14,Fantasy
19995,878,Science Fiction
140607,28,Action
140607,12,Adventure
140607,878,Science Fiction


In [61]:
con.close()