# Team: Omni Oracle
## ETL - Top 250 Movies

# Step 3: Loading process

With all the data extracted and cleaned, we are going to load it into a relational PostgreSQL database (managed through pgAdmin). We use the SQLAlchemy module to connect to the database and send our queries from this notebook. Even though we only have two DataFrames here, we build a relational schema so that the data can be split across multiple linked tables.

The tables that we are going to make are:
1. movies - Contains unique movies and their IDs
2. actors - Contains unique actors and their IDs
3. genres - Contains unique genres and their IDs
4. movies_actors - Referential table connecting movies and actors
5. movies_genres - Referential table connecting movies and genres

In [None]:
!pip install SQLAlchemy

In [None]:
# to manage json data
import json

# for pandas dataframes
import pandas as pd

import sqlalchemy as db
from sqlalchemy_utils import create_database
from sqlalchemy_utils import database_exists
from sqlalchemy.types import Integer, String, Text, ARRAY, VARCHAR, Date, Numeric, BigInteger

## This is the main function for loading the DataFrames into the database and making the other tables.

We establish a connection to the PostgreSQL server and create the database and the tables. After that, we load our DataFrames and create the referential tables that link the tables in the database.

In [None]:
def load_data(movies_df, actors_df, database_name='omni_oracle_movies_db'):
    """Load the movies and actors DataFrames into PostgreSQL and build the relational tables.

    The database is created if it does not exist yet. The ``movies`` and
    ``actors`` tables are (re)written from the DataFrames, then ``genres``,
    ``movies_genres`` and ``movies_actors`` are derived from the ``genre`` and
    ``actors`` array columns of ``movies``. Finally those two array columns
    are dropped from ``movies`` since the junction tables replace them.

    The function can be run repeatedly: the junction tables are dropped up
    front and rebuilt, and genres that already exist are kept.

    Args:
        movies_df: DataFrame written to the ``movies`` table. The SQL below
            needs an ``id`` column (unique, non-null, becomes the primary key)
            and the list-valued columns ``genre`` and ``actors``.
        actors_df: DataFrame written to the ``actors`` table. Needs an ``id``
            column (unique, non-null) and a ``name`` column, which is matched
            against the entries of ``movies.actors``.
        database_name: Name of the PostgreSQL database to create/use.

    Raises:
        Any exception raised by SQLAlchemy, pandas or the database driver is
        propagated; the cursor, connection and engine are released first.
    """
    # NOTE(review): credentials are hard-coded (user postgres, password admin);
    # consider reading them from the environment before sharing this notebook.
    db_url = f'postgresql://postgres:admin@localhost:5432/{database_name}'
    engine = db.create_engine(db_url)
    if database_exists(db_url):
        print(f"{database_name} already exists.")
    else:
        print(f"Creating database: {database_name}")
        create_database(engine.url)

    # Base tables. The junction tables are dropped first: they hold foreign
    # keys to movies/actors, and pandas' if_exists='replace' below has to drop
    # those tables, which PostgreSQL refuses while dependents exist.
    # movies/actors are created here only so they exist; to_sql replaces them.
    setup_sql = '''
                DROP TABLE IF EXISTS movies_genres;
                DROP TABLE IF EXISTS movies_actors;

                CREATE TABLE IF NOT EXISTS movies(id SERIAL PRIMARY KEY,
                                                    title TEXT,
                                                    year INT,
                                                    certification VARCHAR(10),
                                                    release_date DATE,
                                                    runtime INT,
                                                    genre VARCHAR(20)[],
                                                    description TEXT,
                                                    language VARCHAR(50)[],
                                                    country VARCHAR(60)[],
                                                    directors TEXT[],
                                                    actors TEXT[],
                                                    oscars INT,
                                                    winnings INT,
                                                    nominations INT,
                                                    ratings NUMERIC,
                                                    num_of_votes BIGINT,
                                                    revenue BIGINT,
                                                    budget BIGINT);

                CREATE TABLE IF NOT EXISTS actors(id SERIAL PRIMARY KEY,
                                                 name TEXT,
                                                 date_of_birth DATE,
                                                 date_of_death DATE,
                                                 gender VARCHAR(6),
                                                 num_of_acting_credits INT);

                CREATE TABLE IF NOT EXISTS genres (
                    id SERIAL PRIMARY KEY,
                    genre_name VARCHAR(20) UNIQUE NOT NULL
                );
                '''

    # Keys and junction tables, built after the DataFrames are loaded.
    # to_sql creates the tables without constraints, so the primary keys are
    # added here. DISTINCT guards the composite primary keys against a movie
    # listing the same genre/actor twice; ON CONFLICT keeps genres that are
    # already present from an earlier run (requires PostgreSQL 9.5+).
    relations_sql = '''
                ALTER TABLE movies
                ADD PRIMARY KEY (id);

                ALTER TABLE actors
                ADD PRIMARY KEY (id);

                CREATE TABLE movies_genres (
                    movie_id INT REFERENCES movies(id),
                    genre_id INT REFERENCES genres(id),
                    PRIMARY KEY (movie_id, genre_id)
                );

                INSERT INTO genres (genre_name)
                SELECT DISTINCT UNNEST(genre) AS genre_name
                FROM movies
                ORDER BY genre_name
                ON CONFLICT (genre_name) DO NOTHING;

                INSERT INTO movies_genres (movie_id, genre_id)
                SELECT DISTINCT
                    m.id,
                    g.id
                FROM
                    movies m,
                    UNNEST(m.genre) AS genre
                JOIN
                    genres g
                ON g.genre_name = genre;

                CREATE TABLE movies_actors (
                    movie_id INT REFERENCES movies(id),
                    actor_id INT REFERENCES actors(id),
                    PRIMARY KEY (movie_id, actor_id)
                );

                INSERT INTO movies_actors (movie_id, actor_id)
                SELECT DISTINCT
                    m.id,
                    a.id
                FROM
                    movies m,
                    UNNEST(m.actors) AS actor_name
                JOIN
                    actors a
                ON a.name = actor_name;

                ALTER TABLE movies
                DROP COLUMN actors, DROP COLUMN genre;
                '''

    movies_dtypes = {
        'id': Integer, 'title': Text, 'year': Integer,
        'certification': VARCHAR(10), 'release_date': Date, 'runtime': Integer,
        'genre': ARRAY(VARCHAR(20)), 'description': Text,
        'language': ARRAY(VARCHAR(50)), 'country': ARRAY(VARCHAR(60)),
        'directors': ARRAY(Text), 'actors': ARRAY(Text), 'oscars': Integer,
        'winnings': Integer, 'nominations': Integer, 'ratings': Numeric,
        'num_of_votes': BigInteger, 'revenue': BigInteger, 'budget': BigInteger,
    }
    actors_dtypes = {
        'id': Integer, 'name': Text, 'date_of_birth': Date,
        'date_of_death': Date, 'gender': VARCHAR(6),
        'num_of_acting_credits': Integer,
    }

    conn = engine.raw_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(setup_sql)
            # Commit before to_sql: pandas uses its own connection and must
            # not wait on locks held by this one.
            conn.commit()

            # Note: 'replace' drops and recreates the tables from the DataFrames.
            movies_df.to_sql(name='movies', con=engine, if_exists='replace',
                             index=False, dtype=movies_dtypes)
            actors_df.to_sql(name='actors', con=engine, if_exists='replace',
                             index=False, dtype=actors_dtypes)

            cur.execute(relations_sql)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing discards any uncommitted work if a step above failed.
        conn.close()
        # Dispose last, once nothing uses the engine's connection pool anymore.
        engine.dispose()