# Data Project 1 - NoSQL

**Name:** Victoria Ok (vyo7tv) <br>
**Database Used:** Sakila (https://dev.mysql.com/doc/sakila/en/)

### Set Up
- libraries

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, text

In [2]:
# Global connection settings for the MySQL server and the source/destination schemas.

host_name = "localhost"
host_ip = "127.0.0.1"
port = "3306"
user_id = "root"
# Credentials must not be stored in the notebook. The password is read from
# the SAKILA_DB_PWD environment variable (set it before starting Jupyter).
# If the variable is unset the password is empty, so connecting to a
# password-protected account fails with an authentication error.
pwd = os.environ.get("SAKILA_DB_PWD", "")

src_dbname = "sakila"
dst_dbname = "sakila_dw"

In [3]:
# get and set data in database

def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return the rows as a DataFrame.

    Args:
        user_id: MySQL user name.
        pwd: Password for ``user_id``.
        host_name: Host name of the MySQL server.
        db_name: Name of the database (schema) to connect to.
        sql_query: Query handed unchanged to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame holding the result of ``sql_query``.
    """
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # are not mistaken for URL delimiters when the connection string is parsed.
    safe_user = quote(user_id, safe="")
    safe_pwd = quote(pwd, safe="")
    conn_str = f"mysql+pymysql://{safe_user}:{safe_pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even when the query raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A fresh engine is created on every call, so release its pool too.
        sqlEngine.dispose()


# write a DataFrame to a table: "insert" replaces the table and adds a primary key, "update" appends rows
def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a table in a MySQL database.

    Args:
        user_id: MySQL user name.
        pwd: Password for ``user_id``.
        host_name: Host name of the MySQL server.
        db_name: Name of the database (schema) to connect to.
        df: DataFrame whose rows are written (its index is not written).
        table_name: Target table.
        pk_column: Column declared as primary key after an "insert".
        db_operation: "insert" replaces the table with ``df`` and then adds
            ``pk_column`` as its primary key; "update" appends the rows of
            ``df`` to the table.

    Raises:
        ValueError: If ``db_operation`` is neither "insert" nor "update".
    """
    # Reject unknown operations up front instead of silently doing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # are not mistaken for URL delimiters when the connection string is parsed.
    safe_user = quote(user_id, safe="")
    safe_pwd = quote(pwd, safe="")
    conn_str = f"mysql+pymysql://{safe_user}:{safe_pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # begin() yields a connection inside a transaction that is committed
        # on success, rolled back on error, and always closed afterwards.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                # (Re)create the table from the DataFrame ...
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # ... then declare the primary key on that same connection.
                # Engine.execute() was removed in SQLAlchemy 2.0; text() keeps
                # the raw statement valid on both 1.x and 2.x.
                connection.execute(
                    text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
                )
            else:
                # "update": append the rows to the existing table.
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A fresh engine is created on every call, so release its pool too.
        sqlEngine.dispose()

### Load and Transform data from CSV files

In [4]:
# Locate the CSV export of the actor table and load it into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'sakila-csv-data')
data_file = os.path.join(data_dir, 'actor.csv')

# First row is the header; no column is used as the index.
df_actors = pd.read_csv(
    data_file,
    index_col=False,
    header=0,
)
df_actors.head(5)

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,PENELOPE,GUINESS,2006-02-15 04:34:33
1,2,NICK,WAHLBERG,2006-02-15 04:34:33
2,3,ED,CHASE,2006-02-15 04:34:33
3,4,JENNIFER,DAVIS,2006-02-15 04:34:33
4,5,JOHNNY,LOLLOBRIGIDA,2006-02-15 04:34:33


In [5]:
# Remove the audit timestamp and rename the id column to the "_key" convention.
df_actors = (
    df_actors
    .drop(columns=['last_update'])
    .rename(columns={"actor_id": "actor_key"})
)
df_actors.head(2)

Unnamed: 0,actor_key,first_name,last_name
0,1,PENELOPE,GUINESS
1,2,NICK,WAHLBERG


In [6]:
# Locate the CSV export of the film table and load it into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'sakila-csv-data')
data_file = os.path.join(data_dir, 'film.csv')

# First row is the header; no column is used as the index.
df_film = pd.read_csv(
    data_file,
    index_col=False,
    header=0,
)
df_film.head(5)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42


In [7]:
# Drop the columns that are not carried forward and rename the id columns
# to the "_key" convention.
unused_film_columns = ['description', 'original_language_id', 'last_update']
df_film = (
    df_film
    .drop(columns=unused_film_columns)
    .rename(columns={"film_id": "film_key", "language_id": "language_key"})
)
df_film.head(5)

Unnamed: 0,film_key,title,release_year,language_key,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
1,2,ACE GOLDFINGER,2006,1,3,4.99,48,12.99,G,"Trailers,Deleted Scenes"
2,3,ADAPTATION HOLES,2006,1,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes"
3,4,AFFAIR PREJUDICE,2006,1,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes"
4,5,AFRICAN EGG,2006,1,6,2.99,130,22.99,G,Deleted Scenes


In [8]:
# Locate the CSV export of the film_actor link table and load it into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'sakila-csv-data')
data_file = os.path.join(data_dir, 'film_actor.csv')

# First row is the header; no column is used as the index.
df_film_actor = pd.read_csv(
    data_file,
    index_col=False,
    header=0,
)
df_film_actor.head(5)

Unnamed: 0,actor_id,film_id,last_update
0,1,1,2006-02-15 05:05:03
1,1,23,2006-02-15 05:05:03
2,1,25,2006-02-15 05:05:03
3,1,106,2006-02-15 05:05:03
4,1,140,2006-02-15 05:05:03


In [9]:
# Remove the audit timestamp and rename both id columns to the "_key" convention.
df_film_actor = (
    df_film_actor
    .drop(columns=['last_update'])
    .rename(columns={"actor_id": "actor_key", "film_id": "film_key"})
)
df_film_actor.head(5)

Unnamed: 0,actor_key,film_key
0,1,1
1,1,23
2,1,25
3,1,106
4,1,140


In [10]:
# Locate the CSV export of the language table and load it into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'sakila-csv-data')
data_file = os.path.join(data_dir, 'language.csv')

# First row is the header; no column is used as the index.
df_language = pd.read_csv(
    data_file,
    index_col=False,
    header=0,
)
df_language.head(5)

Unnamed: 0,language_id,name,last_update
0,1,English,2006-02-15 05:02:19
1,2,Italian,2006-02-15 05:02:19
2,3,Japanese,2006-02-15 05:02:19
3,4,Mandarin,2006-02-15 05:02:19
4,5,French,2006-02-15 05:02:19


In [11]:
# Remove the audit timestamp and rename the id column to the "_key" convention.
df_language = (
    df_language
    .drop(columns=['last_update'])
    .rename(columns={"language_id": "language_key"})
)
df_language.head(5)

Unnamed: 0,language_key,name
0,1,English
1,2,Italian
2,3,Japanese
3,4,Mandarin
4,5,French


In [12]:
# Attach the actor names to every film/actor pair; the join key is not
# needed afterwards, so it is dropped.
df_film_actor = (
    df_film_actor
    .merge(df_actors, how='inner', on='actor_key')
    .drop(columns=['actor_key'])
)

df_film_actor.head(5)

Unnamed: 0,film_key,first_name,last_name
0,1,PENELOPE,GUINESS
1,23,PENELOPE,GUINESS
2,25,PENELOPE,GUINESS
3,106,PENELOPE,GUINESS
4,140,PENELOPE,GUINESS


In [13]:
# Attach the film attributes to every row; the join key is not needed
# afterwards, so it is dropped.
df_film_actor = (
    df_film_actor
    .merge(df_film, how='inner', on='film_key')
    .drop(columns=['film_key'])
)

df_film_actor.head(5)

Unnamed: 0,first_name,last_name,title,release_year,language_key,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,PENELOPE,GUINESS,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
1,CHRISTIAN,GABLE,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
2,LUCILLE,TRACY,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
3,SANDRA,PECK,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
4,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2006,1,3,0.99,92,9.99,R,"Trailers,Deleted Scenes"


In [14]:
# Replace the language key with the language name by joining the language
# table and then dropping the key.
df_film_actor = (
    df_film_actor
    .merge(df_language, how='inner', on='language_key')
    .drop(columns=['language_key'])
)

df_film_actor.head(5)

Unnamed: 0,first_name,last_name,title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,name
0,PENELOPE,GUINESS,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
1,CHRISTIAN,GABLE,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
2,LUCILLE,TRACY,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
3,SANDRA,PECK,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
4,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2006,3,0.99,92,9.99,R,"Trailers,Deleted Scenes",English


In [15]:
# Give the merged columns names that say which source table they came from.
df_film_actor = df_film_actor.rename(
    columns={
        "first_name": "actor_first_name",
        "last_name": "actor_last_name",
        "title": "film_title",
        "name": "language",
    }
)
df_film_actor.head(5)

Unnamed: 0,actor_first_name,actor_last_name,film_title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,language
0,PENELOPE,GUINESS,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
1,CHRISTIAN,GABLE,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
2,LUCILLE,TRACY,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
3,SANDRA,PECK,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
4,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2006,3,0.99,92,9.99,R,"Trailers,Deleted Scenes",English


In [16]:
# Surrogate primary key: number the rows 1..N and place the column first.
df_film_actor.insert(
    loc=0,
    column="film_actor_key",
    value=range(1, len(df_film_actor) + 1),
)
df_film_actor.head(5)

Unnamed: 0,film_actor_key,actor_first_name,actor_last_name,film_title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,language
0,1,PENELOPE,GUINESS,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
1,2,CHRISTIAN,GABLE,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
2,3,LUCILLE,TRACY,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
3,4,SANDRA,PECK,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
4,5,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2006,3,0.99,92,9.99,R,"Trailers,Deleted Scenes",English


In [18]:
# Write the denormalised film/actor frame to the destination database,
# (re)creating the table and declaring its primary key.
table_name = "dim_film_actor"
primary_key = "film_actor_key"
db_operation = "insert"

set_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=dst_dbname,
    df=df_film_actor,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)