In [None]:
# Tamera Fang (ven7sg) - DS2002 Midterm Project 
# This project focuses on extracting, transforming, and loading (ETL) data from multiple sources including MySQL, MongoDB, and CSV files into a data warehouse. 
# In this notebook, we are exploring sales data, customer information, track metadata, and album details from the Chinook database. 
# The goal is to understand how the data from different sources can be merged, cleaned, and analyzed to provide insights like total sales across countries, top customers, and average song lengths. 

In [None]:
# INSTALL LIBRARIES 

In [10]:
!pip install mysql-connector-python
!pip install sqlalchemy
!pip install pymongo[srv]
!pip install pandas
!pip install matplotlib
!pip install cryptography
!pip install PyMySQL

zsh:1: no matches found: pymongo[srv]


In [None]:
# IMPORT LIBRARIES 

In [6]:
import datetime
import json
import os
import warnings
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import mysql.connector
import pandas as pd
import pymongo
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

In [8]:
# Report the versions of the two database client libraries, one per line.
print(
    "Running SQL Alchemy Version: " + sqlalchemy.__version__,
    "Running PyMongo Version: " + pymongo.__version__,
    sep="\n",
)

Running SQL Alchemy Version: 2.0.35
Running PyMongo Version: 4.10.1


In [None]:
# SQL ARGS

In [4]:
# MySQL connection settings for the source `chinook` database.
# Each value can be overridden with an environment variable; the literals are only fallbacks
# so the notebook keeps running unchanged on the original machine.
# NOTE(review): remove the password fallback before sharing or committing this notebook.
host_name = os.environ.get("CHINOOK_MYSQL_HOST", "localhost")
host_ip = "127.0.0.1"
port = "3306"

user_id = os.environ.get("CHINOOK_MYSQL_USER", "root")
pwd = os.environ.get("CHINOOK_MYSQL_PASSWORD", "Passw0rd123")
db_name = os.environ.get("CHINOOK_MYSQL_DB", "chinook")

In [None]:
# TEST CONNECTION

In [14]:
# NOTE(review): cryptography is not referenced below; presumably it is imported only to
# confirm the package is installed for PyMySQL's password authentication - confirm.
import cryptography
import pymysql
print("PyMySQL version:", pymysql.__version__)

# Smoke-test the MySQL connection using the shared settings from the SQL ARGS cell,
# so this check cannot drift from the credentials the rest of the notebook uses.
try:
    connection = pymysql.connect(
        host=host_name,
        user=user_id,
        password=pwd,
        database=db_name
    )
except Exception as e:
    print(f"Connection failed: {e}")
else:
    print("Connection successful!")
    connection.close()

PyMySQL version: 1.0.2
Connection successful!


In [5]:
# IMPORT DATA FROM SQL

In [6]:
# RETRIEVE DATAFRAME FROM SQL
def get_sqldf(host_name, user_id, pwd, db_name, sqlQuery):
    """Run a SQL query against MySQL and return the result as a DataFrame.

    Parameters:
        host_name: MySQL server host.
        user_id: MySQL user name.
        pwd: MySQL password.
        db_name: database (schema) to connect to.
        sqlQuery: SQL text passed to ``pd.read_sql``.

    Returns:
        The query result as a DataFrame, or None when connecting or querying fails
        (the error is printed instead of raised).
    """
    conn = None
    try:
        conn = pymysql.connect(host=host_name, user=user_id, password=pwd, database=db_name)
        return pd.read_sql(sqlQuery, conn)
    except Exception as exc:
        # Keep the best-effort contract (print and return None) but surface the actual cause.
        print(f"Error. Check your connection. ({exc})")
        return None
    finally:
        # Release the connection even when the query itself failed.
        if conn is not None:
            conn.close()

In [7]:
# INVOICE TABLE
# Pull every invoice from the source database, grouped together per customer.
df_invoice = get_sqldf(
    host_name=host_name,
    user_id=user_id,
    pwd=pwd,
    db_name=db_name,
    sqlQuery="SELECT * FROM invoice order by CustomerId;",
)
df_invoice.head()

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,98,1,2010-03-11,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,3.98
1,121,1,2010-06-13,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,3.96
2,143,1,2010-09-15,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,5.94
3,195,1,2011-05-06,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,0.99
4,316,1,2012-10-27,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,1.98


In [8]:
# CUSTOMER TABLE
# Pull every customer record from the source database, ordered by its id.
df_customer = get_sqldf(
    host_name=host_name,
    user_id=user_id,
    pwd=pwd,
    db_name=db_name,
    sqlQuery="SELECT * FROM customer order by CustomerId;",
)
df_customer.head()

Unnamed: 0,CustomerId,FirstName,LastName,Company,Address,City,State,Country,PostalCode,Phone,Fax,Email,SupportRepId
0,1,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
1,2,Leonie,Köhler,,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,+49 0711 2842222,,leonekohler@surfeu.de,5
2,3,François,Tremblay,,1498 rue Bélanger,Montréal,QC,Canada,H2G 1A7,+1 (514) 721-4711,,ftremblay@gmail.com,3
3,4,Bjørn,Hansen,,Ullevålsveien 14,Oslo,,Norway,0171,+47 22 44 22 22,,bjorn.hansen@yahoo.no,4
4,5,František,Wichterlová,JetBrains s.r.o.,Klanova 9/506,Prague,,Czech Republic,14700,+420 2 4172 5555,+420 2 4172 5555,frantisekw@jetbrains.com,4


In [9]:
# FACT TABLE
# Invoices in chronological order; the date-key cell further down relies on this ordering.
# Uses the shared get_sqldf helper instead of repeating the connect/read/close sequence.
df_invoice_fct = get_sqldf(host_name, user_id, pwd, db_name, "SELECT * FROM invoice order by InvoiceDate;")
df_invoice_fct.head()

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,1,2,2009-01-01,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,1.98
1,2,4,2009-01-02,Ullevålsveien 14,Oslo,,Norway,0171,3.96
2,3,8,2009-01-03,Grétrystraat 63,Brussels,,Belgium,1000,5.94
3,4,14,2009-01-06,8210 111 ST NW,Edmonton,AB,Canada,T6G 2C7,8.91
4,5,23,2009-01-11,69 Salem Street,Boston,MA,USA,2113,13.86


In [10]:
# IMPORT DATA FROM MONGODB

In [11]:
# MongoDB / Atlas connection settings.
# Credentials and the cluster name can be overridden with environment variables; the literals
# are only fallbacks so the notebook keeps running unchanged on the original machine.
# NOTE(review): remove the password fallbacks before sharing or committing this notebook.
mysql_uid = os.environ.get("CHINOOK_MYSQL_USER", "root")
mysql_pwd = os.environ.get("CHINOOK_MYSQL_PASSWORD", "Passw0rd123")
atlas_cluster_name = os.environ.get("CHINOOK_ATLAS_CLUSTER", "cluster0.kqsmdw7")
atlas_default_dbname = "local"
atlas_user_name = os.environ.get("CHINOOK_ATLAS_USER", "ven7sg")
atlas_password = os.environ.get("CHINOOK_ATLAS_PASSWORD", "Passw0rd123")
# User name and password are percent-escaped so that characters such as '@', ':' or '/'
# cannot corrupt the connection URI.
conn_str = {
    "local": "mongodb://localhost:27017/",
    "atlas": f"mongodb+srv://{quote_plus(atlas_user_name)}:{quote_plus(atlas_password)}@{atlas_cluster_name}.mongodb.net",
}
src_dbname = "chinook"
dst_dbname = "chinook_dw"
print(f"Local Connection String: {conn_str['local']}")

Local Connection String: mongodb://localhost:27017/


In [19]:
def get_mongo_dataframe(connect_str, db_name, collection, query):
    """Query a MongoDB collection and return the matching documents as a DataFrame.

    Parameters:
        connect_str: MongoDB connection URI.
        db_name: name of the database to read from.
        collection: name of the collection to query.
        query: filter document passed to ``find``.

    Returns:
        A DataFrame with one row per document and the '_id' column removed (empty when
        nothing matches), or None when the lookup fails (the error is printed, not raised).
    """
    client = None
    try:
        client = pymongo.MongoClient(connect_str)
        # Materialise the cursor into a list so pandas can build the frame in one pass.
        documents = list(client[db_name][collection].find(query))
        df = pd.DataFrame(documents)
        # errors='ignore': an empty result has no '_id' column and must not be reported as a failure.
        return df.drop(columns=['_id'], errors='ignore')
    except Exception as exc:
        # Keep the best-effort contract (print and return None) but surface the actual cause.
        print(f'Error: failure to get dataframe in MongoDB ({exc})')
        return None
    finally:
        # Close the client even when the query failed.
        if client is not None:
            client.close()

In [20]:
# Open the Atlas client and select the source database used by the JSON load below.
client = pymongo.MongoClient(conn_str["atlas"])
db = client[src_dbname]
# Folder holding Track.json and the CSV extracts. Set DS2002_DATA_DIR to run the notebook
# on another machine; the fallback is the original author's local path.
data_dir = os.environ.get("DS2002_DATA_DIR", "/Users/tamerafang/Desktop/DS2002_Midterm")

In [21]:
# TRACK TABLE
# Maps the target MongoDB collection name to the JSON file that populates it.
json_files = {"track" : 'Track.json'}

for collection_name, file_name in json_files.items():
    json_path = os.path.join(data_dir, file_name)
    try:
        # Read the file first so an unreadable file cannot leave the collection dropped and empty.
        with open(json_path, 'r') as openfile:
            documents = json.load(openfile)
        db.drop_collection(collection_name)
        # insert_many rejects an empty list, so skip the insert when the file has no documents.
        if documents:
            db[collection_name].insert_many(documents)
        print(f"{collection_name} was successfully loaded.")
    except Exception as exc:
        print(f"Error. Failure to load {json_path} into '{collection_name}': {exc}")

client.close()

Collection(Database(MongoClient(host=['ac-jg6lcfb-shard-00-00.kqsmdw7.mongodb.net:27017', 'ac-jg6lcfb-shard-00-02.kqsmdw7.mongodb.net:27017', 'ac-jg6lcfb-shard-00-01.kqsmdw7.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-tm12uj-shard-0', tls=True), 'chinook'), 'track') was successfully loaded.


In [22]:
# An empty filter selects every document in the "track" collection.
query, collection = {}, "track"

df_track = get_mongo_dataframe(
    connect_str=conn_str['atlas'],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_track.head(2)

Unnamed: 0,__href,MediaType,Genre,Album,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice,PlaylistTrack,InvoiceLine
0,/db/Chinook/Track/TrackId/1.json,{'__href': '/db/Chinook/Track/TrackId/1/MediaT...,{'__href': '/db/Chinook/Track/TrackId/1/Genre....,{'__href': '/db/Chinook/Track/TrackId/1/Album....,1,For Those About To Rock (We Salute You) - TEST,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99,{'__href': '/db/Chinook/Track/TrackId/1/Playli...,{'__href': '/db/Chinook/Track/TrackId/1/Invoic...
1,/db/Chinook/Track/TrackId/2.json,{'__href': '/db/Chinook/Track/TrackId/2/MediaT...,{'__href': '/db/Chinook/Track/TrackId/2/Genre....,{'__href': '/db/Chinook/Track/TrackId/2/Album....,2,Balls to the Wall,2,2,1,,342562,5510424,0.99,{'__href': '/db/Chinook/Track/TrackId/2/Playli...,{'__href': '/db/Chinook/Track/TrackId/2/Invoic...


In [23]:
# IMPORT DATA FROM CSV

In [24]:
# ARTIST TABLE
# Read from the shared data_dir instead of repeating the absolute folder path.
df_artist = pd.read_csv(os.path.join(data_dir, "Artist.csv"))
df_artist.head(5)

Unnamed: 0,ArtistId,Name
0,1,AC/DC
1,2,Accept
2,3,Aerosmith
3,4,Alanis Morissette
4,5,Alice In Chains


In [25]:
# ALBUM TABLE
# Read from the shared data_dir instead of repeating the absolute folder path.
df_album = pd.read_csv(os.path.join(data_dir, "Album.csv"))
df_album.head(5)

Unnamed: 0,AlbumId,Title,ArtistId
0,1,For Those About To Rock We Salute You,1
1,2,Balls to the Wall,2
2,3,Restless and Wild,2
3,4,Let There Be Rock,1
4,5,Big Ones,3


In [26]:
# RENAMING AND DELETING COLUMNS

In [27]:
# Give the artist identifier its warehouse key name.
df_artist = df_artist.rename(columns={"ArtistId": "Artist_Key"})
df_artist.head()

Unnamed: 0,Artist_Key,Name
0,1,AC/DC
1,2,Accept
2,3,Aerosmith
3,4,Alanis Morissette
4,5,Alice In Chains


In [28]:
# Give the album and artist identifiers their warehouse key names.
album_key_names = {"AlbumId": "Album_Key", "ArtistId": "Artist_Key"}
df_album = df_album.rename(columns=album_key_names)
df_album.head()

Unnamed: 0,Album_Key,Title,Artist_Key
0,1,For Those About To Rock We Salute You,1
1,2,Balls to the Wall,2
2,3,Restless and Wild,2
3,4,Let There Be Rock,1
4,5,Big Ones,3


In [29]:
# Rename the identifiers to warehouse key names and remove the billing street/state columns.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are already gone.
df_invoice = (
    df_invoice
    .rename(columns={"InvoiceId": "Invoice_Key", "CustomerId": "Customer_Key"})
    .drop(columns=["BillingAddress", "BillingState"], errors="ignore")
)
df_invoice.head()

Unnamed: 0,Invoice_Key,Customer_Key,InvoiceDate,BillingCity,BillingCountry,BillingPostalCode,Total
0,98,1,2010-03-11,São José dos Campos,Brazil,12227-000,3.98
1,121,1,2010-06-13,São José dos Campos,Brazil,12227-000,3.96
2,143,1,2010-09-15,São José dos Campos,Brazil,12227-000,5.94
3,195,1,2011-05-06,São José dos Campos,Brazil,12227-000,0.99
4,316,1,2012-10-27,São José dos Campos,Brazil,12227-000,1.98


In [30]:
# Rename the identifier to its warehouse key name and remove the customer address columns.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are already gone.
df_customer = (
    df_customer
    .rename(columns={"CustomerId": "Customer_Key"})
    .drop(columns=["Address", "State", "City", "Country", "PostalCode"], errors="ignore")
)
df_customer.head()

Unnamed: 0,Customer_Key,FirstName,LastName,Company,Phone,Fax,Email,SupportRepId
0,1,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
1,2,Leonie,Köhler,,+49 0711 2842222,,leonekohler@surfeu.de,5
2,3,François,Tremblay,,+1 (514) 721-4711,,ftremblay@gmail.com,3
3,4,Bjørn,Hansen,,+47 22 44 22 22,,bjorn.hansen@yahoo.no,4
4,5,František,Wichterlová,JetBrains s.r.o.,+420 2 4172 5555,+420 2 4172 5555,frantisekw@jetbrains.com,4


In [31]:
# Remove the '__href' column and the nested-document columns, then rename the identifiers
# to warehouse key names.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are already gone.
df_track = (
    df_track
    .drop(columns=['__href', 'MediaType', 'Genre', 'Album', 'PlaylistTrack', 'InvoiceLine'], errors="ignore")
    .rename(columns={
        "TrackId": "Track_Key",
        "AlbumId": "Album_Key",
        "MediaTypeId": "MediaType_Key",
        "GenreId": "Genre_Key",
    })
)
df_track.head()

Unnamed: 0,Track_Key,Name,Album_Key,MediaType_Key,Genre_Key,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You) - TEST,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99
3,4,Restless and Wild,3,2,1,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",252051,4331779,0.99
4,5,Princess of the Dawn,3,2,1,Deaffy & R.A. Smith-Diesel,375418,6290521,0.99


In [32]:
# MERGING TABLES TO MAKE DIMENSION TABLES

In [33]:
# MERGE df_customer and df_invoice

In [34]:
# One row per invoice, enriched with the details of the customer who placed it.
dim_customers = df_invoice.merge(df_customer, how="inner", on="Customer_Key")
dim_customers.head()

Unnamed: 0,Invoice_Key,Customer_Key,InvoiceDate,BillingCity,BillingCountry,BillingPostalCode,Total,FirstName,LastName,Company,Phone,Fax,Email,SupportRepId
0,98,1,2010-03-11,São José dos Campos,Brazil,12227-000,3.98,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
1,121,1,2010-06-13,São José dos Campos,Brazil,12227-000,3.96,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
2,143,1,2010-09-15,São José dos Campos,Brazil,12227-000,5.94,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
3,195,1,2011-05-06,São José dos Campos,Brazil,12227-000,0.99,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
4,316,1,2012-10-27,São José dos Campos,Brazil,12227-000,1.98,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3


In [35]:
# MERGE df_artist and df_album

In [36]:
# One row per album, enriched with the name of its artist.
dim_artstats = df_album.merge(df_artist, how="inner", on="Artist_Key")
dim_artstats.head()

Unnamed: 0,Album_Key,Title,Artist_Key,Name
0,1,For Those About To Rock We Salute You,1,AC/DC
1,4,Let There Be Rock,1,AC/DC
2,2,Balls to the Wall,2,Accept
3,3,Restless and Wild,2,Accept
4,5,Big Ones,3,Aerosmith


In [37]:
# MERGE df_album and df_track

In [38]:
# One row per track, enriched with the title and artist key of its album.
dim_alb_track = df_track.merge(df_album, how="inner", on="Album_Key")
dim_alb_track.head()

Unnamed: 0,Track_Key,Name,Album_Key,MediaType_Key,Genre_Key,Composer,Milliseconds,Bytes,UnitPrice,Title,Artist_Key
0,1,For Those About To Rock (We Salute You) - TEST,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99,For Those About To Rock We Salute You,1
1,6,Put The Finger On You,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",205662,6713451,0.99,For Those About To Rock We Salute You,1
2,7,Let's Get It Up,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",233926,7636561,0.99,For Those About To Rock We Salute You,1
3,8,Inject The Venom,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",210834,6852860,0.99,For Those About To Rock We Salute You,1
4,9,Snowballed,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",203102,6599424,0.99,For Those About To Rock We Salute You,1


In [39]:
# LOAD DIMENSION TABLES BACK INTO SQL

In [40]:
# LOAD dim_customers INTO THE chinook_dw SCHEMA IN MYSQL, DECLARE PK

In [41]:
# Name of the MySQL schema that the dimension and fact tables below are written to and read back from.
dw_name = 'chinook_dw'

In [42]:
conn_string = f'mysql+pymysql://{user_id}:{pwd}@{host_name}/{dw_name}'
sqlEngine = create_engine(conn_string, pool_recycle = 3600)
# Engine.execute() no longer exists in SQLAlchemy 2.x, so the DDL runs on the connection as a text() construct.
# engine.begin() commits on success, rolls back on error and always releases the connection.
with sqlEngine.begin() as connection:
    dim_customers.to_sql('dim_customers', connection, index=False, if_exists='replace')
    # dim_customers holds one row per invoice, so Invoice_Key is the primary key.
    connection.execute(sqlalchemy.text("ALTER TABLE dim_customers ADD PRIMARY KEY (Invoice_Key);"))

In [43]:
# LOAD dim_artstats INTO THE chinook_dw SCHEMA IN MYSQL, DECLARE PK

In [44]:
conn_string = f'mysql+pymysql://{user_id}:{pwd}@{host_name}/{dw_name}'
sqlEngine = create_engine(conn_string, pool_recycle = 3600)
# Engine.execute() no longer exists in SQLAlchemy 2.x, so the DDL runs on the connection as a text() construct.
# engine.begin() commits on success, rolls back on error and always releases the connection.
with sqlEngine.begin() as connection:
    dim_artstats.to_sql('dim_artstats', connection, index=False, if_exists='replace')
    # The table name must match the one written above exactly ('dim_artstats', not 'dim_artStats'):
    # MySQL table names are case-sensitive on case-sensitive file systems.
    connection.execute(sqlalchemy.text("ALTER TABLE dim_artstats ADD PRIMARY KEY (Album_Key);"))

In [45]:
# LOAD dim_alb_track INTO THE chinook_dw SCHEMA IN MYSQL, DECLARE PK

In [46]:
conn_string = f'mysql+pymysql://{user_id}:{pwd}@{host_name}/{dw_name}'
sqlEngine = create_engine(conn_string, pool_recycle = 3600)
# Engine.execute() no longer exists in SQLAlchemy 2.x, so the DDL runs on the connection as a text() construct.
# engine.begin() commits on success, rolls back on error and always releases the connection.
with sqlEngine.begin() as connection:
    dim_alb_track.to_sql('dim_alb_track', connection, index=False, if_exists='replace')
    # dim_alb_track holds one row per track, so Track_Key is the primary key.
    connection.execute(sqlalchemy.text("ALTER TABLE dim_alb_track ADD PRIMARY KEY (Track_Key);"))

In [47]:
# VERIFY dim_customers IS IN chinook_dw IN SQL

In [48]:
# Read the table back from the warehouse schema through the shared get_sqldf helper
# instead of repeating the connect/read/close sequence.
df_dim_customers = get_sqldf(host_name, user_id, pwd, dw_name, "SELECT * FROM dim_customers order by Invoice_Key;")
df_dim_customers.head()

Unnamed: 0,Invoice_Key,Customer_Key,InvoiceDate,BillingCity,BillingCountry,BillingPostalCode,Total,FirstName,LastName,Company,Phone,Fax,Email,SupportRepId
0,1,2,2009-01-01,Stuttgart,Germany,70174,1.98,Leonie,Köhler,,+49 0711 2842222,,leonekohler@surfeu.de,5
1,2,4,2009-01-02,Oslo,Norway,0171,3.96,Bjørn,Hansen,,+47 22 44 22 22,,bjorn.hansen@yahoo.no,4
2,3,8,2009-01-03,Brussels,Belgium,1000,5.94,Daan,Peeters,,+32 02 219 03 03,,daan_peeters@apple.be,4
3,4,14,2009-01-06,Edmonton,Canada,T6G 2C7,8.91,Mark,Philips,Telus,+1 (780) 434-4554,+1 (780) 434-5565,mphilips12@shaw.ca,5
4,5,23,2009-01-11,Boston,USA,2113,13.86,John,Gordon,,+1 (617) 522-1333,,johngordon22@yahoo.com,4


In [49]:
# VERIFY dim_artstats IS IN chinook_dw IN SQL

In [50]:
# Read the table back from the warehouse schema through the shared get_sqldf helper
# instead of repeating the connect/read/close sequence.
df_dim_artstats = get_sqldf(host_name, user_id, pwd, dw_name, "SELECT * FROM dim_artstats order by Album_Key;")
df_dim_artstats.head()

Unnamed: 0,Album_Key,Title,Artist_Key,Name
0,1,For Those About To Rock We Salute You,1,AC/DC
1,2,Balls to the Wall,2,Accept
2,3,Restless and Wild,2,Accept
3,4,Let There Be Rock,1,AC/DC
4,5,Big Ones,3,Aerosmith


In [51]:
# VERIFY dim_alb_track IS IN chinook_dw IN SQL

In [52]:
# Read the table back from the warehouse schema through the shared get_sqldf helper
# instead of repeating the connect/read/close sequence.
df_dim_alb_track = get_sqldf(host_name, user_id, pwd, dw_name, "SELECT * FROM dim_alb_track order by Track_Key;")
df_dim_alb_track.head()

Unnamed: 0,Track_Key,Name,Album_Key,MediaType_Key,Genre_Key,Composer,Milliseconds,Bytes,UnitPrice,Title,Artist_Key
0,1,For Those About To Rock (We Salute You) - TEST,1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99,For Those About To Rock We Salute You,1
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99,Balls to the Wall,2
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99,Restless and Wild,2
3,4,Restless and Wild,3,2,1,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",252051,4331779,0.99,Restless and Wild,2
4,5,Princess of the Dawn,3,2,1,Deaffy & R.A. Smith-Diesel,375418,6290521,0.99,Restless and Wild,2


In [53]:
# DATE DIMENSION, FACT TABLE

In [54]:
# CREATE DATE KEYS
# df_invoice_fct is ordered by InvoiceDate, so the key starts at 0 and increases by one each
# time the date differs from the previous row; invoices sharing a date share a key.
# The first row compares against a missing value, counts as a change, and therefore gets key 0.
# This replaces a hand-written loop that contained a no-op `counter == counter` comparison and
# raised a KeyError for frames with fewer than two rows.
invoice_dates = df_invoice_fct['InvoiceDate']
df_invoice_fct['DateKey'] = invoice_dates.ne(invoice_dates.shift()).cumsum() - 1

# DROP DATE COLUMN
# NOTE(review): no cell in this notebook writes a table mapping DateKey back to a calendar date,
# so the date itself is lost once this column is dropped - confirm that is intended.
df_invoice_fct = df_invoice_fct.drop(columns=['InvoiceDate'])

# FINAL FACT TABLE WITH DATE DIMENSION 
df_invoice_fct.head(30)

Unnamed: 0,InvoiceId,CustomerId,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total,DateKey
0,1,2,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,1.98,0
1,2,4,Ullevålsveien 14,Oslo,,Norway,0171,3.96,1
2,3,8,Grétrystraat 63,Brussels,,Belgium,1000,5.94,2
3,4,14,8210 111 ST NW,Edmonton,AB,Canada,T6G 2C7,8.91,3
4,5,23,69 Salem Street,Boston,MA,USA,2113,13.86,4
5,6,37,Berger Straße 10,Frankfurt,,Germany,60316,0.99,5
6,7,38,Barbarossastraße 19,Berlin,,Germany,10779,1.98,6
7,8,40,"8, Rue Hanovre",Paris,,France,75002,1.98,6
8,9,42,"9, Place Louis Barthou",Bordeaux,,France,33000,3.96,7
9,10,46,3 Chatham Street,Dublin,Dublin,Ireland,,5.94,8


In [55]:
# INSERT FINAL FACT TABLE INTO SQL, DECLARE PK 

In [56]:
invoice_fact = df_invoice_fct
conn_string = f'mysql+pymysql://{user_id}:{pwd}@{host_name}/{dw_name}'
sqlEngine = create_engine(conn_string, pool_recycle = 3600)
# Engine.execute() no longer exists in SQLAlchemy 2.x, so the DDL runs on the connection as a text() construct.
# engine.begin() commits on success, rolls back on error and always releases the connection.
with sqlEngine.begin() as connection:
    invoice_fact.to_sql('invoice_fact', connection, index=False, if_exists='replace')
    # The fact table holds one row per invoice, so InvoiceId is the primary key.
    connection.execute(sqlalchemy.text("ALTER TABLE invoice_fact ADD PRIMARY KEY (InvoiceId);"))

In [57]:
# VERIFY fact table IS IN chinook_dw IN SQL

In [58]:
# Read the fact table back from the warehouse schema through the shared get_sqldf helper
# instead of repeating the connect/read/close sequence.
invoice_fact = get_sqldf(host_name, user_id, pwd, dw_name, "SELECT * FROM invoice_fact order by InvoiceId;")
invoice_fact.head()

Unnamed: 0,InvoiceId,CustomerId,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total,DateKey
0,1,2,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,1.98,0
1,2,4,Ullevålsveien 14,Oslo,,Norway,0171,3.96,1
2,3,8,Grétrystraat 63,Brussels,,Belgium,1000,5.94,2
3,4,14,8210 111 ST NW,Edmonton,AB,Canada,T6G 2C7,8.91,3
4,5,23,69 Salem Street,Boston,MA,USA,2113,13.86,4


In [59]:
# QUERIES - ANALYSIS 

In [60]:
# TOTAL SALES FROM DIFFERENT COUNTRIES 

In [62]:
# Total invoice amount per billing country, computed in the warehouse.
# Uses the shared get_sqldf helper instead of repeating the connect/read/close sequence.
df_sales = get_sqldf(host_name, user_id, pwd, dw_name, "select sum(Total),BillingCountry from chinook_dw.dim_customers group by BillingCountry;")
df_sales

Unnamed: 0,sum(Total),BillingCountry
0,156.48,Germany
1,39.62,Norway
2,37.62,Belgium
3,303.96,Canada
4,523.06,USA
5,195.1,France
6,45.62,Ireland
7,112.86,United Kingdom
8,37.62,Australia
9,46.62,Chile


In [63]:
# TOP 3 CUSTOMERS BASED ON SALES 

In [64]:
# Total invoice amount per customer, largest first; head(3) then shows the top three customers.
# Uses the shared get_sqldf helper instead of repeating the connect/read/close sequence.
df_customer_sales = get_sqldf(host_name, user_id, pwd, dw_name, "select sum(Total),CustomerId from chinook_dw.invoice_fact group by CustomerId order by sum(Total) desc;")
df_customer_sales.head(3)

Unnamed: 0,sum(Total),CustomerId
0,49.62,6
1,47.62,26
2,46.62,57


In [65]:
# AVG LENGTH OF SONG, BASED ON COMPOSER

In [66]:
# Average track length in milliseconds per composer; tracks without a composer form their own group.
# Uses the shared get_sqldf helper instead of repeating the connect/read/close sequence.
df_length = get_sqldf(host_name, user_id, pwd, dw_name, "select avg(Milliseconds),Composer from chinook_dw.dim_alb_track group by Composer;")
df_length.head()

Unnamed: 0,avg(Milliseconds),Composer
0,240041.5,"Angus Young, Malcolm Young, Brian Johnson"
1,342562.0,
2,230619.0,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho..."
3,252051.0,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D..."
4,375418.0,Deaffy & R.A. Smith-Diesel
