#    ETL PROJECT
### Spotify x Billboard Music Charts
#### Todd Tuchek, Lesly Sok and Raul Villa

In [1]:
# Dependencies
import pandas as pd
import sqlalchemy
import numpy as np
from sqlalchemy import create_engine
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

## Extract
### Resources: https://www.kaggle.com/danield2255/data-on-songs-from-billboard-19992019 

In [2]:
# Import Spotify Data
spotify_data = "spotifyWeeklyTop200Streams_cleaned12.6.csv"

spotify_df = pd.read_csv(spotify_data)
spotify_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Artist,Features,Streams,Week
0,0,In My Feelings,Drake,,30747676,7/20/2018
1,1,Lucid Dreams,Juice WRLD,,12930705,7/20/2018
2,2,Nonstop,Drake,,12312859,7/20/2018
3,3,God is a woman,Ariana Grande,,10771324,7/20/2018
4,4,SAD!,XXXTENTACION,,10503061,7/20/2018


In [3]:
# Import the data from Billboard
billboard_data = "billboardHot100_1999-2019_cleaned_12.6.csv"

billboard_df = pd.read_csv(billboard_data)
billboard_df.head()

ParserError: Error tokenizing data. C error: Expected 11 fields in line 3, saw 14


## Transform 

In [4]:
# Drop Features Column in Spotify DataFrame
spotify_df = spotify_df.drop(columns = ['Features', 'Unnamed: 0'])
spotify_df.head()

Unnamed: 0,Name,Artist,Streams,Week
0,In My Feelings,Drake,30747676,7/20/2018
1,Lucid Dreams,Juice WRLD,12930705,7/20/2018
2,Nonstop,Drake,12312859,7/20/2018
3,God is a woman,Ariana Grande,10771324,7/20/2018
4,SAD!,XXXTENTACION,10503061,7/20/2018


In [5]:
# Adjust the Spotify DataFrame so the start of each week lines up with Billboard
from datetime import timedelta
spotify_df["Week"] = pd.to_datetime(spotify_df["Week"])
spotify_df["one_day_ahead"] = spotify_df["Week"].apply(lambda x: x + timedelta(days=1))
spotify_df["one_day_ahead"] = spotify_df["one_day_ahead"].dt.strftime("%m/%d/%y")
spotify_df = spotify_df.drop(columns = ["Week"])
spotify_df.head()

Unnamed: 0,Name,Artist,Streams,one_day_ahead
0,In My Feelings,Drake,30747676,07/21/18
1,Lucid Dreams,Juice WRLD,12930705,07/21/18
2,Nonstop,Drake,12312859,07/21/18
3,God is a woman,Ariana Grande,10771324,07/21/18
4,SAD!,XXXTENTACION,10503061,07/21/18


In [6]:
# Rename "Columns"
spotify_df.rename(columns = {'Name': 'Song Name',
                             'Streams': 'Weekly Streams',
                             'one_day_ahead': 'Week of'}, inplace = True)
spotify_df.head()

Unnamed: 0,Song Name,Artist,Weekly Streams,Week of
0,In My Feelings,Drake,30747676,07/21/18
1,Lucid Dreams,Juice WRLD,12930705,07/21/18
2,Nonstop,Drake,12312859,07/21/18
3,God is a woman,Ariana Grande,10771324,07/21/18
4,SAD!,XXXTENTACION,10503061,07/21/18


In [7]:
billboard_df.head()

Unnamed: 0,Artists,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,Features
0,17,NC,41,,,8/18/2018,8/3/2018,"Trap,Rap","Ryan meyer, Rich meyer, Johnny stevens, Allen ...","NC-17 \nOoh\nMe and my bitch, I swear we like ...",
1,800,1,45,3.0,42.0,2/24/2018,4/28/2017,"Hip-Hop,Canada,DMV,Pop,Rap","Dylan wiggins, Andrew taggart, 6ix, Alessia ca...",1-800-273-8255 \nI've been on the low\nI been ...,
2,800,1,40,3.0,41.0,2/17/2018,4/28/2017,"Hip-Hop,Canada,DMV,Pop,Rap","Dylan wiggins, Andrew taggart, 6ix, Alessia ca...",1-800-273-8255 \nI've been on the low\nI been ...,
3,800,1,33,3.0,40.0,2/10/2018,4/28/2017,"Hip-Hop,Canada,DMV,Pop,Rap","Dylan wiggins, Andrew taggart, 6ix, Alessia ca...",1-800-273-8255 \nI've been on the low\nI been ...,
4,800,1,46,3.0,39.0,2/3/2018,4/28/2017,"Hip-Hop,Canada,DMV,Pop,Rap","Dylan wiggins, Andrew taggart, 6ix, Alessia ca...",1-800-273-8255 \nI've been on the low\nI been ...,


In [8]:
# Drop Columns on Billboard DataFrame
billboard_df = billboard_df.drop(columns = ['Peak.position', 
                                            'Weeks.on.chart', 
                                            'Date', 
                                            'Writing.Credits', 
                                            'Lyrics', 
                                            'Features'])
billboard_df.head()

Unnamed: 0,Artists,Name,Weekly.rank,Week,Genre
0,17,NC,41,8/18/2018,"Trap,Rap"
1,800,1,45,2/24/2018,"Hip-Hop,Canada,DMV,Pop,Rap"
2,800,1,40,2/17/2018,"Hip-Hop,Canada,DMV,Pop,Rap"
3,800,1,33,2/10/2018,"Hip-Hop,Canada,DMV,Pop,Rap"
4,800,1,46,2/3/2018,"Hip-Hop,Canada,DMV,Pop,Rap"


In [9]:
from datetime import timedelta
billboard_df["Week"] = pd.to_datetime(billboard_df["Week"])
billboard_df["Week"] = billboard_df["Week"].dt.strftime("%m/%d/%y")
billboard_df.head()

Unnamed: 0,Artists,Name,Weekly.rank,Week,Genre
0,17,NC,41,08/18/18,"Trap,Rap"
1,800,1,45,02/24/18,"Hip-Hop,Canada,DMV,Pop,Rap"
2,800,1,40,02/17/18,"Hip-Hop,Canada,DMV,Pop,Rap"
3,800,1,33,02/10/18,"Hip-Hop,Canada,DMV,Pop,Rap"
4,800,1,46,02/03/18,"Hip-Hop,Canada,DMV,Pop,Rap"


In [10]:
# split everything after (',') for Genre
from io import StringIO
billboard_df['Genre'] = billboard_df['Genre'].apply(lambda x: x.split(',')[0])
billboard_df

Unnamed: 0,Artists,Name,Weekly.rank,Week,Genre
0,17,NC,41,08/18/18,Trap
1,800,1,45,02/24/18,Hip-Hop
2,800,1,40,02/17/18,Hip-Hop
3,800,1,33,02/10/18,Hip-Hop
4,800,1,46,02/03/18,Hip-Hop
...,...,...,...,...,...
13277,"Zedd, Maren Morris, Grey",The Middle,11,03/10/18,Deutschland
13278,"Zedd, Maren Morris, Grey",The Middle,13,03/03/18,Deutschland
13279,"Zedd, Maren Morris, Grey",The Middle,17,02/24/18,Deutschland
13280,"Zedd, Maren Morris, Grey",The Middle,22,02/17/18,Deutschland


In [11]:
# Rename columns in Billboard DataFrame
billboard_df.rename(columns = {'Artists': 'Artist',
                                'Name': 'Song Name', 
                               'Weekly.rank': 'Weekly Rank', 
                               'Week': 'Week of'}, inplace = True )
billboard_df.head()

Unnamed: 0,Artist,Song Name,Weekly Rank,Week of,Genre
0,17,NC,41,08/18/18,Trap
1,800,1,45,02/24/18,Hip-Hop
2,800,1,40,02/17/18,Hip-Hop
3,800,1,33,02/10/18,Hip-Hop
4,800,1,46,02/03/18,Hip-Hop


In [12]:
# Strip all blank spaces for Artist and Song Names
billboard_df['Artist'] = billboard_df['Artist'].str.rstrip()
billboard_df['Song Name'] = billboard_df['Song Name'].str.rstrip()
billboard_df

Unnamed: 0,Artist,Song Name,Weekly Rank,Week of,Genre
0,17,NC,41,08/18/18,Trap
1,800,1,45,02/24/18,Hip-Hop
2,800,1,40,02/17/18,Hip-Hop
3,800,1,33,02/10/18,Hip-Hop
4,800,1,46,02/03/18,Hip-Hop
...,...,...,...,...,...
13277,"Zedd, Maren Morris, Grey",The Middle,11,03/10/18,Deutschland
13278,"Zedd, Maren Morris, Grey",The Middle,13,03/03/18,Deutschland
13279,"Zedd, Maren Morris, Grey",The Middle,17,02/24/18,Deutschland
13280,"Zedd, Maren Morris, Grey",The Middle,22,02/17/18,Deutschland


In [13]:
spotify_df['Weekly Streams'] = spotify_df['Weekly Streams'].apply(str)
spotify_df['Weekly Streams'].dtype

dtype('O')

####  Merging Billboard and Spotify DataFrame

In [14]:
# df_merged = pd.merge(billboard_df, spotify_df, how='left', left_on= ['Song Name', 'Weekly'], right_on = ['Song Name', 'one_day_day'])
# df_merged.head(100)
df_merged = pd.merge(billboard_df, spotify_df, how='left', on= ['Song Name', 'Week of'])
df_merged.head(100)

Unnamed: 0,Artist_x,Song Name,Weekly Rank,Week of,Genre,Artist_y,Weekly Streams
0,17,NC,41,08/18/18,Trap,,
1,800,1,45,02/24/18,Hip-Hop,,
2,800,1,40,02/17/18,Hip-Hop,,
3,800,1,33,02/10/18,Hip-Hop,,
4,800,1,46,02/03/18,Hip-Hop,,
...,...,...,...,...,...,...,...
95,2 Chainz,Good Drank,92,02/11/17,Atlanta,2 Chainz,1440861
96,21 Savage,1.5,86,01/05/19,Motown,21 Savage,1848760
97,21 Savage,A Lot,49,06/08/19,East Coast,,
98,21 Savage,A Lot,46,06/01/19,East Coast,,


In [15]:
df_merged[df_merged["Song Name"] == "4:00 AM"]

Unnamed: 0,Artist_x,Song Name,Weekly Rank,Week of,Genre,Artist_y,Weekly Streams
42,2 Chainz,4:00 AM,95,09/09/17,Atlanta,2 Chainz,1864342
43,2 Chainz,4:00 AM,90,09/02/17,Atlanta,2 Chainz,1997523
44,2 Chainz,4:00 AM,93,08/26/17,Atlanta,2 Chainz,2660105
45,2 Chainz,4:00 AM,83,08/19/17,Atlanta,2 Chainz,2865754
46,2 Chainz,4:00 AM,88,08/12/17,Atlanta,2 Chainz,2727864
47,2 Chainz,4:00 AM,83,08/05/17,Atlanta,2 Chainz,2866786
48,2 Chainz,4:00 AM,76,07/29/17,Atlanta,2 Chainz,3069167
49,2 Chainz,4:00 AM,65,07/22/17,Atlanta,2 Chainz,3393430
50,2 Chainz,4:00 AM,59,07/15/17,Atlanta,2 Chainz,3517378
51,2 Chainz,4:00 AM,55,07/08/17,Atlanta,2 Chainz,3595373


In [16]:
df_merged = df_merged.drop(columns = ['Artist_y'])
df_merged.head()

Unnamed: 0,Artist_x,Song Name,Weekly Rank,Week of,Genre,Weekly Streams
0,17,NC,41,08/18/18,Trap,
1,800,1,45,02/24/18,Hip-Hop,
2,800,1,40,02/17/18,Hip-Hop,
3,800,1,33,02/10/18,Hip-Hop,
4,800,1,46,02/03/18,Hip-Hop,


In [18]:
df_merged.rename(columns = {'Artist_x': 'Artist'})

Unnamed: 0,Artist,Song Name,Weekly Rank,Week of,Genre,Weekly Streams
0,17,NC,41,08/18/18,Trap,
1,800,1,45,02/24/18,Hip-Hop,
2,800,1,40,02/17/18,Hip-Hop,
3,800,1,33,02/10/18,Hip-Hop,
4,800,1,46,02/03/18,Hip-Hop,
...,...,...,...,...,...,...
13383,"Zedd, Maren Morris, Grey",The Middle,11,03/10/18,Deutschland,7664089
13384,"Zedd, Maren Morris, Grey",The Middle,13,03/03/18,Deutschland,7714406
13385,"Zedd, Maren Morris, Grey",The Middle,17,02/24/18,Deutschland,7006684
13386,"Zedd, Maren Morris, Grey",The Middle,22,02/17/18,Deutschland,6827589


In [19]:
df_merged[df_merged["Song Name"] == "4:00 AM"]

Unnamed: 0,Artist_x,Song Name,Weekly Rank,Week of,Genre,Weekly Streams
42,2 Chainz,4:00 AM,95,09/09/17,Atlanta,1864342
43,2 Chainz,4:00 AM,90,09/02/17,Atlanta,1997523
44,2 Chainz,4:00 AM,93,08/26/17,Atlanta,2660105
45,2 Chainz,4:00 AM,83,08/19/17,Atlanta,2865754
46,2 Chainz,4:00 AM,88,08/12/17,Atlanta,2727864
47,2 Chainz,4:00 AM,83,08/05/17,Atlanta,2866786
48,2 Chainz,4:00 AM,76,07/29/17,Atlanta,3069167
49,2 Chainz,4:00 AM,65,07/22/17,Atlanta,3393430
50,2 Chainz,4:00 AM,59,07/15/17,Atlanta,3517378
51,2 Chainz,4:00 AM,55,07/08/17,Atlanta,3595373


## Load

In [17]:
# SQL
from sqlalchemy import create_engine
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

In [23]:
connection_string = "postgres:postgres@localhost:5432/Music_db"
engine = create_engine(f'postgresql://{connection_string}')