# Import Dependencies

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from sqlalchemy import create_engine, inspect
from config import *

# Create Spotify URL's

In [2]:
url_global_200_latest = 'https://spotifycharts.com/regional/global/daily/latest'

In [3]:
url_global_200 = 'https://spotifycharts.com/regional/global/daily/'
url_us_200 = 'https://spotifycharts.com/regional/us/daily/'
url_uk_200 = 'https://spotifycharts.com/regional/gb/daily/'

# Create Function to WebScrape Spotify

* Defaults to latest date

In [4]:
def webScrapeSpotify (url):

    #Submit Request
    html = requests.get(url)

    #Create BeautifulSoup Object
    soup = BeautifulSoup(html.text, 'lxml')
    table = soup.findAll('table',{"class":"chart-table"})[0]

    #Import table to dataframe 
    df = pd.read_html(str(table), header=0)[0]
    df[['track','artist']] = df['Track'].str.split(pat = " by ", expand=True)
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 2', 'Track'])
    df.rename(columns={"Unnamed: 1":"position", "Streams":"streams"}, inplace=True)
    column_names = ["position", "track", "artist", "streams"]
    df = df.reindex(columns=column_names)
    
    return df

### Validate: Latest Global Top 200

In [5]:
df_global_200_latest = webScrapeSpotify(url_global_200)
df_global_200_latest.head()

Unnamed: 0,position,track,artist,streams
0,1,ROCKSTAR (feat. Roddy Ricch),DaBaby,5571679
1,2,Blinding Lights,The Weeknd,5037893
2,3,Roses - Imanbek Remix,SAINt JHN,4334314
3,4,Rain On Me (with Ariana Grande),Lady Gaga,3473008
4,5,death bed (coffee for your head) (feat. beabad...,Powfu,3418872


# Create Function to WebScrape Spotify on Date

* Accepts date argument
* Defaults to latest date

In [6]:
def webScrapeSpotifyonDate (url, date='latest'):

    #Submit Request
    html = requests.get(url+date)

    #Create BeautifulSoup Object
    soup = BeautifulSoup(html.text, 'lxml')
    table = soup.findAll('table',{"class":"chart-table"})[0]

    #Import table to dataframe 
    df = pd.read_html(str(table), header=0)[0]
    df[['track','artist']] = df['Track'].str.split(pat = " by ", expand=True)
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 2', 'Track'])
    df.rename(columns={"Unnamed: 1":"position", "Streams":"streams"}, inplace=True)
    column_names = ["position", "track", "artist", "streams"]
    df = df.reindex(columns=column_names)
    
    return df

### Validate: Global Top 200 for 2020-06-14

In [7]:
df_global_200_past = webScrapeSpotifyonDate(url_global_200, '2020-06-14')
df_global_200_past.head()

Unnamed: 0,position,track,artist,streams
0,1,Blinding Lights,The Weeknd,4516393
1,2,ROCKSTAR (feat. Roddy Ricch),DaBaby,4050048
2,3,Roses - Imanbek Remix,SAINt JHN,3793500
3,4,Rain On Me (with Ariana Grande),Lady Gaga,3270819
4,5,Toosie Slide,Drake,3171789


# Create Function to WebScrape Spotify on Chart Type

* Accepts date argument
* Accepts regional chart argument
* Defaults to latest date

In [8]:
def webScrapeSpotifyonChartType (chart='global', date='latest'):

    #Set URL based on Chart Type
    if chart =='global':
        url = 'https://spotifycharts.com/regional/global/daily/'
    elif chart == 'us':
        url = 'https://spotifycharts.com/regional/us/daily/'
    elif chart == 'uk':
        url = 'https://spotifycharts.com/regional/gb/daily/'
    else:
        url = 'invalid chart'
        print("Invalid chart type provided")    
    
    #Submit Request
    html = requests.get(url+date)

    #Create BeautifulSoup Object
    soup = BeautifulSoup(html.text, 'lxml')
    table = soup.findAll('table',{"class":"chart-table"})[0]

    #Import table to dataframe 
    df = pd.read_html(str(table), header=0)[0]
    df[['track','artist']] = df['Track'].str.split(pat = " by ", expand=True)
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 2', 'Track'])
    df.rename(columns={"Unnamed: 1":"position", "Streams":"streams"}, inplace=True)
    column_names = ["position", "track", "artist", "streams"]
    df = df.reindex(columns=column_names)
    df.insert(loc=4, column='chart_type', value=chart)
    df.insert(loc=5, column='source', value='spotify')
    
    return df

### Validate: Global Top 200 for Different Chart Types

In [9]:
df_uk = webScrapeSpotifyonChartType('uk')
df_uk.head()

Unnamed: 0,position,track,artist,streams,chart_type,source
0,1,ROCKSTAR (feat. Roddy Ricch),DaBaby,530032,uk,spotify
1,2,Rain On Me (with Ariana Grande),Lady Gaga,373887,uk,spotify
2,3,Blinding Lights,The Weeknd,367159,uk,spotify
3,4,Breaking Me,Topic,345352,uk,spotify
4,5,Rover (feat. DTG),S1mba,326491,uk,spotify


In [10]:
df_us = webScrapeSpotifyonChartType('us')
df_us.head()

Unnamed: 0,position,track,artist,streams,chart_type,source
0,1,ROCKSTAR (feat. Roddy Ricch),DaBaby,1562649,us,spotify
1,2,Party Girl,StaySolidRocky,1065084,us,spotify
2,3,Blinding Lights,The Weeknd,1007630,us,spotify
3,4,Blueberry Faygo,Lil Mosey,963554,us,spotify
4,5,Watermelon Sugar,Harry Styles,905569,us,spotify


# Create database connection

In [11]:
connection_string = f"postgresql+psycopg2://{SQL_USERNAME}:{SQL_PASSWORD}@{SQL_IP}:{PORT}/{DATABASE}"
engine = create_engine(connection_string)

# Create Tables in GCP PostgreSQL Database Server

* If __tracks__ tables exist, the drop table

In [12]:
engine.execute("drop table if exists tracks;")

<sqlalchemy.engine.result.ResultProxy at 0x7fa53413a5d0>

In [13]:
create_tracks = """CREATE TABLE tracks (
                                id SERIAL PRIMARY KEY,
                                position INT,
                                track TEXT,
                                artist TEXT,
                                streams INT,
                                source TEXT,
                                chart_type TEXT,
                                last_updated timestamp default current_timestamp
                    );
                """
engine.execute(create_tracks)

<sqlalchemy.engine.result.ResultProxy at 0x7fa5340dc710>

* Confirm table exists

In [14]:
engine.table_names()

['tracks', 'county']

# Load DataFrames into database

In [15]:
conn =  engine.connect()

In [16]:
df_us.to_sql(name='tracks', con=conn, if_exists='append',index=False)

In [17]:
conn.close()
engine.dispose

<bound method Engine.dispose of Engine(postgresql+psycopg2://pandas_etl:***@34.72.119.225:5432/pandas_etl)>

# Create Function to Load DataFrame into Database

In [18]:
def loadDFintoDB(df, table='tracks'):
    
    #Connect to Database
    connection_string = f"postgresql+psycopg2://{SQL_USERNAME}:{SQL_PASSWORD}@{SQL_IP}:{PORT}/{DATABASE}"
    engine = create_engine(connection_string)
    conn =  engine.connect()
    
    #Load DataFrame into Database 
    df.to_sql(name=table, con=conn, if_exists='append',index=False)
    
    #Disconnect from Database
    conn.close()
    engine.dispose

### Load UK Chart into Database

In [19]:
loadDFintoDB(df_uk)

# Verify Database

In [20]:
#Connect to Database
connection_string = f"postgresql+psycopg2://{SQL_USERNAME}:{SQL_PASSWORD}@{SQL_IP}:{PORT}/{DATABASE}"
engine = create_engine(connection_string)
conn =  engine.connect()

In [21]:
query = """
            SELECT 
                *
            FROM
                tracks
            LIMIT 5;
        """

In [22]:
test = pd.read_sql(query, con=conn)
test.head()

Unnamed: 0,id,position,track,artist,streams,source,chart_type,last_updated
0,1,1,ROCKSTAR (feat. Roddy Ricch),DaBaby,1562649,spotify,us,2020-06-21 17:22:58.813747
1,2,2,Party Girl,StaySolidRocky,1065084,spotify,us,2020-06-21 17:22:58.813747
2,3,3,Blinding Lights,The Weeknd,1007630,spotify,us,2020-06-21 17:22:58.813747
3,4,4,Blueberry Faygo,Lil Mosey,963554,spotify,us,2020-06-21 17:22:58.813747
4,5,5,Watermelon Sugar,Harry Styles,905569,spotify,us,2020-06-21 17:22:58.813747


In [23]:
conn.close()
engine.dispose

<bound method Engine.dispose of Engine(postgresql+psycopg2://pandas_etl:***@34.72.119.225:5432/pandas_etl)>