In [1]:
import billboard
import lyricsgenius
import pandas as pd
import spotipy
import sqlalchemy as db
from datetime import datetime
from psaw import PushshiftAPI
from spotipy.oauth2 import SpotifyClientCredentials

from api_keys import *

In [2]:
# Spotify Web API client, authenticated with the client-credentials manager.
# SPOTIFY_ID / SPOTIFY_SECRET are presumably supplied by the api_keys
# star-import above -- TODO confirm.
sp = spotipy.Spotify(
    client_credentials_manager = SpotifyClientCredentials(
        client_id = SPOTIFY_ID,
        client_secret = SPOTIFY_SECRET,
    )
)

# Genius lyrics client; verbose mode off and section headers removed.
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)
genius.remove_section_headers = True
genius.verbose = False

# Pushshift client used for the reddit submission / comment searches.
reddit = PushshiftAPI()

# PostgreSQL engine (psycopg2 driver) and the single connection that every
# to_sql call in this notebook writes through.
engine = db.create_engine(
    f"postgresql+psycopg2://{SQL_USER}:{SQL_PASS}@{SQL_HOST}/{SQL_DB}"
)
connection = engine.connect()

Our API wrappers:

In [3]:
def getSingleChart(date):
    """Fetch the Billboard "hot-100" chart for one date as a DataFrame.

    Parameters:
        date: chart date, passed straight through to billboard.ChartData.

    Returns:
        A DataFrame with one row per chart entry. The columns are the entry's
        instance attributes plus a "date" column holding the requested date.
    """
    entries = billboard.ChartData("hot-100", date = date)

    # Flatten every entry object into a plain dict and tag it with the date.
    rows = [ {**vars(entry), "date": date} for entry in entries ]

    return pd.DataFrame(rows)


def getSingleSongURI(title, artist):
    """Search Spotify for a track and return its URI with the search terms.

    Parameters:
        title: song title used in the search query.
        artist: artist name used in the search query.

    Returns:
        A dict with keys "uri", "title" and "artist". "uri" is the URI of the
        first search hit, or None when the search returns no tracks.
    """
    record = {"uri":    None,
              "title":  title,
              "artist": artist}

    try:
        response = sp.search(q = f"{title} {artist}", type = "track", limit = 1)
        record["uri"] = response["tracks"]["items"][0]["uri"]
    except IndexError:
        pass # empty result list: leave the uri as None

    return record


def getSingleSongFeatures(spotify_uri):
    """Return Spotify's audio features for a single track URI.

    Parameters:
        spotify_uri: the track URI handed to sp.audio_features.

    Returns:
        The first element of the audio_features response. When that element is
        None, a placeholder dict {"uri": spotify_uri} is returned instead so
        the track still shows up, marked as having no features.
    """
    first = sp.audio_features(spotify_uri)[0]

    # Placeholder keeps the URI visible even though no features came back.
    return {"uri": spotify_uri} if first is None else first


def getSingleSongLyrics(title, artist):
    """Look up one song on Genius and return its record as a plain dict.

    Parameters:
        title: song title to search for.
        artist: artist name to search for.

    Returns:
        dict(song.to_dict()) for the matched song, or an empty dict when
        Genius returns no match or the lookup fails.

    The lookup is deliberately best-effort: any ordinary error (the API likes
    to disconnect) yields an empty dict instead of aborting a long batch.
    KeyboardInterrupt and SystemExit are NOT swallowed, so the batch can still
    be stopped from the notebook.
    """
    try:
        song = genius.search_song(title = title, artist = artist)

        if song is None:
            return dict()

        return dict(song.to_dict())

    except Exception: # API likes to disconnect? (a bare except would also eat Ctrl-C)
        return dict()

    
def redditSearch(search_function, subreddit, from_date, to_date, **kwargs):
    """Run a Pushshift-style search over a date window and tabulate the hits.

    Parameters:
        search_function: callable invoked with after=, before=, subreddit= and
            any extra keyword arguments; it must return an iterable of objects
            exposing a d_ attribute.
        subreddit: subreddit name forwarded to the search.
        from_date: window start as a dash-separated date string ("YYYY-MM-DD").
        to_date: window end in the same format.
        **kwargs: forwarded untouched to search_function.

    Returns:
        A DataFrame built from the d_ attribute of each result.
    """
    def toEpoch(day):
        # "2019-01-01" -> datetime(2019, 1, 1) -> whole epoch seconds.
        # NOTE(review): the datetime is naive, so timestamp() interprets it in
        # the machine's local time zone -- confirm that is what is wanted.
        parts = map(int, day.split('-'))
        return int(datetime(*parts).timestamp())

    hits = search_function(after = toEpoch(from_date),
                           before = toEpoch(to_date),
                           subreddit = subreddit,
                           **kwargs)

    return pd.DataFrame([ hit.d_ for hit in hits ])

Aggregating those calls:

In [4]:
def getCharts(dates):
    """Fetch the chart for every date in `dates` and stack them into one frame.

    Each per-date frame keeps its own index, so index values repeat across
    dates in the result.
    """
    frames = [ getSingleChart(day) for day in dates ]

    return pd.concat(frames)

def getSongURIs(songs):
    """Look up the Spotify URI of every song in a table.

    Parameters:
        songs: a table with columns "title" and "artist".

    Returns:
        A DataFrame with one getSingleSongURI record per input row.
    """
    records = []

    for song in songs.itertuples():
        records.append(getSingleSongURI(song.title, song.artist))

    return pd.DataFrame(records)

def getSongFeatures(spotify_uris):
    """Fetch audio features for every usable URI in `spotify_uris`.

    Parameters:
        spotify_uris: iterable of Spotify track URIs. Missing entries are
            skipped: None as before, and also NaN / pd.NA, which is how a
            missing URI appears once the column has been through pandas or SQL.

    Returns:
        A DataFrame with one getSingleSongFeatures record per usable URI
        (empty when there are none).
    """
    return pd.DataFrame( getSingleSongFeatures(uri) for uri in spotify_uris if not pd.isna(uri) )

def getSongLyrics(songs):
    """Fetch the Genius record of every song in a table.

    Parameters:
        songs: a table exposing "title" and "artist" columns.

    Returns:
        A DataFrame with one getSingleSongLyrics record per input row.
    """
    records = []

    for song in songs.itertuples():
        records.append(getSingleSongLyrics(song.title, song.artist))

    return pd.DataFrame(records)

def getPosts(subreddit, from_date, to_date, **kwargs):
    """Search submissions in `subreddit` between two dates.

    Delegates to redditSearch with reddit.search_submissions as the search
    function; extra keyword arguments are forwarded unchanged.
    """
    search = reddit.search_submissions

    return redditSearch(search, subreddit, from_date, to_date, **kwargs)

def getComments(subreddit, from_date, to_date, **kwargs):
    """Search comments in `subreddit` between two dates.

    Delegates to redditSearch with reddit.search_comments as the search
    function; extra keyword arguments are forwarded unchanged.
    """
    search = reddit.search_comments

    return redditSearch(search, subreddit, from_date, to_date, **kwargs)

Fetching the data:

In [5]:
# Calendar years covered by the fetch, both ends inclusive.
START_YEAR = 2019
END_YEAR   = 2019

# First-of-month dates ("YYYY-MM-01") for every month in that span, plus
# January 1st of the following year as the closing boundary.
_month_total = 12 * (END_YEAR - START_YEAR + 1)
months = [ f"{START_YEAR + k // 12}-{k % 12 + 1:02d}-01" for k in range(_month_total + 1) ]

print("Getting data from", months[0], "to", months[-1])

Getting data from 2019-01-01 to 2020-01-01


In [6]:
# One chart per date in `months`, stacked into a single frame.
chartsTable = getCharts(dates = months)
print("Fetched charts.")

# if_exists = "replace" means a new "charts" table is created on every run.
chartsTable.to_sql(name = "charts", con = connection, if_exists = "replace")
print("Sent charts to db.")

chartsTable

In [7]:
# A song usually appears on several charts, so look up each distinct
# (title, artist) pair only once.
songURIs = getSongURIs(songs = chartsTable.loc[:, ["title", "artist"]].drop_duplicates())
print("Fetched song URIs.")

# Store the lookup results, replacing any previous "uri" table.
songURIs.to_sql(name = "uri", con = connection, if_exists = "replace")
print("Sent song URIs to db..")

songURIs

retrying ...2secs


In [None]:
# Drop the notebook's reference to the charts frame; presumably done to free
# memory before the later, larger fetches.
del chartsTable

In [2]:
# Different (title, artist) rows can resolve to the same Spotify track, and
# failed searches leave a missing uri. Request features once per distinct,
# non-missing URI so the "audio" table holds no duplicate rows.
audioTable = getSongFeatures(songURIs.uri.dropna().unique())
print("Fetched audio features.")

# Store the features, replacing any previous "audio" table.
audioTable.to_sql("audio", connection, if_exists = "replace")
print("Sent audio features to db.")

audioTable

NameError: name 'getSongFeatures' is not defined

In [None]:
# Drop the notebook's reference to the audio-features frame; presumably done
# to free memory before the later, larger fetches.
del audioTable

In [10]:
# songURIs carries the "title" and "artist" columns that the lyrics lookup
# reads, one row per distinct song.
lyricsTable = getSongLyrics(songs = songURIs) # slow, will replace with MusixMatch API
print("Fetched lyrics.")

# Store the Genius records, replacing any previous "lyrics" table.
lyricsTable.to_sql(name = "lyrics", con = connection, if_exists = "replace")
print("Sent lyrics to db.")

lyricsTable

Searching for "Thank U, Next" by Ariana Grande...
Done.
Searching for "Without Me" by Halsey...
Done.
Searching for "All I Want For Christmas Is You" by Mariah Carey...
Done.
Searching for "Sicko Mode" by Travis Scott...
Done.
Searching for "Sunflower (Spider-Man: Into The Spider-Verse)" by Post Malone & Swae Lee...
Done.
Searching for "High Hopes" by Panic! At The Disco...
Done.
Searching for "Happier" by Marshmello & Bastille...
Done.
Searching for "Jingle Bell Rock" by Bobby Helms...
Done.
Searching for "Rockin' Around The Christmas Tree" by Brenda Lee...
Done.
Searching for "A Holly Jolly Christmas" by Burl Ives...
Done.
Searching for "The Christmas Song (Merry Christmas To You)" by Nat King Cole...
Done.
Searching for "Drip Too Hard" by Lil Baby & Gunna...
Done.
Searching for "It's The Most Wonderful Time Of The Year" by Andy Williams...
Done.
Searching for "Girls Like You" by Maroon 5 Featuring Cardi B...
Done.
Searching for "ZEZE" by Kodak Black Featuring Travis Scott & Offset..

Done.
Searching for "Nights Like This" by Kehlani Featuring Ty Dolla $ign...
Done.
Searching for "Pure Cocaine" by Lil Baby...
Done.
Searching for "Electricity" by Silk City x Dua Lipa...
Specified song does not contain lyrics. Rejecting.
Searching for "On My Way To You" by Cody Johnson...
Done.
Searching for "Make It Sweet" by Old Dominion...
Done.
Searching for "Love Wins" by Carrie Underwood...
Done.
Searching for "Murder On My Mind" by YNW Melly...
Done.
Searching for "GIRL" by Maren Morris...
Done.
Searching for "Roses" by benny blanco & Juice WRLD Featuring Brendon Urie...
Done.
Searching for "Face My Fears" by Hikaru Utada & Skrillex...
Done.
Searching for "Call The Coroner" by Future...
Done.


In [11]:
# Drop the notebook's reference to the lyrics frame; presumably done to free
# memory before the reddit fetches below.
del lyricsTable

In [7]:
# Fetch r/news submissions in six-month windows and write each window to the
# "posts" table before fetching the next one (a whole year at once runs out of
# memory).
#
# `months` ends with a closing boundary date, so its final index can only be a
# window END, never a window start. The range therefore stops before the final
# index; the previous range(0, len(months), 6) ran one extra iteration and
# raised IndexError on months[i+6].
for i in range(0, len(months) - 1, 6):
    startMonth = months[i]
    endMonth = months[min(i + 6, len(months) - 1)] # clamp a short final window to the last boundary

    postsTable = getPosts("news", startMonth, endMonth, filter = ["id", "num_comments", "title", "created", "url", "permalink"])
    print("Fetched posts.")

    # The first window recreates the table; later windows append to it.
    postsTable.astype(str).to_sql("posts", connection, if_exists = "replace" if i == 0 else "append") # cast to string to insert dict objects
    print("Sent posts to db.")

    del postsTable

KeyboardInterrupt: 

In [None]:
# Fetch r/news comments in six-month windows and write each window to the
# "comments" table before fetching the next one, so only one window is held in
# memory at a time.
#
# `months` ends with a closing boundary date, so its final index can only be a
# window END, never a window start. The range therefore stops before the final
# index; the previous range(0, len(months), 6) ran one extra iteration and
# raised IndexError on months[i+6].
for i in range(0, len(months) - 1, 6):
    startMonth = months[i]
    endMonth = months[min(i + 6, len(months) - 1)] # clamp a short final window to the last boundary

    commentsTable = getComments("news", startMonth, endMonth, filter = ["body", "id", "link_id", "parent_id", "score", "created", "subreddit", "permalink"])
    print("Fetched comments.")

    # The first window recreates the table; later windows append to it.
    commentsTable.astype(str).to_sql("comments", connection, if_exists = "replace" if i == 0 else "append") # cast to string to insert dict objects
    print("Sent comments to db.")

    del commentsTable

In [None]:
# Close the database connection opened during setup; run this last, since
# every to_sql call above writes through it.
connection.close()