In [15]:
import billboard
import lyricsgenius
import pandas as pd
import spotipy
from datetime import datetime
from psaw import PushshiftAPI
from spotipy.oauth2 import SpotifyClientCredentials

from api_keys import *

In [2]:
sp = spotipy.Spotify(client_credentials_manager = SpotifyClientCredentials(client_id = SPOTIFY_ID,
                                                                           client_secret = SPOTIFY_SECRET))

genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)
genius.verbose = False
genius.remove_section_headers = True

reddit = PushshiftAPI()

Our API wrappers:

In [16]:
def getSingleChart(date):
    
    def getSpotifyURI(song): # This needs to be worked a bit, sometimes the titles have guest artists' names
        try:
            return sp.search(q = f"{song.title} {song.artist}", type = "track", limit = 1)["tracks"]["items"][0]["uri"]
        except IndexError:
            return None
        
    chart = billboard.ChartData("hot-100", date = date)
    
    return pd.DataFrame( dict(song.__dict__, date = date, spotify_uri = getSpotifyURI(song)) for song in chart )


def getSingleSongFeatures(spotify_uri):
    features = sp.audio_features(spotify_uri)[0]
    
    if features is None:
        return {"uri": spotify_uri} # some kind of placeholder so we know it has no features
    
    return features


def getSingleSongMetadata(spotify_uri):
    song_info = sp.track(spotify_uri)
    song = genius.search_song(title = song_info["name"], artist = song_info["artists"][0]["name"])
    
    if song is None:
        return dict()
    
    return dict(song.to_dict(), spotify_uri = spotify_uri)

    
def redditSearch(search_function, subreddit, from_date, to_date):
    start_date = datetime(*[ int(k) for k in from_date.split('-') ])
    start_epoch = int(start_date.timestamp())
    
    end_date = datetime(*[ int(k) for k in to_date.split('-') ])
    end_epoch = int(end_date.timestamp())

    results = search_function(after = start_epoch, before = end_epoch, subreddit = subreddit)
    
    return pd.DataFrame( row.d_ for row in results )

Aggregating those calls:

In [13]:
def getCharts(dates):
    return pd.concat( getSingleChart(date) for date in dates )

def getSongFeatures(spotify_uris):
    return pd.DataFrame( getSingleSongFeatures(uri) for uri in spotify_uris if uri is not None )

def getSongMetadata(spotify_uris):
    return pd.DataFrame( getSingleSongMetadata(uri) for uri in spotify_uris if uri is not None )

def getPosts(subreddit, from_date, to_date):
    return redditSearch(reddit.search_submissions, subreddit, from_date, to_date)

def getComments(subreddit, from_date, to_date):
    return redditSearch(reddit.search_comments, subreddit, from_date, to_date)

Fetching the data:

In [5]:
chartsTable = getCharts(("2020-02-01", "2020-03-01"))

In [6]:
chartsTable

Unnamed: 0,title,artist,image,peakPos,lastPos,weeks,rank,isNew,date,spotify_uri
0,The Box,Roddy Ricch,,1,1,7,1,False,2020-02-01,spotify:track:0nbXyq5TXYPCO7pr3N8S4I
1,Life Is Good,Future Featuring Drake,,2,2,2,2,False,2020-02-01,
2,Godzilla,Eminem Featuring Juice WRLD,,3,0,1,3,True,2020-02-01,
3,Circles,Post Malone,,1,3,21,4,False,2020-02-01,spotify:track:21jGcNKet2qwijlDFuPiPb
4,Memories,Maroon 5,,2,4,18,5,False,2020-02-01,spotify:track:2b8fOow8UzyDFAE27YhOZM
...,...,...,...,...,...,...,...,...,...,...
95,Ridin' Roads,Dustin Lynch,,47,85,17,96,False,2020-03-01,spotify:track:2VPmBOuy7ZAOFSzKwW2IEt
96,Me And My Guitar,A Boogie Wit da Hoodie,,58,58,2,97,False,2020-03-01,spotify:track:6cZH4rX1KTt1aJ3Ql6Ynja
97,Feel Me,Selena Gomez,,98,0,1,98,True,2020-03-01,spotify:track:6XXYdF6pJR1K3wKvuxmu7n
98,Vete,Bad Bunny,,33,96,13,99,False,2020-03-01,spotify:track:5DxXgozhkPLgrbKFY91w0c


In [7]:
audioTable = getSongFeatures(set(chartsTable.spotify_uri)) # take the set in case songs are on the charts for many months

retrying ...3secs


In [8]:
audioTable

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.695,0.7620,0,-3.497,1,0.0395,0.19200,0.002440,0.0863,0.553,120.042,audio_features,21jGcNKet2qwijlDFuPiPb,spotify:track:21jGcNKet2qwijlDFuPiPb,https://api.spotify.com/v1/tracks/21jGcNKet2qw...,https://api.spotify.com/v1/audio-analysis/21jG...,215280,4
1,0.610,0.0316,9,-15.186,1,0.0486,0.89600,0.004610,0.1080,0.370,151.964,audio_features,4jXl6VtkFFKIt3ycUQc5LT,spotify:track:4jXl6VtkFFKIt3ycUQc5LT,https://api.spotify.com/v1/tracks/4jXl6VtkFFKI...,https://api.spotify.com/v1/audio-analysis/4jXl...,170360,4
2,0.537,0.7460,10,-5.507,0,0.1500,0.02360,0.000001,0.1560,0.252,170.062,audio_features,6bnF93Rx87YqUBLSgjiMU8,spotify:track:6bnF93Rx87YqUBLSgjiMU8,https://api.spotify.com/v1/tracks/6bnF93Rx87Yq...,https://api.spotify.com/v1/audio-analysis/6bnF...,198267,4
3,0.513,0.7960,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,0.345,171.017,audio_features,0sf12qNH5qcw8qpgymFOqD,spotify:track:0sf12qNH5qcw8qpgymFOqD,https://api.spotify.com/v1/tracks/0sf12qNH5qcw...,https://api.spotify.com/v1/audio-analysis/0sf1...,201573,4
4,0.752,0.6530,6,-4.607,0,0.2930,0.06470,0.000000,0.1250,0.569,165.046,audio_features,7MYFKS7XpVz1JQDK38pr8N,spotify:track:7MYFKS7XpVz1JQDK38pr8N,https://api.spotify.com/v1/tracks/7MYFKS7XpVz1...,https://api.spotify.com/v1/audio-analysis/7MYF...,127983,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0.875,0.5990,9,-9.077,0,0.1380,0.03330,0.014300,0.1130,0.112,152.061,audio_features,50ZbunKRxPbTa9sVu1iukc,spotify:track:50ZbunKRxPbTa9sVu1iukc,https://api.spotify.com/v1/tracks/50ZbunKRxPbT...,https://api.spotify.com/v1/audio-analysis/50Zb...,256507,4
99,0.519,0.8720,2,-3.212,1,0.0616,0.07930,0.000000,0.1110,0.740,179.965,audio_features,0AYbVoeF03NYhM0fvKjAuU,spotify:track:0AYbVoeF03NYhM0fvKjAuU,https://api.spotify.com/v1/tracks/0AYbVoeF03NY...,https://api.spotify.com/v1/audio-analysis/0AYb...,172280,4
100,0.708,0.6540,5,-6.760,0,0.3690,0.05560,0.000000,0.2080,0.697,168.032,audio_features,1aMokXl3uKDvCh4BBcjFOA,spotify:track:1aMokXl3uKDvCh4BBcjFOA,https://api.spotify.com/v1/tracks/1aMokXl3uKDv...,https://api.spotify.com/v1/audio-analysis/1aMo...,185746,4
101,0.769,0.7870,11,-3.909,1,0.3670,0.18900,0.000000,0.1290,0.836,126.770,audio_features,6Ozh9Ok6h4Oi1wUSLtBseN,spotify:track:6Ozh9Ok6h4Oi1wUSLtBseN,https://api.spotify.com/v1/tracks/6Ozh9Ok6h4Oi...,https://api.spotify.com/v1/audio-analysis/6Ozh...,159715,4


In [10]:
lyricsTable = getSongMetadata(set(chartsTable.spotify_uri)) # slow

In [11]:
lyricsTable

Unnamed: 0,title,album,year,lyrics,image,spotify_uri
0,Circles,Hollywood’s Bleeding,2019-08-30,"Oh, oh, oh\nOh, oh, oh\nOh, oh, oh, oh, oh\n...",https://images.genius.com/75fe2e493c0fa0fb58a3...,spotify:track:21jGcNKet2qwijlDFuPiPb
1,Circles,Circles,2020-01-17,"Well, this is what it look like right before y...",https://images.genius.com/f5528ff2f79b8c9aaf79...,spotify:track:4jXl6VtkFFKIt3ycUQc5LT
2,Heartless,After Hours,2019-11-27,"Young Metro, young Metro, young Metro (Sheesh)...",https://images.genius.com/66b71ea435aad8288aea...,spotify:track:6bnF93Rx87YqUBLSgjiMU8
3,Blinding Lights,After Hours,2019-11-29,Yeah\n\nI've been tryna call\nI've been on my ...,https://images.genius.com/22ca9d47b12db20bbfc8...,spotify:track:0sf12qNH5qcw8qpgymFOqD
4,Knocked Off,"Still Flexin, Still Steppin",2020-02-11,Who made this shit?\nTayTay made the beat\nAyy...,https://images.genius.com/60123290eae91ff1bde9...,spotify:track:7MYFKS7XpVz1JQDK38pr8N
...,...,...,...,...,...,...
98,Everybody,Circles,2020-01-17,Everybody's gotta live\nAnd everybody's gonna ...,https://images.genius.com/f5528ff2f79b8c9aaf79...,spotify:track:50ZbunKRxPbTa9sVu1iukc
99,Make Me Want To,Mercury Lane,2018-09-21,"Long legs, cut off jeans\nPulling me in like a...",https://images.genius.com/28f5df56d1a5e707579a...,spotify:track:0AYbVoeF03NYhM0fvKjAuU
100,Lil Top,"Still Flexin, Still Steppin",2020-02-21,"(Khris James, what the fuck?)\n(Goddamn, BJ wi...",https://images.genius.com/60123290eae91ff1bde9...,spotify:track:1aMokXl3uKDvCh4BBcjFOA
101,BOP,KIRK,2019-09-27,"You know everybody been waiting on that Baby, ...",https://images.genius.com/13178bc885dfa3d4ad71...,spotify:track:6Ozh9Ok6h4Oi1wUSLtBseN


In [17]:
posts = getPosts("news", "2020-02-01", "2020-02-02")

In [18]:
posts

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,created,post_hint,preview,thumbnail_height,thumbnail_width,media,media_embed,secure_media,secure_media_embed,link_flair_text
0,[],False,121kiwi,,[],,text,t2_3z42zcfk,False,False,...,1.580656e+09,,,,,,,,,
1,[],False,121kiwi,,[],,text,t2_3z42zcfk,False,False,...,1.580656e+09,,,,,,,,,
2,[],False,121kiwi,,[],,text,t2_3z42zcfk,False,False,...,1.580656e+09,,,,,,,,,
3,[],False,121kiwi,,[],,text,t2_3z42zcfk,False,False,...,1.580656e+09,,,,,,,,,
4,[],False,121kiwi,,[],,text,t2_3z42zcfk,False,False,...,1.580656e+09,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1580,[],False,RollinonEase,,[],,text,t2_15x2g9zz,False,False,...,1.580570e+09,link,"{'enabled': False, 'images': [{'id': 'dBZGIXYz...",84.0,140.0,,,,,
1581,[],False,sanj12345,,[],,text,t2_4kmfk8ta,False,False,...,1.580570e+09,,,,,,,,,
1582,[],False,Sinnivar,,[],,text,t2_154lpxwa,False,False,...,1.580569e+09,,,,,,,,,
1583,[],False,fatouakinajma,,[],,text,t2_3zxw1nd7,False,False,...,1.580569e+09,,,,,,,,,


In [19]:
comments = getComments("news", "2020-02-01", "2020-02-02")

In [20]:
comments

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,total_awards_received,created,distinguished,author_cakeday
0,[],,r_r_36,,,[],,,,text,...,1580630409,1,True,False,news,t5_2qh3l,0,1.580656e+09,,
1,[],,up766570,,,[],,,,text,...,1580630409,1,True,False,news,t5_2qh3l,0,1.580656e+09,,
2,[],,ViolableOlive,,,[],,,,text,...,1580630400,1,True,False,news,t5_2qh3l,0,1.580656e+09,,
3,[],,r_r_36,,,[],,,,text,...,1580630381,1,True,False,news,t5_2qh3l,0,1.580656e+09,,
4,[],,[deleted],,,,,,dark,,...,1580630378,1,True,False,news,t5_2qh3l,0,1.580656e+09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21396,[],,gameofthrombosis,,,[],,,,text,...,1580544048,1,True,False,news,t5_2qh3l,0,1.580569e+09,,
21397,[],,not_usually_serious,,,[],,,,text,...,1580544041,1,True,False,news,t5_2qh3l,0,1.580569e+09,,
21398,[],,SnackyDoo,,,[],,,,text,...,1580544024,1,True,False,news,t5_2qh3l,0,1.580569e+09,,
21399,[],,[deleted],,,,,,dark,,...,1580544021,1,True,False,news,t5_2qh3l,0,1.580569e+09,,
