## Spotify Playlist Generator

Import/download necessary libraries

In [1]:
pip install -r requirements.txt

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
import requests
import sys
import os
import re
from track_info import initialize_spotify, inject_song_id, search_for_song


## Generate data frame to use

The data is in a folder named "lyrics/" where there are about 5 songs from the top 500 artists ( you can view the collection process in 'lyric_collector.py'. The name of each file is the artist name, followed by a number, ranging from 1-5 corresponding to their 1st to 5th most popular songs. Each file contains both the song title on the first line, then the song lyrics on the remaining lines.

The goal is to extract this information into a data frame with columns corresponding to artist name, song name, and lyrics.

In [3]:
def extract_info(text, doc_name, details):
    # The format for Happy by Pharrell for example would be :
    # Happy Lyrics
    # {Song Lyrics}
    # Hence, we plit on 'Lyrics'
    try:
        song_name, lyrics = text.split('Lyrics', maxsplit= 1)
        #print(song_name)
    except Exception as e:
        #print(e)
        print(f"{doc_name} has no lyrics or had an error")
        return
    artist, _ = doc_name.rsplit('_', maxsplit = 1)
    details["artist"].append(artist)
    details["song_name"].append(song_name)
    details["lyrics"].append(lyrics)

In [4]:
def read_docs(input_dir):
    details = {
        "artist": [],
        "song_name": [],
        "lyrics": []
    }
    docs = os.listdir(input_dir)
    for doc in docs:
        #print(f"{input_dir}{doc}")
        with open(f"{input_dir}/{doc}") as f:
            doc_text = f.read()
            extract_info(doc_text, doc, details)
    df = pd.DataFrame(details)
    return df


In [5]:
df = read_docs('lyrics/')

In [6]:
df["id"] = df.index
df.head()

Unnamed: 0,artist,song_name,lyrics,id
0,MGMT,Electric Feel,\nAll along the Western front\nPeople line up ...,0
1,AFI,The Days of the Phoenix,I remember when I was told of story of\nCrushe...,1
2,TVontheRadio,DLZ,\nCongratulations on the mess you made of thin...,2
3,SonicYouth,Schizophrenia,\nI went away to see an old friend of mine\nHi...,3
4,NickCave&TheBadSeeds,O Children,"Pass me that lovely little gun\nMy dear, my da...",4


### Remove Stop words and Tokenize

In [7]:
stop = set()
with open('data/stopwords.txt', 'r') as f:
    for line in f:
        stop.add(line.rstrip())

df['lyrics_nostop'] = df['lyrics'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word not in (stop)]))

In [8]:
df.head()

Unnamed: 0,artist,song_name,lyrics,id,lyrics_nostop
0,MGMT,Electric Feel,\nAll along the Western front\nPeople line up ...,0,all along western front people line up receive...
1,AFI,The Days of the Phoenix,I remember when I was told of story of\nCrushe...,1,"i remember i told story crushed velvet, candle..."
2,TVontheRadio,DLZ,\nCongratulations on the mess you made of thin...,2,congratulations mess made things i'm trying re...
3,SonicYouth,Schizophrenia,\nI went away to see an old friend of mine\nHi...,3,i went away see old friend mine his sister cam...
4,NickCave&TheBadSeeds,O Children,"Pass me that lovely little gun\nMy dear, my da...",4,"pass lovely little gun my dear, darling one th..."


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
analyze = TfidfVectorizer().build_analyzer()
df['tokens'] = df['lyrics_nostop'].apply(analyze)

Now would be an appropriate time to get rid of some redundant columns

In [11]:
df = df.drop(['lyrics'], axis=1)
df.head()

Unnamed: 0,artist,song_name,id,lyrics_nostop,tokens
0,MGMT,Electric Feel,0,all along western front people line up receive...,"[all, along, western, front, people, line, up,..."
1,AFI,The Days of the Phoenix,1,"i remember i told story crushed velvet, candle...","[remember, told, story, crushed, velvet, candl..."
2,TVontheRadio,DLZ,2,congratulations mess made things i'm trying re...,"[congratulations, mess, made, things, trying, ..."
3,SonicYouth,Schizophrenia,3,i went away see old friend mine his sister cam...,"[went, away, see, old, friend, mine, his, sist..."
4,NickCave&TheBadSeeds,O Children,4,"pass lovely little gun my dear, darling one th...","[pass, lovely, little, gun, my, dear, darling,..."


## Find similarities between songs and queries

### General cosine similarity based on lyric similarity

In [12]:
query = "I am very happy, give me happy"

q_ser = pd.Series([query])
l_ser = pd.Series(df['lyrics_nostop'])

q_lyrics = q_ser.append(l_ser)

# the linear kernel is the dot product
tf_idf = TfidfVectorizer().fit_transform(q_lyrics)
# the query is the first item in the series, so compare it to the other items
cosine_sim = cosine_similarity(tf_idf[0:1], tf_idf[1:]).flatten()
top_songs_indices = cosine_sim.argsort()[:-7:-1]
print(top_songs_indices)
print(cosine_sim[top_songs_indices])

df.iloc[top_songs_indices]

[1286 2174  449 1174   94 2227]
[0.40734742 0.38548732 0.37506168 0.3115518  0.28000354 0.23671946]


Unnamed: 0,artist,song_name,id,lyrics_nostop,tokens
1286,DemiLovato,Stone Cold,1286,"stone cold, stone cold you see standing, i'm d...","[stone, cold, stone, cold, you, see, standing,..."
2174,TheKooks,Junk of the Heart (Happy),2174,junk heart junk mind so hard leave alone we ge...,"[junk, heart, junk, mind, so, hard, leave, alo..."
449,SherylCrow,If It Makes You Happy,449,"i've long, long way put poncho, played mosquit...","[ve, long, long, way, put, poncho, played, mos..."
1174,PharrellWilliams,Happy,1174,it might seem crazy i'm 'bout say sunshine she...,"[it, might, seem, crazy, bout, say, sunshine, ..."
94,TheWombats,Let’s Dance to Joy Division,94,i'm back liverpool everything seems same but i...,"[back, liverpool, everything, seems, same, but..."
2227,CatStevens,Tea for the Tillerman,2227,bring tea tillerman steak sun wine woman made ...,"[bring, tea, tillerman, steak, sun, wine, woma..."


If you look at the lyrics, key words like "happy" and "give" show up a lot, but the query is only stop words and those two words, and there are no stop words in the lyrics! So we need to look at some other methods.

My idea is to incorporate some data from spotify, as they have some nice metrics such as 'danceability', 'energy', 'loudness' etc. If we create a vector for each song, we can use a similar method to the one above, where we measure the 'closeness' of each vector. The query in this case would have to be a song, so we can extract the same metrics and measure closeness.

## Connecting to Spotify

In [13]:
cid = '1d668be1930e487eaacd284df4fa7601'
secret = '08ac712c04ba4bffaeb232efe98a7a54'

sp = initialize_spotify(cid,secret)

In [14]:
# test functionality
song = search_for_song(sp, "Pharrell Williams", "Happy")
print(song)

60nZcImufyMA1MKQY3dcCH


In [15]:
# pretty sure this isn't a song
song = search_for_song(sp, "Beep Boop", "Bop")
print(song)

nan


We can now use this id to extract certain information, but we want this information for all of the songs in our dataset.

In [16]:
df["artist_tokens"] = df["artist"].apply(lambda x: re.findall('([A-Z.!,$\'][a-z]*)', x))

In [17]:
df.head()

Unnamed: 0,artist,song_name,id,lyrics_nostop,tokens,artist_tokens
0,MGMT,Electric Feel,0,all along western front people line up receive...,"[all, along, western, front, people, line, up,...","[M, G, M, T]"
1,AFI,The Days of the Phoenix,1,"i remember i told story crushed velvet, candle...","[remember, told, story, crushed, velvet, candl...","[A, F, I]"
2,TVontheRadio,DLZ,2,congratulations mess made things i'm trying re...,"[congratulations, mess, made, things, trying, ...","[T, Vonthe, Radio]"
3,SonicYouth,Schizophrenia,3,i went away see old friend mine his sister cam...,"[went, away, see, old, friend, mine, his, sist...","[Sonic, Youth]"
4,NickCave&TheBadSeeds,O Children,4,"pass lovely little gun my dear, darling one th...","[pass, lovely, little, gun, my, dear, darling,...","[Nick, Cave, The, Bad, Seeds]"


### Cleaning and preprocessing

The naming scheme produces some problems, SonicYouth wont turn up any results, but Sonic Youth will, so we need to split somehow, but doing it by capital letters isn't good because bands like MGMT with be M G M T which another problem. So we can get the original name by merging on the same transformation.

In [18]:
artists = pd.read_csv("data/top_artists.csv")
artists = artists[0:600]
artists.head()

Unnamed: 0.1,Unnamed: 0,artist_mb,listeners_lastfm
0,0,Coldplay,5381567.0
1,1,Radiohead,4732528.0
2,2,Red Hot Chili Peppers,4620835.0
3,3,Rihanna,4558193.0
4,4,Eminem,4517997.0


In [19]:
# peform same transformation that filenaming used to merge onto other data set
artists["artist"] = artists["artist_mb"].apply(lambda x: x.replace(' ',''))

In [20]:
artists.head()

Unnamed: 0.1,Unnamed: 0,artist_mb,listeners_lastfm,artist
0,0,Coldplay,5381567.0,Coldplay
1,1,Radiohead,4732528.0,Radiohead
2,2,Red Hot Chili Peppers,4620835.0,RedHotChiliPeppers
3,3,Rihanna,4558193.0,Rihanna
4,4,Eminem,4517997.0,Eminem


Notice now we have the original artist name, and the transformed version, now we can merge

In [21]:
#do an inner merge so we get the intersection, and there are no inconsistoncies
info = pd.merge(artists, df, on = "artist")

In [22]:
print(info.shape)

(2410, 9)


In [23]:
info.head(20)

Unnamed: 0.1,Unnamed: 0,artist_mb,listeners_lastfm,artist,song_name,id,lyrics_nostop,tokens,artist_tokens
0,0,Coldplay,5381567.0,Coldplay,Viva la Vida,399,i used rule world seas would rise i gave word ...,"[used, rule, world, seas, would, rise, gave, w...",[Coldplay]
1,0,Coldplay,5381567.0,Coldplay,Hymn for the Weekend,787,"and said drink me, drink (oh-ah-oh-ah) that we...","[and, said, drink, me, drink, oh, ah, oh, ah, ...",[Coldplay]
2,0,Coldplay,5381567.0,Coldplay,The Scientist,1075,"come up meet you, tell i'm sorry you don't kno...","[come, up, meet, you, tell, sorry, you, don, k...",[Coldplay]
3,0,Coldplay,5381567.0,Coldplay,Yellow,2120,"look stars, look shine and everything do yeah,...","[look, stars, look, shine, and, everything, do...",[Coldplay]
4,0,Coldplay,5381567.0,Coldplay,Fix You,2169,"when try best, don't succeed when get want, no...","[when, try, best, don, succeed, when, get, wan...",[Coldplay]
5,1,Radiohead,4732528.0,Radiohead,Creep,558,when were before couldn't look eye you're just...,"[when, were, before, couldn, look, eye, you, r...",[Radiohead]
6,1,Radiohead,4732528.0,Radiohead,Exit Music (For A Film),630,"wake sleep the drying tears today we escape, w...","[wake, sleep, the, drying, tears, today, we, e...",[Radiohead]
7,1,Radiohead,4732528.0,Radiohead,Karma Police,1221,"karma police, arrest man he talks maths, buzze...","[karma, police, arrest, man, he, talks, maths,...",[Radiohead]
8,1,Radiohead,4732528.0,Radiohead,No Surprises,1965,a heart that's full up like landfill a job slo...,"[heart, that, full, up, like, landfill, job, s...",[Radiohead]
9,1,Radiohead,4732528.0,Radiohead,Paranoid Android,2321,"please, could stop noise? i'm trying get rest ...","[please, could, stop, noise, trying, get, rest...",[Radiohead]


In [24]:
drop = ['artist', 'artist_tokens', 'listeners_lastfm']
info = info.drop(drop, axis = 1)
info.head()

Unnamed: 0.1,Unnamed: 0,artist_mb,song_name,id,lyrics_nostop,tokens
0,0,Coldplay,Viva la Vida,399,i used rule world seas would rise i gave word ...,"[used, rule, world, seas, would, rise, gave, w..."
1,0,Coldplay,Hymn for the Weekend,787,"and said drink me, drink (oh-ah-oh-ah) that we...","[and, said, drink, me, drink, oh, ah, oh, ah, ..."
2,0,Coldplay,The Scientist,1075,"come up meet you, tell i'm sorry you don't kno...","[come, up, meet, you, tell, sorry, you, don, k..."
3,0,Coldplay,Yellow,2120,"look stars, look shine and everything do yeah,...","[look, stars, look, shine, and, everything, do..."
4,0,Coldplay,Fix You,2169,"when try best, don't succeed when get want, no...","[when, try, best, don, succeed, when, get, wan..."


In [25]:
# Avoid running this because this is written to a csv
# all work is done in track_info.py
# Uncomment if running for the first time
# inject_song_id(sp, info)

See track_info.py to view the process of injecting these ids into the dataframe. The process took about 3 minutes, so I figured it was easier to store in a csv for easy access.

In [26]:
info = pd.read_csv("data/data_with_spotify_ids.csv")

In [27]:
info["sp_id"].isna().sum()

0

That did the trick, we will simply remove these last 79 songs because there are likely errors that would waste a lot of time to fix.

In [28]:
info.shape

(2331, 8)

Now we have 2,331 songs to build our playlists on, and there is always a way to add artists I/you like if I needed to later.

### Add some song metadata

I already processed and extracted all of the data into a csv, see track_info.py to see how that was done

Now we have some features that will predict song similarity!

In [29]:
track_info = pd.read_csv("data/audio_and_lyric_data.csv")

In [30]:
track_info = track_info.drop("Unnamed: 0",axis=1)
track_info.head()

Unnamed: 0,artist_mb,song_name,id,lyrics_nostop,tokens,sp_id,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Coldplay,Viva la Vida,399,i used rule world seas would rise i gave word ...,"['used', 'rule', 'world', 'seas', 'would', 'ri...",1mea3bSkSGXuIRvnydlB5b,0.486,0.617,5.0,-7.115,0.0287,0.0954,3e-06,0.109,0.417,138.015
1,Coldplay,Hymn for the Weekend,787,"and said drink me, drink (oh-ah-oh-ah) that we...","['and', 'said', 'drink', 'me', 'drink', 'oh', ...",3RiPr603aXAoi4GHyXx0uy,0.491,0.693,0.0,-6.487,0.0377,0.211,7e-06,0.325,0.412,90.027
2,Coldplay,The Scientist,1075,"come up meet you, tell i'm sorry you don't kno...","['come', 'up', 'meet', 'you', 'tell', 'sorry',...",75JFxkI2RXiU7L9VXzMkle,0.557,0.442,5.0,-7.224,0.0243,0.731,1.5e-05,0.11,0.213,146.277
3,Coldplay,Yellow,2120,"look stars, look shine and everything do yeah,...","['look', 'stars', 'look', 'shine', 'and', 'eve...",3AJwUDP919kvQ9QcozQPxg,0.429,0.661,11.0,-7.227,0.0281,0.00239,0.000121,0.234,0.285,173.372
4,Coldplay,Fix You,2169,"when try best, don't succeed when get want, no...","['when', 'try', 'best', 'don', 'succeed', 'whe...",7LVHVU3tWfcxj5aiPFEW4Q,0.209,0.417,3.0,-8.74,0.0338,0.164,0.00196,0.113,0.124,138.178


In [31]:
track_info.isna().sum()

artist_mb           0
song_name           0
id                  0
lyrics_nostop       0
tokens              0
sp_id               0
danceability        0
energy              0
key                 0
loudness            0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
dtype: int64

Now we have a lot of information on the song to work with!

## Find songs based on audio features

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=020269be-b327-479a-994d-3e58f34cfdc4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>