In [1]:
# import libraries
import os
import re

import joblib
import pandas as pd
import spacy
from sklearn.neighbors import NearestNeighbors

In [2]:
# run once
# !python -m spacy download en_core_web_lg

In [3]:
# Load the spaCy "en_core_web_lg" pipeline; the package has to be installed
# first (see the commented-out download command in the previous cell).

nlp = spacy.load("en_core_web_lg")

# Baseline Nearest Neighbors Model

In [4]:
# relative path of the directory that holds tracks.csv and lyrics.csv
data_path = "../data/"

def wrangle(data_path):
    """Load, clean, and merge the track-features and lyrics CSV files.

    Reads ``tracks.csv`` and ``lyrics.csv`` from ``data_path``, normalizes the
    artist and song-name columns of both tables so they can serve as join
    keys, and inner-joins the two tables on ``['Artist', 'name']``.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory containing ``tracks.csv`` and ``lyrics.csv``. A trailing
        path separator is accepted but not required.

    Returns
    -------
    pandas.DataFrame
        One row per (artist, song) pair found in both files, without the
        ``duration_ms`` column. ``Artist`` and ``name`` are lower-cased.
    """
    df_tracks = pd.read_csv(os.path.join(data_path, 'tracks.csv'), parse_dates=['release_date'])
    df_lyrics = pd.read_csv(os.path.join(data_path, 'lyrics.csv'))

    # --- lyrics table -------------------------------------------------------
    df_lyrics = df_lyrics.rename(columns={'ALink': 'Artist', 'SName': 'name'})

    # The .str methods pass missing values through instead of raising, so a
    # lyrics row without a song name or artist link no longer aborts the load;
    # such rows simply find no partner in the inner join below.
    df_lyrics['name'] = df_lyrics['name'].str.lower()

    # artist links look like "/12-stones/": trim the slashes, turn dashes into spaces
    df_lyrics['Artist'] = (
        df_lyrics['Artist']
        .str.strip('/')
        .str.replace('-', ' ', regex=False)
        .str.lower()
    )
    df_lyrics = df_lyrics.drop_duplicates(subset=['Artist', 'name'])

    # --- tracks table -------------------------------------------------------
    # rows with any missing value are discarded
    df_tracks = df_tracks.dropna().rename(columns={'artists': 'Artist'})

    # 'artists' holds a list repr such as "['adele']" or, when the name contains
    # an apostrophe, '["destiny\'s child"]'. Remove the brackets first and then
    # whichever quote character wraps the value.
    df_tracks['Artist'] = (
        df_tracks['Artist']
        .str.strip('[]')
        .str.strip("'\"")
        .str.lower()
    )

    # Strip acoustic/remix markers from the song name: first " - <word>", then
    # any remaining " version".
    # NOTE(review): only the first word after " - " is removed, so
    # "x - live at y" becomes "x at y"; kept as-is to preserve existing matches.
    df_tracks['name'] = (
        df_tracks['name']
        .str.lower()
        .str.replace(r' - \S+', '', regex=True)
        .str.replace(' version', '', regex=False)
    )
    df_tracks = df_tracks.drop_duplicates(subset=['Artist', 'name'])

    # --- merge --------------------------------------------------------------
    df_merged = pd.merge(df_lyrics, df_tracks, on=['Artist', 'name'], how='inner')

    # drop unwanted columns
    df_merged = df_merged.drop(columns=['duration_ms'])

    return df_merged

def keep_wanted_columns(df):
    """Return a frame restricted to the feature columns used as model input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the fourteen feature columns listed below.

    Returns
    -------
    pandas.DataFrame
        The listed columns of ``df``, in the listed order.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    feature_columns = [
        'popularity',
        'explicit',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
    ]
    return df[feature_columns]

In [5]:
# build the merged lyrics + track-features table that the following cells work on
df_merged = wrangle(data_path)

In [6]:
# preview the first rows of the merged table
df_merged.head()

Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,12 stones,world so cold,/12-stones/world-so-cold.html,"It starts with pain, followed by hate. Fueled ...",ENGLISH,471eQ7hcJ7JdGY1NzMmUeg,55,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,4,-4.949,0,0.0443,0.472,0.0,0.36,0.394,186.227,3
1,12 stones,anthem for the underdog,/12-stones/anthem-for-the-underdog.html,You say you know just who I am. But you can't ...,ENGLISH,6FFwt1ea9hJ4MfMQLywahm,59,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,5,-3.424,1,0.0626,0.000651,0.0,0.339,0.468,93.977,3
2,12 stones,we are one,/12-stones/we-are-one.html,We walk alone. In the unknown. We live to win ...,ENGLISH,4lhqal0Hq63U2wETCeBdG1,58,0,['0DrXhci3WAyo0WJv1RBOG6'],2010-01-01,...,5,-4.041,0,0.114,0.000119,1.6e-05,0.233,0.306,127.102,4
3,3 doors down,here without you,/3-doors-down/here-without-you.html,"A hundred days have made me older,. Since the ...",ENGLISH,3NLrRZoMF0Lx6zTlYqeIo4,76,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,10,-6.817,0,0.0252,0.0492,0.0,0.205,0.233,143.994,4
4,3 doors down,when i'm gone,/3-doors-down/when-im-gone.html,There's another world inside of me that you ma...,ENGLISH,3WbphvawbMZ8FyqDxYGdSQ,70,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,7,-5.611,1,0.0284,0.00385,0.0,0.103,0.374,148.095,4


In [7]:
# feature-only view of the merged table; this is the frame the nearest-neighbors model is fitted on
df_dropped = keep_wanted_columns(df_merged)
df_dropped

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,55,0,0.262,0.688,4,-4.949,0,0.0443,0.472000,0.000000,0.360,0.394,186.227,3
1,59,0,0.200,0.863,5,-3.424,1,0.0626,0.000651,0.000000,0.339,0.468,93.977,3
2,58,0,0.444,0.974,5,-4.041,0,0.1140,0.000119,0.000016,0.233,0.306,127.102,4
3,76,0,0.557,0.533,10,-6.817,0,0.0252,0.049200,0.000000,0.205,0.233,143.994,4
4,70,0,0.530,0.768,7,-5.611,1,0.0284,0.003850,0.000000,0.103,0.374,148.095,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,31,0,0.564,0.517,10,-12.895,0,0.0434,0.460000,0.000005,0.118,0.627,151.578,4
15828,26,0,0.607,0.733,9,-10.234,1,0.0548,0.264000,0.000001,0.328,0.935,105.702,4
15829,42,0,0.633,0.650,0,-9.687,1,0.2050,0.692000,0.000000,0.272,0.909,105.756,4
15830,21,0,0.403,0.722,8,-6.716,1,0.0446,0.418000,0.000000,0.189,0.761,73.708,4


In [8]:
# display(df_merged.shape)
# df_merged.tail()

In [9]:
# df_tracks = pd.read_csv(data_path + 'tracks.csv', parse_dates=['release_date'])
# df_lyrics = pd.read_csv(data_path + 'lyrics.csv') #usecols=['SName', 'Lyric']
# df_tracks.dropna(inplace=True)
    
# # case normalization on song names
# df_lyrics['SName'] = df_lyrics['SName'].apply(lambda x: x.lower())
# df_tracks['name'] = df_tracks['name'].apply(lambda x: x.lower())

# # merge dataframes
# df_merged = pd.merge(df_tracks, df_lyrics, how='inner', left_on=['name'], right_on='SName')

# # drop columns not used in NN model
# df_merged = df_merged.drop(columns= ['id', 'name', 'artists', 'id_artists', 'release_date', 'duration_ms'])

In [10]:
# Query song; both values must be lower-case because wrangle() lower-cases
# the 'name' and 'Artist' columns they are compared against.
user_selected_song = 'stairway to heaven'
user_selected_artist = 'led zeppelin'

In [11]:
# select the row of the merged table that matches the chosen song and artist
is_selected_song = (df_merged['name'] == user_selected_song) & (df_merged['Artist'] == user_selected_artist)
song_row = df_merged[is_selected_song]

# fail here with a readable message rather than later inside the model query
if song_row.empty:
    raise ValueError(
        f"No song {user_selected_song!r} by {user_selected_artist!r} found in df_merged"
    )

In [12]:
# show the matched row for the selected song
song_row

Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
4268,led zeppelin,stairway to heaven,/led-zeppelin/stairway-to-heaven.html,There's a lady who's sure all. that glitters i...,ENGLISH,5CQ30WqJwcep0pYcV4AMNc,79,0,['36QJpDe2go2KgaRleHCDTp'],1971-11-08,...,9,-12.049,0,0.0339,0.58,0.0032,0.116,0.197,82.433,4


In [13]:
# reduce the query row to the same feature columns the model is fitted on
song_row = keep_wanted_columns(song_row)

# df_dropped = keep_wanted_columns(df_merged)

# instantiate Nearest Neighbors model; 5 neighbors are requested and the first
# hit is discarded in a later cell, leaving 4 recommendations
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')

# NOTE(review): the features are fitted unscaled, so wide-range columns such as
# 'popularity', 'tempo' and 'loudness' presumably dominate the distance over the
# 0-1 valued columns — TODO confirm this is intended or standardize first.
nn.fit(df_dropped)

NearestNeighbors(algorithm='kd_tree')

In [14]:
# query the fitted model with the selected song; returns the distances to and
# the row positions of its nearest neighbors
neigh_dist, neigh_index = nn.kneighbors(song_row)

In [15]:
# Take the result row of the single query and skip its first entry, which is
# presumably the query song itself (its distance is 0).
# NOTE(review): this rebinds neigh_index, so re-running this cell without first
# re-running the kneighbors cell operates on the already-sliced array —
# TODO consider a separate variable name.
neigh_index = neigh_index[0][1:]
neigh_index

array([11904, 12882, 10793, 10478])

In [16]:
# name of the closest neighbor, looked up through neigh_index instead of a
# row position copied by hand from the previous output
print(df_merged['name'].iloc[neigh_index[0]])

the a team


In [17]:
# print the song name of every remaining neighbor, in the order the model returned them
for song_name in df_merged['name'].iloc[neigh_index]:
    print(song_name)

the a team
royals
take a bow
self control


In [18]:
# distances returned by kneighbors for the query song (all 5 hits, including the first one skipped above)
neigh_dist

array([[0.        , 4.032528  , 4.67054588, 5.08803819, 5.14691675]])

In [19]:
# joblib.dump(nn, '../data/NearestNeighborModel')

# Natural Language Processing

In [20]:
# df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']