In [1]:
# import libraries
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib
import spacy
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
from category_encoders import OneHotEncoder
import numpy as np

  from collections import Mapping


In [2]:
# run once
# !python -m spacy download en_core_web_lg

  and should_run_async(code)


In [3]:
# load spacy model
nlp = spacy.load("en_core_web_lg")

  and should_run_async(code)


# Baseline Nearest Neighbors Model

In [4]:
# read in data
data_path = "../data/"

def wrangle(data_path):

    # reads in the data
    df_tracks = pd.read_csv(data_path + 'tracks.csv', parse_dates=['release_date'])
    df_lyrics = pd.read_csv(data_path + 'lyrics.csv') #usecols=['SName', 'Lyric']
    
    # drop null values on df_tracks
    df_tracks.dropna(inplace=True)
    df_lyrics.dropna(inplace=True)
    
    # case normalization on song names
    df_lyrics['SName'] = df_lyrics['SName'].apply(lambda x: x.lower())
    df_tracks['name'] = df_tracks['name'].apply(lambda x: x.lower())
    
    # clean up df_lyrics data for merging
    df_lyrics = df_lyrics.rename(columns={'ALink': 'Artist', 'SName': 'name'})
    df_lyrics['Artist'] = df_lyrics['Artist'].apply(lambda x: x.rstrip('/').lstrip('/').replace('-', " ").lower())

    # drops duplicate lyrics 
    df_lyrics = df_lyrics.drop_duplicates(subset=['Artist', 'name'])
    
    # clean up df_tracks data for merging
    df_tracks = df_tracks.rename(columns={'artists': 'Artist'})

    df_tracks['Artist'] = df_tracks['Artist'].apply(lambda x: x.lstrip("['").rstrip("']").lower())
    
    # use regex to strip acoustic/remix versions
    strip_post_dash = ' - \S+'
    strip_version = ' version'

    df_tracks['name'] = df_tracks['name'].apply(lambda song_name: re.sub(strip_post_dash, "", song_name))
    df_tracks['name'] = df_tracks['name'].apply(lambda song_name: re.sub(strip_version, "", song_name))
    
    # drop duplicate rows
    df_tracks = df_tracks.drop_duplicates(subset=['Artist', 'name'])
    
    df_merged = pd.merge(df_lyrics, df_tracks, on=['Artist', 'name'], how='inner')
    
    # drop unwanted columns
    df_merged = df_merged.drop(columns=['duration_ms'])
    
    df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']
    
    # original merge dataframes
    # df_merged = pd.merge(df_tracks, df_lyrics, how='inner', left_on=['name'], right_on='SName')
    
    
    # original drop columns not used in NN model
    # df_merged = df_merged.drop(columns= ['id', 'name', 'artists', 'id_artists', 'release_date', ])
    
    return df_merged

def keep_wanted_columns(df):
    df_dropped = df[['popularity', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'new_topic_name_Existential', 'new_topic_name_Religion',
        'new_topic_name_Gangsta', 'new_topic_name_Poppy','new_topic_name_Love']]
    
    #drop(columns= ['Artist', 'name', 'SLink', 'Lyric', 'Idiom', 'id_artists', 'id', 'release_date'])
    
    return df_dropped

  and should_run_async(code)
  strip_post_dash = ' - \S+'


In [5]:
df_merged = wrangle(data_path)

  and should_run_async(code)


In [6]:
df_merged

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,12 stones,world so cold,/12-stones/world-so-cold.html,"It starts with pain, followed by hate. Fueled ...",ENGLISH,471eQ7hcJ7JdGY1NzMmUeg,55,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,4,-4.949,0,0.0443,0.472000,0.000000,0.360,0.394,186.227,3
1,12 stones,anthem for the underdog,/12-stones/anthem-for-the-underdog.html,You say you know just who I am. But you can't ...,ENGLISH,6FFwt1ea9hJ4MfMQLywahm,59,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,5,-3.424,1,0.0626,0.000651,0.000000,0.339,0.468,93.977,3
2,12 stones,we are one,/12-stones/we-are-one.html,We walk alone. In the unknown. We live to win ...,ENGLISH,4lhqal0Hq63U2wETCeBdG1,58,0,['0DrXhci3WAyo0WJv1RBOG6'],2010-01-01,...,5,-4.041,0,0.1140,0.000119,0.000016,0.233,0.306,127.102,4
3,3 doors down,here without you,/3-doors-down/here-without-you.html,"A hundred days have made me older,. Since the ...",ENGLISH,3NLrRZoMF0Lx6zTlYqeIo4,76,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,10,-6.817,0,0.0252,0.049200,0.000000,0.205,0.233,143.994,4
4,3 doors down,when i'm gone,/3-doors-down/when-im-gone.html,There's another world inside of me that you ma...,ENGLISH,3WbphvawbMZ8FyqDxYGdSQ,70,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,7,-5.611,1,0.0284,0.003850,0.000000,0.103,0.374,148.095,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14394,whitney houston,you'll never stand alone,/whitney-houston/youll-never-stand-alone.html,Verse 1:. If there's a time when the tears sho...,ENGLISH,3KFbjIFQxMbOKdZ4S3lv4S,34,0,['6XpaIBNiVzIetEPCWDvAFP'],1998-11-17,...,9,-6.943,1,0.0254,0.105000,0.000000,0.272,0.194,103.907,4
14395,whitney houston,you're still my man,/whitney-houston/youre-still-my-man.html,You'Re Still My Man. On the day that you left ...,ENGLISH,2csRQWI7A2FjLYBQBiAIoi,42,0,['6XpaIBNiVzIetEPCWDvAFP'],1987-06-02,...,10,-9.496,1,0.0295,0.684000,0.000004,0.226,0.308,131.743,4
14441,zendaya,replay,/zendaya/replay.html,"Make it stop, sounds so good. I just can't tak...",ENGLISH,7d1CFwrBmH34gmS0Hkbfbt,67,0,['6sCbFbEjbYepqswM1vWjjs'],2013-01-01,...,6,-6.682,0,0.1210,0.041600,0.000200,0.302,0.618,140.023,4
15306,jorge ben jor,brother,/jorge-ben-jor/brother.html,"Brother, Brother, Prepare one more happy way f...",ENGLISH,1LLRVZB4fz7bduFEZ22DE4,48,0,['5JYtpnUKxAzXfHEYpOeeit'],1974-01-01,...,2,-11.125,1,0.0378,0.416000,0.000000,0.233,0.972,132.249,4


In [7]:
# user selects what song
user_selected_song = 'imagine'
user_selected_artist = 'john lennon'

song_row = df_merged[(df_merged['name'] == user_selected_song) & (df_merged['Artist'] == user_selected_artist)]
song_row

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
3776,john lennon,imagine,/john-lennon/imagine.html,Imagine there's no heaven. It's easy if you tr...,ENGLISH,4lwh5BoHBFGjKNAF3sNNCK,25,0,['4x1nvY2FN8jxqAFA0DA02H'],1971-09-09,...,0,-11.407,1,0.029,0.947,3e-05,0.0689,0.218,74.4,4


In [8]:
# drop categorical data, prepare for input to model
# song_row = keep_wanted_columns(song_row)
# df_dropped = keep_wanted_columns(df_merged)

# # instantiate Nearest Neighbors model
# nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')

# # fit the model
# nn.fit(df_dropped)

  and should_run_async(code)


In [9]:
# neigh_dist, neigh_index = nn.kneighbors(song_row)

  and should_run_async(code)


In [10]:
# neigh_index = neigh_index[0][1:]
# neigh_index

  and should_run_async(code)


In [11]:
# for index in neigh_index:
#     print(df_merged['name'].iloc[index])

  and should_run_async(code)


In [12]:
# neigh_dist

  and should_run_async(code)


In [13]:
# joblib.dump(nn, '../data/NearestNeighborModel')

  and should_run_async(code)


# Natural Language Processing

In [14]:
# df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']

  and should_run_async(code)


In [15]:
import re
def clean_data(text):
    """
    Accepts a single text document and performs several regex substitutions in order to clean the document.
    Parameters
    ----------
    text: string or object
    Returns
    -------
    text: string or object
    """
    special_chars_regex = '[:?,\>$|!\'"]'
    white_spaces_regex = '[ ]{2,}'
    text = re.sub('[^a-zA-Z ]', "", text)
    text = re.sub(special_chars_regex, " ", text)
    text = re.sub(white_spaces_regex, " ", text)
    return text.lower()

  and should_run_async(code)
  special_chars_regex = '[:?,\>$|!\'"]'


In [16]:
def tokenize(doc):
# run the clean_data function on the text
    text = clean_data(doc)
# Instantiatie language model
    doc = nlp(text)
    STOP_WORDS = nlp.Defaults.stop_words.union('', 'the', 'this', '-pron-', 'wo', 've', 'nt')
    return [token.lemma_.lower().lstrip().rstrip() for token in doc
            if (token.text not in STOP_WORDS)
            and (token.is_punct != True)
            and (token.text not in '')
            and (len(token.text)>2)]

  and should_run_async(code)


In [17]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
!pip install pandarallel

  and should_run_async(code)




Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Traceback (most recent call last):
  File "/Users/dylansivori/.local/share/virtualenvs/Spotify-Song-Suggester-AdyWyJ8v/bin/jupyter", line 8, in <module>
    sys.exit(main())
  File "/Users/dylansivori/.local/share/virtualenvs/Spotify-Song-Suggester-AdyWyJ8v/lib/python3.8/site-packages/jupyter_core/command.py", line 285, in main
    command = _jupyter_abspath(subcommand)
  File "/Users/dylansivori/.local/share/virtualenvs/Spotify-Song-Suggester-AdyWyJ8v/lib/python3.8/site-packages/jupyter_core/command.py", line 124, in _jupyter_abspath
    raise Exception(
Exception: Jupyter command `jupyter-labextension` not found.


In [18]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=7)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  and should_run_async(code)


In [19]:
lemmas = df_merged['Lyric'].parallel_apply(tokenize)

  and should_run_async(code)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1826), Label(value='0 / 1826'))), …

In [20]:
lemmas

  and should_run_async(code)


0        [start, pain, follow, hate, fuel, endless, que...
1        [know, imagine, wait, line, think, stand, tire...
2        [walk, unknown, live, win, victory, young, die...
3        [day, old, time, see, pretty, face, thousand, ...
4        [world, inside, secret, life, hide, darkness, ...
                               ...                        
14394    [verse, time, tear, fill, eye, past, shadow, s...
14395    [man, day, leave, say, regret, bond, break, fe...
14441    [stop, sound, good, turn, turn, know, know, st...
15306    [brother, brother, prepare, happy, way, lord, ...
15331    [mama, mama, mama, easy, girl, easy, girl, jor...
Name: Lyric, Length: 12780, dtype: object

In [21]:
id2words = corpora.Dictionary(lemmas)
corpus = [id2words.doc2bow(lemma) for lemma in lemmas]

  and should_run_async(code)


In [22]:
lda_multicore = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=id2words,
                                                        num_topics=5,
                                                        chunksize=100,
                                                        passes=10,
                                                        per_word_topics=True,
                                                        workers=6)

  and should_run_async(code)


In [23]:
# somewhat unsure of what this is doing

all_topics = lda_multicore.get_document_topics(corpus[3800], per_word_topics=True)
# for doc_topics, word_topics, phi_values in all_topics:
#     print('New Document \n')
#     print('Document topics:', doc_topics)
#     print('Word topics:', word_topics)
#     print('Phi values:', phi_values)
#     print(" ")
#     print('-------------- \n')
all_topics[0]

  and should_run_async(code)


[(3, 0.99272645)]

In [24]:
# display the print_topics from the model 
doc_topics = lda_multicore.get_document_topics(corpus, per_word_topics=True)

  and should_run_async(code)


In [25]:
doc_topics.corpus

  and should_run_async(code)


[[(0, 1),
  (1, 1),
  (2, 4),
  (3, 3),
  (4, 6),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 5),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 4),
  (13, 4),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 6),
  (22, 4),
  (23, 2),
  (24, 1),
  (25, 3),
  (26, 4),
  (27, 1),
  (28, 4),
  (29, 1),
  (30, 4),
  (31, 1),
  (32, 1),
  (33, 4),
  (34, 3),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 2),
  (40, 1),
  (41, 1),
  (42, 3),
  (43, 2),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 5),
  (50, 1),
  (51, 2),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 12)],
 [(17, 2),
  (22, 3),
  (28, 1),
  (36, 2),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 5),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 3),
  (73, 1),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1)],
 [(30, 4),
  (47, 1),
  (52, 3),
  (73, 3),
  (81, 2),
  (82, 1),
  (83,

In [26]:
len(lemmas[0])

  and should_run_async(code)


130

In [27]:
doc_topics.corpus

  and should_run_async(code)


[[(0, 1),
  (1, 1),
  (2, 4),
  (3, 3),
  (4, 6),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 5),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 4),
  (13, 4),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 6),
  (22, 4),
  (23, 2),
  (24, 1),
  (25, 3),
  (26, 4),
  (27, 1),
  (28, 4),
  (29, 1),
  (30, 4),
  (31, 1),
  (32, 1),
  (33, 4),
  (34, 3),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 2),
  (40, 1),
  (41, 1),
  (42, 3),
  (43, 2),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 5),
  (50, 1),
  (51, 2),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 12)],
 [(17, 2),
  (22, 3),
  (28, 1),
  (36, 2),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 5),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 3),
  (73, 1),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1)],
 [(30, 4),
  (47, 1),
  (52, 3),
  (73, 3),
  (81, 2),
  (82, 1),
  (83,

In [28]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_multicore, corpus, id2words)
vis

  and should_run_async(code)


In [29]:
# create a dictionary 
# keys - use topic ids from pyLDAvis visualization 
# values - topic names that you create 
# save dictionary to `vis_topic_name_dict`

###BEGIN SOLUTION
# results keep changing between runs - topic names not perfectly matching term distributions might occur 
# rest of solution assumes that number of topics is 8 
# key is order of topic ids accoridng to pyLDAvis tool 
vis_topic_name_dict = {1:"Existential", 
                       2:"Love", 
                       3:"Gangsta", 
                       4:"Religion", 
                       5:"Poppy"
                       }
###END SOLUTION

  and should_run_async(code)


In [30]:
def get_topic_id_lookup_dict(vis, vis_topic_name_dict):
    """
    Both the starting index and the ordering of topic ids bewteen the trained LDA model 
    and the viz tool are different. So we need to create a look up dictionary that maps 
    the correct association between topic ids from both sources. 
    """
    # value is order of topic ids accoridng to pyLDAvis tool 
    # key is order of topic ids according to lda model
    model_vis_tool_topic_id_lookup = vis.topic_coordinates.topics.to_dict()
    #return model_vis_tool_topic_id_lookup

    # invert dictionary so that 
    # key is order of topic ids accoridng to pyLDAvis tool 
    # value is order of topic ids according to lda model
    topic_id_lookup =  {v:k for k, v in model_vis_tool_topic_id_lookup.items()}
    
    return {v:vis_topic_name_dict[k]  for k, v in topic_id_lookup.items()}

  and should_run_async(code)


In [31]:
# use get_topic_id_lookup_dict to update vis_topic_name_dict - save result to `topic_name_dict`

###BEING SOLTUION
# now we have an updated topic id/name dict
# the topic ids correspond to how the lda model has index the topics 
# now we can use this dictionary with lda model to label our docs 
topic_name_dict = get_topic_id_lookup_dict(vis, vis_topic_name_dict)
###END SOLTUION

  and should_run_async(code)


In [32]:
def get_topic_ids_for_docs(lda_model, corpus):
    
    """
    Passes a Bag-of-Words vector into a trained LDA model in order to get the topic id of that document. 
    
    Parameters
    ----------
    lda_model: Gensim object
        Must be a trained model 
        
    corpus: nested lists of tuples, 
        i.e. [[(),(), ..., ()], [(),(), ..., ()], ..., [(),(), ..., ()]]
        
    Returns
    -------
    topic_id_list: list
        Contains topic ids for all document vectors in corpus 
    """
    
    # store topic ids for each document
    doc_topic_ids = []

    # iterature through the bow vectors for each doc
    for doc_bow in corpus:
        
        # store the topic ids for the doc
        topic_ids = []
        # store the topic probabilities for the doc
        topic_probs = []

        # list of tuples
        # each tuple has a topic id and the prob that the doc belongs to that topic 
        topic_id_prob_tuples = lda_multicore.get_document_topics(doc_bow)
        
        # iterate through the topic id/prob pairs 
        for topic_id_prob in topic_id_prob_tuples:
            
            # index for topic id
            topic_id = topic_id_prob[0]
            # index for prob that doc belongs that the corresponding topic
            topic_prob = topic_id_prob[1]

            # store all topic ids for doc
            topic_ids.append(topic_id)
            # store all topic probs for doc
            topic_probs.append(topic_prob)

        # get index for largest prob score
        max_topic_prob_ind = np.argmax(topic_probs)
        # get corresponding topic id
        max_prob_topic_id = topic_ids[max_topic_prob_ind]
        # store topic id that had the highest prob for doc being a memebr of that topic
        doc_topic_ids.append(max_prob_topic_id)
        
    return doc_topic_ids

  and should_run_async(code)


In [33]:

# use get_topic_ids_for_docs get the topic id for each doc in the corpus - save result to `doc_topic_ids`

# create a new feature in df_electronics called topic_id using `doc_topic_ids`

# iterate through topic_id and use the lookup dict `topic_name_dict` to assign each document a topic name
# save results to a new feature in df_electronics called `new_topic_name`

###BEGIN SOLUTION
# get the topic id for each doc in the corpus 
doc_topic_ids = get_topic_ids_for_docs(lda_multicore, corpus)

# create a feature for document's topic id
# topic ids from trained LDA model
df_merged["topic_id"] = doc_topic_ids

# iterate through the topic id and use the lookup table to assign each document with a topic name
df_merged["new_topic_name"] = df_merged["topic_id"].apply(lambda topic_id: topic_name_dict[topic_id])
###END SOLUTION

  and should_run_async(code)


In [34]:
cols = ["Lyric", "new_topic_name"]
df_merged[cols].head(50)

  and should_run_async(code)


Unnamed: 0,Lyric,new_topic_name
0,"It starts with pain, followed by hate. Fueled ...",Poppy
1,You say you know just who I am. But you can't ...,Existential
2,We walk alone. In the unknown. We live to win ...,Poppy
3,"A hundred days have made me older,. Since the ...",Existential
4,There's another world inside of me that you ma...,Love
5,I took a walk around the world. To ease my tro...,Existential
6,"He spends his nights in California,. Watching ...",Existential
7,It's down to this. I've got to make this life ...,Existential
8,Looking back at the beginning of this. And how...,Existential
9,"Breathe in right away, nothing seems. to fill ...",Existential


In [35]:
ohe = OneHotEncoder(use_cat_names=True)

ohe_topics = ohe.fit_transform(df_merged['new_topic_name'])
ohe_topics

  and should_run_async(code)
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,new_topic_name_Poppy,new_topic_name_Existential,new_topic_name_Love,new_topic_name_Religion,new_topic_name_Gangsta
0,1,0,0,0,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
...,...,...,...,...,...
14394,0,1,0,0,0
14395,0,1,0,0,0
14441,0,0,1,0,0
15306,1,0,0,0,0


In [36]:
df_merged.shape

  and should_run_async(code)


(12780, 24)

In [37]:
df_merged_ohe = pd.concat([df_merged, ohe_topics], axis=1)
df_merged_ohe

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,valence,tempo,time_signature,topic_id,new_topic_name,new_topic_name_Poppy,new_topic_name_Existential,new_topic_name_Love,new_topic_name_Religion,new_topic_name_Gangsta
0,12 stones,world so cold,/12-stones/world-so-cold.html,"It starts with pain, followed by hate. Fueled ...",ENGLISH,471eQ7hcJ7JdGY1NzMmUeg,55,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,0.394,186.227,3,4,Poppy,1,0,0,0,0
1,12 stones,anthem for the underdog,/12-stones/anthem-for-the-underdog.html,You say you know just who I am. But you can't ...,ENGLISH,6FFwt1ea9hJ4MfMQLywahm,59,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,0.468,93.977,3,1,Existential,0,1,0,0,0
2,12 stones,we are one,/12-stones/we-are-one.html,We walk alone. In the unknown. We live to win ...,ENGLISH,4lhqal0Hq63U2wETCeBdG1,58,0,['0DrXhci3WAyo0WJv1RBOG6'],2010-01-01,...,0.306,127.102,4,4,Poppy,1,0,0,0,0
3,3 doors down,here without you,/3-doors-down/here-without-you.html,"A hundred days have made me older,. Since the ...",ENGLISH,3NLrRZoMF0Lx6zTlYqeIo4,76,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,0.233,143.994,4,1,Existential,0,1,0,0,0
4,3 doors down,when i'm gone,/3-doors-down/when-im-gone.html,There's another world inside of me that you ma...,ENGLISH,3WbphvawbMZ8FyqDxYGdSQ,70,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,0.374,148.095,4,2,Love,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14394,whitney houston,you'll never stand alone,/whitney-houston/youll-never-stand-alone.html,Verse 1:. If there's a time when the tears sho...,ENGLISH,3KFbjIFQxMbOKdZ4S3lv4S,34,0,['6XpaIBNiVzIetEPCWDvAFP'],1998-11-17,...,0.194,103.907,4,1,Existential,0,1,0,0,0
14395,whitney houston,you're still my man,/whitney-houston/youre-still-my-man.html,You'Re Still My Man. On the day that you left ...,ENGLISH,2csRQWI7A2FjLYBQBiAIoi,42,0,['6XpaIBNiVzIetEPCWDvAFP'],1987-06-02,...,0.308,131.743,4,1,Existential,0,1,0,0,0
14441,zendaya,replay,/zendaya/replay.html,"Make it stop, sounds so good. I just can't tak...",ENGLISH,7d1CFwrBmH34gmS0Hkbfbt,67,0,['6sCbFbEjbYepqswM1vWjjs'],2013-01-01,...,0.618,140.023,4,2,Love,0,0,1,0,0
15306,jorge ben jor,brother,/jorge-ben-jor/brother.html,"Brother, Brother, Prepare one more happy way f...",ENGLISH,1LLRVZB4fz7bduFEZ22DE4,48,0,['5JYtpnUKxAzXfHEYpOeeit'],1974-01-01,...,0.972,132.249,4,4,Poppy,1,0,0,0,0


In [38]:
# user selects what song
user_selected_song = 'imagine'
user_selected_artist = 'john lennon'

song_row = df_merged_ohe[(df_merged_ohe['name'] == user_selected_song) & (df_merged_ohe['Artist'] == user_selected_artist)]
song_row

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,valence,tempo,time_signature,topic_id,new_topic_name,new_topic_name_Poppy,new_topic_name_Existential,new_topic_name_Love,new_topic_name_Religion,new_topic_name_Gangsta
3776,john lennon,imagine,/john-lennon/imagine.html,Imagine there's no heaven. It's easy if you tr...,ENGLISH,4lwh5BoHBFGjKNAF3sNNCK,25,0,['4x1nvY2FN8jxqAFA0DA02H'],1971-09-09,...,0.218,74.4,4,4,Poppy,1,0,0,0,0


In [39]:
# drop categorical data, prepare for input to model
song_row = keep_wanted_columns(song_row)
df_dropped = keep_wanted_columns(df_merged_ohe)

# instantiate Nearest Neighbors model
nn = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')

# fit the model
nn.fit(df_dropped)

  and should_run_async(code)


NearestNeighbors(algorithm='kd_tree', n_neighbors=10)

In [40]:
neigh_dist, neigh_index = nn.kneighbors(song_row)

  and should_run_async(code)


In [41]:
neigh_index = neigh_index[0][1:]
neigh_index

  and should_run_async(code)


array([ 2434, 10275,  2536,   906,  1327,  2617,  8731, 11773,  2145])

In [42]:
for index in neigh_index:
    print(df_merged['name'].iloc[index])

my wish came true
let me go
tell me why
i'm not there
girl on my mind
your love's been a long time coming
across the bridge where angels dwell
wings of my love
you've lost that lovin' feelin'


  and should_run_async(code)


In [43]:
joblib.dump(nn, '../data/NearestNeighborModelWithTopics')

  and should_run_async(code)


['../data/NearestNeighborModelWithTopics']

In [45]:
df_merged_ohe.to_csv('../data/df_with_topics.csv')

  and should_run_async(code)
