In [1]:
# import libraries
import os
import re

import gensim
import gensim.corpora as corpora
import joblib
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import spacy
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from sklearn.neighbors import NearestNeighbors



In [2]:
# run once
# !python -m spacy download en_core_web_lg

  and should_run_async(code)


In [3]:
# Load the large English spaCy pipeline. `nlp` is called by tokenize() to parse
# the cleaned lyrics; the model must have been downloaded first (see the
# commented-out download command in the previous cell).

nlp = spacy.load("en_core_web_lg")

  and should_run_async(code)


# Baseline Nearest Neighbors Model

In [4]:
# Directory containing tracks.csv and lyrics.csv; passed to wrangle()
data_path = "../data/"

def wrangle(data_path):
    """
    Load the tracks and lyrics CSV files, normalise their join keys and merge them.

    Parameters
    ----------
    data_path: str or path-like
        Directory that contains 'tracks.csv' and 'lyrics.csv'. A trailing
        path separator is optional.

    Returns
    -------
    df_merged: pandas.DataFrame
        One row per (Artist, name) pair present in both files, restricted to
        rows whose 'Idiom' is 'ENGLISH' and without the 'duration_ms' column.
        The index is not reset after the language filter, so its labels are
        not contiguous; use positional access (.iloc) with positional indices.
    """
    # Patterns that fold alternate releases onto the base title. Raw strings are
    # used because '\S' inside a normal string literal is an invalid escape.
    # NOTE(review): ' - \S+' removes only the first word after the dash
    # ("song - live at x" -> "song at x") and ' version' is removed anywhere in
    # the title; confirm the intent before widening or anchoring the patterns.
    strip_post_dash = r' - \S+'
    strip_version = r' version'

    # read the data; rows with any null value are dropped from both tables
    df_tracks = pd.read_csv(os.path.join(data_path, 'tracks.csv'), parse_dates=['release_date']).dropna()
    df_lyrics = pd.read_csv(os.path.join(data_path, 'lyrics.csv')).dropna()

    # lyrics side: rename to the shared key names, then normalise the keys
    # (an ALink such as '/12-stones/' becomes '12 stones')
    df_lyrics = df_lyrics.rename(columns={'ALink': 'Artist', 'SName': 'name'})
    df_lyrics['name'] = df_lyrics['name'].str.lower()
    df_lyrics['Artist'] = (
        df_lyrics['Artist']
        .str.strip('/')
        .str.replace('-', ' ', regex=False)
        .str.lower()
    )
    df_lyrics = df_lyrics.drop_duplicates(subset=['Artist', 'name'])

    # tracks side: "['artist']" -> "artist"; lstrip/rstrip remove any leading or
    # trailing run of the characters [ ] and ', not the literal prefix/suffix
    df_tracks = df_tracks.rename(columns={'artists': 'Artist'})
    df_tracks['Artist'] = df_tracks['Artist'].str.lstrip("['").str.rstrip("']").str.lower()
    df_tracks['name'] = (
        df_tracks['name']
        .str.lower()
        .str.replace(strip_post_dash, '', regex=True)
        .str.replace(strip_version, '', regex=True)
    )
    # after stripping, several releases can share a title: keep the first one
    df_tracks = df_tracks.drop_duplicates(subset=['Artist', 'name'])

    df_merged = pd.merge(df_lyrics, df_tracks, on=['Artist', 'name'], how='inner')

    # drop unwanted columns
    df_merged = df_merged.drop(columns=['duration_ms'])

    # keep English lyrics only
    return df_merged[df_merged['Idiom'] == 'ENGLISH']

def keep_wanted_columns(df):
    """
    Select the feature columns that are fed to the Nearest Neighbors model.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame containing at least the feature columns listed below.

    Returns
    -------
    pandas.DataFrame
        Only the feature columns, in the order listed below.
    """
    feature_columns = [
        'popularity',
        'explicit',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
    ]
    return df[feature_columns]

  and should_run_async(code)
  strip_post_dash = ' - \S+'


In [5]:
# Build the merged lyrics + track-features table (English lyrics only)
df_merged = wrangle(data_path)

  and should_run_async(code)


In [30]:
# (rows, columns) of the merged table.
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours, so it was run after later cells; re-run top to bottom to confirm.
df_merged.shape

  and should_run_async(code)


(12780, 22)

In [6]:
# Preview the first rows of the merged table
df_merged.head()

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,12 stones,world so cold,/12-stones/world-so-cold.html,"It starts with pain, followed by hate. Fueled ...",ENGLISH,471eQ7hcJ7JdGY1NzMmUeg,55,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,4,-4.949,0,0.0443,0.472,0.0,0.36,0.394,186.227,3
1,12 stones,anthem for the underdog,/12-stones/anthem-for-the-underdog.html,You say you know just who I am. But you can't ...,ENGLISH,6FFwt1ea9hJ4MfMQLywahm,59,0,['0DrXhci3WAyo0WJv1RBOG6'],2007-01-01,...,5,-3.424,1,0.0626,0.000651,0.0,0.339,0.468,93.977,3
2,12 stones,we are one,/12-stones/we-are-one.html,We walk alone. In the unknown. We live to win ...,ENGLISH,4lhqal0Hq63U2wETCeBdG1,58,0,['0DrXhci3WAyo0WJv1RBOG6'],2010-01-01,...,5,-4.041,0,0.114,0.000119,1.6e-05,0.233,0.306,127.102,4
3,3 doors down,here without you,/3-doors-down/here-without-you.html,"A hundred days have made me older,. Since the ...",ENGLISH,3NLrRZoMF0Lx6zTlYqeIo4,76,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,10,-6.817,0,0.0252,0.0492,0.0,0.205,0.233,143.994,4
4,3 doors down,when i'm gone,/3-doors-down/when-im-gone.html,There's another world inside of me that you ma...,ENGLISH,3WbphvawbMZ8FyqDxYGdSQ,70,0,['2RTUTCvo6onsAnheUk3aL9'],2002-11-12,...,7,-5.611,1,0.0284,0.00385,0.0,0.103,0.374,148.095,4


In [7]:
# Feature matrix for the Nearest Neighbors model: one row per song, feature
# columns only. Row labels keep df_merged's non-contiguous index.
df_dropped = keep_wanted_columns(df_merged)
df_dropped

  and should_run_async(code)


Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,55,0,0.262,0.688,4,-4.949,0,0.0443,0.472000,0.000000,0.360,0.394,186.227,3
1,59,0,0.200,0.863,5,-3.424,1,0.0626,0.000651,0.000000,0.339,0.468,93.977,3
2,58,0,0.444,0.974,5,-4.041,0,0.1140,0.000119,0.000016,0.233,0.306,127.102,4
3,76,0,0.557,0.533,10,-6.817,0,0.0252,0.049200,0.000000,0.205,0.233,143.994,4
4,70,0,0.530,0.768,7,-5.611,1,0.0284,0.003850,0.000000,0.103,0.374,148.095,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14394,34,0,0.703,0.488,9,-6.943,1,0.0254,0.105000,0.000000,0.272,0.194,103.907,4
14395,42,0,0.548,0.505,10,-9.496,1,0.0295,0.684000,0.000004,0.226,0.308,131.743,4
14441,67,0,0.688,0.723,6,-6.682,0,0.1210,0.041600,0.000200,0.302,0.618,140.023,4
15306,48,0,0.730,0.452,2,-11.125,1,0.0378,0.416000,0.000000,0.233,0.972,132.249,4


In [8]:
# display(df_merged.shape)
# df_merged.tail()

  and should_run_async(code)


In [9]:
# df_tracks = pd.read_csv(data_path + 'tracks.csv', parse_dates=['release_date'])
# df_lyrics = pd.read_csv(data_path + 'lyrics.csv') #usecols=['SName', 'Lyric']
# df_tracks.dropna(inplace=True)
    
# # case normalization on song names
# df_lyrics['SName'] = df_lyrics['SName'].apply(lambda x: x.lower())
# df_tracks['name'] = df_tracks['name'].apply(lambda x: x.lower())

# # merge dataframes
# df_merged = pd.merge(df_tracks, df_lyrics, how='inner', left_on=['name'], right_on='SName')

# # drop columns not used in NN model
# df_merged = df_merged.drop(columns= ['id', 'name', 'artists', 'id_artists', 'release_date', 'duration_ms'])

  and should_run_async(code)


In [10]:
# Query song. Both values are lower-case because wrangle() lower-cases song
# titles and artist names before merging.
user_selected_song = 'stairway to heaven'
user_selected_artist = 'led zeppelin'

  and should_run_async(code)


In [11]:
# Look up the query song by exact (title, artist) match; the result is an empty
# frame when no row matches.
song_row = df_merged[(df_merged['name'] == user_selected_song) & (df_merged['Artist'] == user_selected_artist)]

  and should_run_async(code)


In [12]:
# Show the matched row(s) for the query song
song_row

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
4229,led zeppelin,stairway to heaven,/led-zeppelin/stairway-to-heaven.html,There's a lady who's sure all. that glitters i...,ENGLISH,5CQ30WqJwcep0pYcV4AMNc,79,0,['36QJpDe2go2KgaRleHCDTp'],1971-11-08,...,9,-12.049,0,0.0339,0.58,0.0032,0.116,0.197,82.433,4


In [13]:
# keep only the model's feature columns for the query song, so that it has the
# same columns as the matrix the model is fitted on
song_row = keep_wanted_columns(song_row)

# df_dropped (the matrix fitted below) was built in an earlier cell with:
# df_dropped = keep_wanted_columns(df_merged)

# instantiate Nearest Neighbors model: 5 neighbours per query, k-d tree index
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')

# NOTE(review): the features are fitted unscaled, so wide-range columns such as
# tempo and popularity presumably dominate the distance — consider scaling.

nn.fit(df_dropped)

  and should_run_async(code)


NearestNeighbors(algorithm='kd_tree')

In [14]:
# Distances to, and positions (row numbers in df_dropped) of, the nearest
# neighbours of the query song; both arrays have one row per query.
neigh_dist, neigh_index = nn.kneighbors(song_row)

  and should_run_async(code)


In [15]:
# Positions of the neighbouring songs for the single query row, skipping the
# first hit (the query song's own row when it is part of the fitted data).
# The positions are recomputed from the model instead of re-slicing
# `neigh_index` in place, so re-running this cell gives the same result rather
# than failing on the already-sliced array.
neigh_index = nn.kneighbors(song_row, return_distance=False)[0][1:]
neigh_index

  and should_run_async(code)


array([10652, 11440,  9658,  9371])

In [16]:
# NOTE(review): 11904 is a hard-coded position that is not one of the neighbour
# positions shown for the previous cell — looks like a leftover spot check;
# confirm it is still needed or remove it.
print(df_merged['name'].iloc[11904])

no control


  and should_run_async(code)


In [17]:
# Print the titles of the neighbouring songs. neigh_index holds row positions
# (not index labels), hence the positional .iloc lookup.
for song_title in df_merged['name'].iloc[neigh_index]:
    print(song_title)

the a team
royals
take a bow
self control


  and should_run_async(code)


In [18]:
# Distances from the query song to each returned neighbour, nearest first
neigh_dist

  and should_run_async(code)


array([[0.        , 4.032528  , 4.67054588, 5.08803819, 5.14691675]])

In [19]:
# joblib.dump(nn, '../data/NearestNeighborModel')

  and should_run_async(code)


# Natural Language Processing

In [20]:
# df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']

  and should_run_async(code)


In [21]:
import re
def clean_data(text):
    """
    Accepts a single text document and performs several regex substitutions in order to clean the document.

    Every whitespace character is treated as a word separator, every character
    that is not an ASCII letter or a space is removed (so "don't" becomes
    "dont"), runs of spaces are collapsed to one and the result is lower-cased.
    Leading and trailing spaces are not stripped.

    Parameters
    ----------
    text: string
    Returns
    -------
    text: string
    """
    # Newlines, tabs and other whitespace separate words; turn them into spaces
    # first, otherwise the next substitution deletes them and fuses the words
    # on either side ("one\ntwo" -> "onetwo").
    text = re.sub(r'\s+', ' ', text)
    # Remove everything that is not a letter or a space. This already covers
    # punctuation such as : ? , > $ | ! ' " so no separate pass is needed.
    text = re.sub(r'[^a-zA-Z ]', '', text)
    # Removing characters can leave several spaces in a row ("a - b" -> "a  b")
    text = re.sub(r' {2,}', ' ', text)
    return text.lower()

  and should_run_async(code)
  special_chars_regex = '[:?,\>$|!\'"]'


In [46]:
def tokenize(doc):
    """
    Clean a single text document and return the lemmas of its kept tokens.

    A token is kept when its text is not a stop word, it is not punctuation and
    its text is longer than two characters. The filters look at the token text;
    the returned value is the token's lower-cased, stripped lemma.

    Parameters
    ----------
    doc: string
    Returns
    -------
    list of string
    """
    # run the clean_data function on the text
    text = clean_data(doc)
    # parse the cleaned text with the spaCy language model
    parsed = nlp(text)
    # set.union() iterates over each argument, so the extra words must be passed
    # as ONE collection; passing bare strings would add their individual
    # characters ('t', 'h', 'e', ...) instead of the words.
    stop_words = nlp.Defaults.stop_words.union({'', 'the', 'this', '-pron-', 'wo', 've', 'nt'})
    return [token.lemma_.lower().strip() for token in parsed
            if token.text not in stop_words
            and not token.is_punct
            and len(token.text) > 2]

  and should_run_async(code)


In [47]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !pip install pandarallel

  and should_run_async(code)


In [48]:
# pandarallel provides the parallel_apply method used in the next cell to
# tokenize the lyrics across worker processes.
# nb_workers is hard-coded to 7 — TODO confirm it suits the machine running
# the notebook.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=7)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  and should_run_async(code)


In [49]:
# Tokenize every lyric in parallel: one list of lemmas per song
lemmas = df_merged['Lyric'].parallel_apply(tokenize)

  and should_run_async(code)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1826), Label(value='0 / 1826'))), …

In [50]:
# Inspect the tokenized lyrics
lemmas

  and should_run_async(code)


0        [start, pain, follow, hate, fuel, endless, que...
1        [know, imagine, wait, line, think, stand, tire...
2        [walk, unknown, live, win, victory, young, die...
3        [day, old, time, see, pretty, face, thousand, ...
4        [world, inside, secret, life, hide, darkness, ...
                               ...                        
14394    [verse, time, tear, fill, eye, past, shadow, s...
14395    [man, day, leave, say, regret, bond, break, fe...
14441    [stop, sound, good, turn, turn, know, know, st...
15306    [brother, brother, prepare, happy, way, lord, ...
15331    [mama, mama, mama, easy, girl, easy, girl, jor...
Name: Lyric, Length: 12780, dtype: object

In [51]:
# Build the gensim dictionary from the lemma lists, then encode each song's
# lemma list as a bag-of-words with that dictionary.
id2words = corpora.Dictionary(lemmas)
corpus = list(map(id2words.doc2bow, lemmas))

  and should_run_async(code)


In [54]:
# Fit a 5-topic LDA model on the bag-of-words corpus.
# random_state fixes the seed of the otherwise random initialisation so that
# re-running the notebook gives comparable topics (42 is an arbitrary seed).
# NOTE(review): training with several workers may still vary slightly between
# runs — verify if exact reproducibility matters.
lda_multicore = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=id2words,
                                                        num_topics=5,
                                                        chunksize=100,
                                                        passes=10,
                                                        per_word_topics=True,
                                                        workers=6,
                                                        random_state=42)

  and should_run_async(code)


In [55]:
# Prepare the pyLDAvis visualisation of the fitted LDA model and display it
# inline as the cell's output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_multicore, corpus, id2words)
vis

  and should_run_async(code)
