In [1]:
# import libraries
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib
import spacy
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models



In [2]:
# run once
# !python -m spacy download en_core_web_lg

  and should_run_async(code)


In [3]:
# load the large English spaCy pipeline; it has to be installed first with the
# one-time `python -m spacy download en_core_web_lg` command shown above
nlp = spacy.load("en_core_web_lg")

  and should_run_async(code)


# Baseline Nearest Neighbors Model

In [4]:
# relative directory that holds tracks.csv and lyrics.csv (passed to wrangle)
data_path = "../data/"

def wrangle(data_path):
    """
    Load the track and lyric CSV files, normalise artist and song names, and
    merge the two tables on (Artist, name).

    Parameters
    ----------
    data_path: str
        Directory containing ``tracks.csv`` and ``lyrics.csv``. A trailing
        path separator is optional.

    Returns
    -------
    pandas.DataFrame
        Inner join of the de-duplicated lyrics and tracks tables, restricted
        to rows whose ``Idiom`` is ``'ENGLISH'`` and without the
        ``duration_ms`` column. The index is filtered but not reset.
    """
    import os  # function-scope import so this cell does not depend on another one

    # raw strings: '\S' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # NOTE(review): `\S+` removes only the first word after ' - '
    # ('song - live at wembley' -> 'song at wembley'); confirm this is intended.
    strip_post_dash = r' - \S+'
    strip_version = r' version'

    # reads in the data
    df_tracks = pd.read_csv(os.path.join(data_path, 'tracks.csv'), parse_dates=['release_date'])
    df_lyrics = pd.read_csv(os.path.join(data_path, 'lyrics.csv'))

    # lyrics: drop incomplete rows, rename the merge keys, lower-case the song
    # name, turn the artist link ('/gucci-mane/') into a plain lower-case name
    # ('gucci mane') and keep one row per (Artist, name)
    df_lyrics = (
        df_lyrics
        .dropna()
        .rename(columns={'ALink': 'Artist', 'SName': 'name'})
        .assign(
            name=lambda d: d['name'].str.lower(),
            Artist=lambda d: (d['Artist']
                              .str.strip('/')
                              .str.replace('-', ' ', regex=False)
                              .str.lower()),
        )
        .drop_duplicates(subset=['Artist', 'name'])
    )

    # tracks: drop incomplete rows, strip the list punctuation around the
    # artist ("['Adele']" -> 'adele'), lower-case the song name and remove
    # acoustic/remix/version suffixes, then keep one row per (Artist, name).
    # lstrip/rstrip remove *characters* from the given set, not a fixed prefix,
    # so only the outermost brackets and quotes of the list string are removed.
    df_tracks = (
        df_tracks
        .dropna()
        .rename(columns={'artists': 'Artist'})
        .assign(
            Artist=lambda d: d['Artist'].str.lstrip("['").str.rstrip("']").str.lower(),
            name=lambda d: (d['name']
                            .str.lower()
                            .str.replace(strip_post_dash, '', regex=True)
                            .str.replace(strip_version, '', regex=True)),
        )
        .drop_duplicates(subset=['Artist', 'name'])
    )

    df_merged = pd.merge(df_lyrics, df_tracks, on=['Artist', 'name'], how='inner')

    # drop unwanted columns
    df_merged = df_merged.drop(columns=['duration_ms'])

    # keep English lyrics only
    df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']

    return df_merged

def keep_wanted_columns(df):
    """
    Select the feature columns that are fed to the nearest-neighbours model.

    Parameters
    ----------
    df: pandas.DataFrame
        Must contain every column listed below.

    Returns
    -------
    pandas.DataFrame
        Only the listed columns, in the listed order.
    """
    model_features = [
        'popularity',
        'explicit',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
    ]
    return df[model_features]

  and should_run_async(code)
  strip_post_dash = ' - \S+'


In [5]:
# build the merged lyrics + tracks table (English lyrics only) from the CSVs in data_path
df_merged = wrangle(data_path)

  and should_run_async(code)


In [6]:
# song chosen by the user; wrangle() lower-cases the 'name' and 'Artist'
# columns, so both values have to be given in lower case
user_selected_song = 'lemonade'
user_selected_artist = 'gucci mane'

# exact match on title and artist
title_matches = df_merged['name'] == user_selected_song
artist_matches = df_merged['Artist'] == user_selected_artist
song_row = df_merged[title_matches & artist_matches]

song_row

  and should_run_async(code)


Unnamed: 0,Artist,name,SLink,Lyric,Idiom,id,popularity,explicit,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
10359,gucci mane,lemonade,/gucci-mane/lemonade.html,"yeah. its Gucci,. whats up baby,. Yellow errth...",ENGLISH,6rUcS9i07F6okIe8wujs5J,61,1,['13y7CgLHjMVRMDqxdx0Xdo'],2009-12-04,...,7,-7.758,1,0.0853,0.643,3e-06,0.307,0.746,142.057,4


In [7]:
# keep only the model feature columns, for both the query row and the full table
song_row = keep_wanted_columns(song_row)
df_dropped = keep_wanted_columns(df_merged)

# instantiate Nearest Neighbors model; each query returns 5 neighbours
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')

# fit the model
# NOTE(review): the features are fitted unscaled, so wide-range columns such as
# tempo or popularity may dominate the distance - confirm whether that is intended
nn.fit(df_dropped)

  and should_run_async(code)


NearestNeighbors(algorithm='kd_tree')

In [8]:
# distances to, and row positions (within the fitted table) of, the nearest neighbours of the selected song
neigh_dist, neigh_index = nn.kneighbors(song_row)

  and should_run_async(code)


In [9]:
# Row positions of the recommended songs. They are taken straight from the
# model rather than by re-slicing `neigh_index`, so the cell can be re-run:
# slicing the variable in place fails the second time, once it is already 1-D.
# [0] selects the single query row; [1:] skips the closest hit, which is
# assumed to be the selected song itself (it is part of the fitted data).
neigh_index = nn.kneighbors(song_row, return_distance=False)[0][1:]
neigh_index

  and should_run_async(code)


array([ 8720,  4395, 10053,  9364])

In [10]:
# print the title of every recommended song (neigh_index holds row positions)
for song_name in df_merged['name'].iloc[neigh_index]:
    print(song_name)

someone like you
tainted love
drowning
handlebars


  and should_run_async(code)


In [11]:
# distances returned by the kneighbors query, closest first
neigh_dist

  and should_run_async(code)


array([[0.        , 1.84631722, 2.55340212, 2.58756591, 3.14668559]])

In [12]:
# joblib.dump(nn, '../data/NearestNeighborModel')

  and should_run_async(code)


# Natural Language Processing

In [13]:
# df_merged = df_merged[df_merged['Idiom'] == 'ENGLISH']

  and should_run_async(code)


In [14]:
import re
def clean_data(text):
    """
    Reduce a single text document to lower-case ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or a space is deleted (not
    replaced by a space), runs of spaces are collapsed to one, and the result
    is lower-cased.

    Parameters
    ----------
    text: str
    Returns
    -------
    text: str
    """
    # Deleting punctuation joins its neighbours ("a,b" -> "ab", "don't" -> "dont").
    # A separate pass that replaced :?,>$|!'" with spaces used to follow this
    # line, but it could never match once those characters were already
    # deleted, so it is omitted here; the output is unchanged.
    text = re.sub(r'[^a-zA-Z ]', '', text)
    # collapse runs of two or more spaces
    text = re.sub(r' {2,}', ' ', text)
    return text.lower()

  and should_run_async(code)
  special_chars_regex = '[:?,\>$|!\'"]'


In [15]:
def tokenize(doc):
    """
    Clean a raw text document and return the lemmas of its meaningful tokens.

    Parameters
    ----------
    doc: str
        Raw text; it is passed through clean_data() before being parsed.
    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped lemma of every token that is not a
        stop word, is not punctuation and is longer than two characters.
    """
    # run the clean_data function on the text
    text = clean_data(doc)
    # parse the cleaned text with the spaCy language model
    parsed = nlp(text)
    # set.union() treats each argument as an iterable, so bare strings would
    # contribute their individual characters; the extra words go in a set.
    stop_words = nlp.Defaults.stop_words.union({'the', 'this', '-pron-', 'wo', 've', 'nt'})
    # the length filter also rejects empty tokens, so no separate '' check is needed
    return [token.lemma_.lower().strip() for token in parsed
            if token.text not in stop_words
            and not token.is_punct
            and len(token.text) > 2]

  and should_run_async(code)


In [16]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !pip install pandarallel

  and should_run_async(code)


In [17]:
# set up pandarallel (used below for parallel_apply) with 7 workers and a progress bar
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=7)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  and should_run_async(code)


In [18]:
# tokenize every lyric in parallel; the result holds one list of lemmas per song
lemmas = df_merged['Lyric'].parallel_apply(tokenize)

  and should_run_async(code)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1826), Label(value='0 / 1826'))), …

In [19]:
# preview the token lists (the index labels are those of df_merged)
lemmas

  and should_run_async(code)


0        [start, pain, follow, hate, fuel, endless, que...
1        [know, imagine, wait, line, think, stand, tire...
2        [walk, unknown, live, win, victory, young, die...
3        [day, old, time, see, pretty, face, thousand, ...
4        [world, inside, secret, life, hide, darkness, ...
                               ...                        
14394    [verse, time, tear, fill, eye, past, shadow, s...
14395    [man, day, leave, say, regret, bond, break, fe...
14441    [stop, sound, good, turn, turn, know, know, st...
15306    [brother, brother, prepare, happy, way, lord, ...
15331    [mama, mama, mama, easy, girl, easy, girl, jor...
Name: Lyric, Length: 12780, dtype: object

In [20]:
# map every distinct lemma to an integer id, then encode each song as a
# bag of words: a list of (token_id, count) pairs
id2words = corpora.Dictionary(lemmas)
corpus = list(map(id2words.doc2bow, lemmas))

  and should_run_async(code)


In [21]:
# Train a 5-topic LDA model on the bag-of-words corpus.
# random_state seeds the model's random number generator so that topic
# assignments do not change arbitrarily between notebook runs.
# NOTE(review): with several workers the result may still vary slightly from
# run to run - confirm if exact reproducibility is required.
lda_multicore = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=id2words,
                                                        num_topics=5,
                                                        chunksize=100,
                                                        passes=10,
                                                        per_word_topics=True,
                                                        random_state=42,
                                                        workers=6)

  and should_run_async(code)


In [22]:
# For a single bag-of-words document with per_word_topics=True,
# get_document_topics returns a 3-tuple:
#   (document topic distribution, per-word topics, per-word phi values).
# Index 0 is therefore the list of (topic_id, probability) pairs for the
# document at position 3800 in `corpus`.

all_topics = lda_multicore.get_document_topics(corpus[3800], per_word_topics=True)
# NOTE(review): the loop below unpacks (doc_topics, word_topics, phi_values)
# triples, which fits a corpus-level result rather than this single-document tuple.
# for doc_topics, word_topics, phi_values in all_topics:
#     print('New Document \n')
#     print('Document topics:', doc_topics)
#     print('Word topics:', word_topics)
#     print('Phi values:', phi_values)
#     print(" ")
#     print('-------------- \n')
all_topics[0]

  and should_run_async(code)


[(1, 0.35577255), (2, 0.63884145)]

In [23]:
# topic inference for every document in the corpus; nothing is displayed here.
# Presumably gensim evaluates this lazily and yields one
# (doc_topics, word_topics, phi_values) triple per document - TODO confirm
doc_topics = lda_multicore.get_document_topics(corpus, per_word_topics=True)

  and should_run_async(code)


In [24]:
# number of tokens kept for the first song; .iloc is positional, so this does
# not rely on the label 0 having survived the filtering done in wrangle()
len(lemmas.iloc[0])

  and should_run_async(code)


130

In [25]:
# peek at the bag-of-words input wrapped by doc_topics; only the first few
# documents are shown because the full corpus floods the cell output
doc_topics.corpus[:3]

  and should_run_async(code)


[[(0, 1),
  (1, 1),
  (2, 4),
  (3, 3),
  (4, 6),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 5),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 4),
  (13, 4),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 6),
  (22, 4),
  (23, 2),
  (24, 1),
  (25, 3),
  (26, 4),
  (27, 1),
  (28, 4),
  (29, 1),
  (30, 4),
  (31, 1),
  (32, 1),
  (33, 4),
  (34, 3),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 2),
  (40, 1),
  (41, 1),
  (42, 3),
  (43, 2),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 5),
  (50, 1),
  (51, 2),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 12)],
 [(17, 2),
  (22, 3),
  (28, 1),
  (36, 2),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 5),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 3),
  (73, 1),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1)],
 [(30, 4),
  (47, 1),
  (52, 3),
  (73, 3),
  (81, 2),
  (82, 1),
  (83,

In [26]:
# build the pyLDAvis visualisation of the trained LDA model and show it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_multicore, corpus, id2words)
vis

  and should_run_async(code)
