# Song Name Hot or Not

* Goal: To generate candidates for the best or worst song name using visualizations
* Method: Using the names of each song in the db, extract the most frequent bigrams. Show the highest count relationships in a chord diagram.

## Imports

In [110]:
# base
import pandas as pd
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd
import re

# Chord/Shahin
from chord import Chord

# hv
# ref: http://holoviews.org/gallery/demos/bokeh/route_chord.html#bokeh-gallery-route-chord
import holoviews as hv
from holoviews import opts, dim
import holoviews as hv
from bokeh.themes.theme import Theme
from bokeh import palettes
import inspect

hv.extension('bokeh', 'matplotlib')
%output backend='matplotlib' fig='svg' size=200
%output backend='bokeh' size=200

## Utility Functions

In [4]:
# remove stopwords
# input: df, column name containing words
# output: df with stopwords removed

def rm_stopwords(df, name):
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    realwords = df[~df[name].isin(sw)]
    return realwords

In [5]:
# create a matrix for word in each song
# input: df with column of words, one per row, multiple duplicates of song title
# df 'realwords'
# column 'title'

def longtable_to_listoflist(realwords):

    finallist= []
    wordlist = []

    uniq_title = realwords.title.unique()

    for song in uniq_title:
        wordlist = []
        print(song)
        subset = realwords[realwords['title']==song]
        for word in subset['word']:
            wordlist.append(word)
        finallist.append(wordlist)

    print(finallist)
    return finallist

In [6]:
# generate chord chart to html
# input: matrix, names, output name
# output: html file saved to same directory

def gen_html(matrix, names, output):
    Chord(matrix, names, padding=0.05, width=1200, wrap_labels=False).to_html(output)

In [7]:
# generate matrix for chord/shahin
# limit is min co-occur #

def gen_chord(inlist, limit):
    data = list(itertools.chain.from_iterable(inlist))
    matrix, bigram, vocab_index = generate_co_occurrence_matrix(data)

    data_matrix = pd.DataFrame(matrix, index=vocab_index, columns=vocab_index)
    
    data_matrix = rematrix(data_matrix, limit)
    
    outmatrix = data_matrix.to_numpy()
    outmatrix = outmatrix.tolist()
#     outmatrix = matrix.tolist()
    names = list(data_matrix.index) 

#     print(matrix)
#     print(type(matrix))
#     print(data_matrix)
#     print(type(names))
#     print(names)
    return outmatrix, data_matrix, bigram, names

In [8]:
# build co-occurrence matrix
# input: list of lists
# output: outmatrix

def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    print('vocab len: ' + str(len(vocab)))
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freqraw = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
    bigram_freq = [bigram for bigram in bigram_freqraw if bigram[1] > 1]

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))

    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)

    # return the matrix and the index
    return co_occurrence_matrix, bigram_freq, vocab_index

In [9]:
# strip cooccurence matrix of low value weights per limit

def rematrix(df, limit):
    for idx in df.index:
        rowsum = df.loc[idx,:].sum(axis=0)
        colsum = df[idx].sum(axis=0)
#         print('Idx: ' + idx + '; sum of row: ' + str(rowsum))
        if rowsum < limit and colsum < limit:
            df.drop([idx], axis = 0, inplace=True)
            df.drop([idx], axis = 1, inplace=True)
#             print('dropping: ' + idx)
    print('Final shape: ')
    print(df.shape)
    return df

In [10]:
# remove parentheses and contents from song titles (or any string input)

def Clean_titles(title): 
    # Search for opening bracket in the name followed by 
    # any characters repeated any number of times
    if re.search('\(.*', str(title)):
        # return the cleaned name 
        return re.sub(r'\([^)]*\)', '', str(title)) # FORCE TO STRING in case of floats
    else: 
        # if clean up needed return the same name 
        return title 

In [90]:
# ref: Philipp Rudiger, @philippjfr, https://stackoverflow.com/questions/52356899/how-to-rotate-text-to-be-horizontal-on-holoview-with-bokeh

def rotate_label(plot, element):
    print('plot.handles: ', sorted(plot.handles.keys()))
    text_cds = plot.handles['text_1_source']
    length = len(text_cds.data['angle'])
    text_cds.data['angle'] = [0]*length
    xs = text_cds.data['x']
    text = np.array(text_cds.data['text'])
    xs[xs<0] -= np.array([len(t)*0.019 for t in text[xs<0]])

## Load cleaned data

In [12]:
deduped = pd.read_csv(r"../Py code/titles_clean.zip")
deduped.head()

Unnamed: 0.1,Unnamed: 0,word,count,track_id,song_id,artist_id,title,artist_name,duration,year,artist_hotttnesss
0,0,i,6,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
1,68,i,10,TRAAABD128F429CF47,SOCIWDW12A8C13D406,ARMJAGH1187FB546F3,Soul Deep,The Box Tops,148.03546,1969,0.4175
2,142,i,28,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,ARC1IHZ1187FB4E920,It's About Time,Jamie Cullum,246.9873,0,0.562061
3,262,i,5,TRAAAEF128F4273421,SONHOTT12A8C13493C,AR7G5I41187FB4CE6C,Something Girls,Adam Ant,233.40363,1982,0.454231
4,321,i,4,TRAAAEW128F42930C0,SODZYPO12A8C13A91E,AR1C2IX1187B99BF74,Burn My Body,Broken Spindles,177.99791,0,0.378545


### Set parameters

In [294]:
# set thresholds

hot_thresh = 0.7 # recommend > 0.7
not_thresh = 0.25
limit = 4

# caution INPLACE
dropcols = ['track_id', 'song_id', 'artist_id', 'duration']

In [295]:
# filter down to hotties
hotties = deduped[deduped['artist_hotttnesss'] > hot_thresh]
hotties.drop(columns=dropcols, inplace=True)
unique_hotties = hotties.title.unique()
print(str(len(unique_hotties)) + ' unique songs with hotness greater than ' + str(hot_thresh))
print(unique_hotties)

2508 unique songs with hotness greater than 0.7
["You're No Good" "Let's Get It Started"
 'Low [feat. T-Pain] [Travis Barker Remix]' ... 'Me and Your Cigarettes'
 'Sin For A Sin' 'Solo Dolo ']


In [296]:
# filter down to NOTties, doing yourself no favors with naming convention here
notties = deduped[(deduped['artist_hotttnesss'] < not_thresh) & (deduped['artist_hotttnesss'] > 0)]
notties.drop(columns=dropcols, inplace=True)
unique_notties = notties.title.unique()
print(str(len(unique_notties)) + ' unique songs with hotness less than ' + str(not_thresh))
print(unique_notties)

1613 unique songs with hotness less than 0.25
['Next Time' 'Überlebensgroß' 'Non So Perchè ' ... 'Chills'
 'Yo se que te amare ' 'Foggy Mountain Special']


### Build the title data

In [300]:
# create a matrix for word in each song
# input: list of song names
# output: list of lists, each song name split into component words

hotlist = []
notlist = []

for song in unique_hotties:
    subset = song.split()
    hotlist.append(subset)
    
for song in unique_notties:
    subset = song.split()
    notlist.append(subset)
        
print('Hotlist: ')
# print(hotlist)
print('\nNotlist: ')
# print(notlist)

Hotlist: 

Notlist: 


In [298]:
# hot_mat, hot_data, hot_bigrams, hot_names - explain each one
# hot_data is the filtered bigram list, cutting anything where sum or row or col is less than thresh
# hot_bigram is the straight bigram from sklarn
hot_mat, hot_data, hot_bigrams, hot_names = gen_chord(hotlist, limit)

vocab len: 2658
Final shape: 
(122, 122)


In [17]:
not_mat, not_data, not_bigrams, not_names = gen_chord(notlist, limit)

vocab len: 2388
Final shape: 
(45, 45)


### Convert hot_bigrams and hot_names to df for hv

In [61]:
# hot_bigrams

In [235]:
def convBiToDF(df):
    dfbigrams = pd.DataFrame(df, columns = ['Bi', 'Ct']) 
    dfbigrams.head()
    src, dest = list(zip(*dfbigrams['Bi']))
    print('Check Source Len: ' + str(len(src)) + ' and dest len: ' + str(len(dest)))
    dfbigrams['src'] = src
    dfbigrams['dest'] = dest
    dfbigrams.head()
    return dfbigrams

In [463]:
# Choose either _data or _bigram

hotbi = convBiToDF(hot_bigrams)

Check Source Len: 654 and dest len: 654


In [464]:
# hot_data

### Give it the chop

In [465]:
hotbi

Unnamed: 0,Bi,Ct,src,dest
0,"(In, The)",34,In,The
1,"([Live, From)",23,[Live,From
2,"(On, The)",21,On,The
3,"(Of, The)",20,Of,The
4,"(On, My)",16,On,My
...,...,...,...,...
649,"(Young, And)",2,Young,And
650,"(Up, To)",2,Up,To
651,"(I, Keep)",2,I,Keep
652,"(Don't, Think)",2,Don't,Think


In [466]:
# remove brackets
hotbi['src'] =  hotbi['src'].apply(lambda x: x.replace('[','').replace(']',''))
hotbi['dest'] =  hotbi['dest'].apply(lambda x: x.replace('[','').replace(']',''))

In [467]:
# drop 'the' entirely
# hotbi = hotbi[(hotbi['src'] != 'The') & (hotbi['dest'] != 'The')]

In [468]:
# count word appearances per src and dest
srcct = hotbi['src'].value_counts()
destct = hotbi['dest'].value_counts()
thescaler = 14
srcct['The'] = srcct['The'] / thescaler
destct['The'] = destct['The'] / thescaler
srcct['You'] = srcct['You'] / thescaler
destct['You'] = destct['You'] / thescaler
srcct['I'] = 2* srcct['I'] / thescaler
destct['I'] = 2* destct['I'] / thescaler
srcct['A'] = 2* srcct['A'] / thescaler
destct['A'] = 2* destct['A'] / thescaler
srcct['Me'] = 3* srcct['Me'] / thescaler
destct['Me'] = 3* destct['Me'] / thescaler
hotbi['TotalCt'] = hotbi.apply(lambda row: srcct[row.src] + destct[row.dest], axis = 1)

In [469]:
# scale the word the
# def thescale(word,ct):
#     if word == 'The':
#         return ct / 10
#     else:
#         return ct * 10

# hotbi['AdjCt'] =  hotbi.apply(lambda row: thescale(row.src, row.TotalCt), axis=1)
# hotbi['AdjCt'] =  hotbi.apply(lambda row: thescale(row.dest, row.TotalCt), axis=1)

In [470]:
thresh = 18
hotbifilter = hotbi[hotbi['TotalCt'] > thresh]
hotbifilter.describe()
# hotbifilter.head()

Unnamed: 0,Ct,TotalCt
count,57.0,57.0
mean,3.736842,21.842105
std,2.722435,3.682921
min,2.0,19.0
25%,2.0,19.0
50%,3.0,20.0
75%,4.0,23.0
max,16.0,32.0


### Generate and save hv

In [471]:
# Declare a gridded HoloViews dataset and call dframe to flatten it
# hotdata = hv.Dataset((list(hotbi['src']), list(hotbi['dest']), hotbi['Ct']),
#                   ['SourceWord', 'DestWord'], 'Links').dframe()

hotdata = hv.Dataset((list(hotbifilter['src']), list(hotbifilter['dest']), hotbifilter['TotalCt']),
                  ['SourceWord', 'DestWord'], 'Links').dframe()
hotdata.describe()
hotdata = hotdata[hotdata.Links != 0]

In [472]:
hotdata

Unnamed: 0,SourceWord,DestWord,Links
0,On,My,23
1,Live,In,20
2,In,Love,30
3,I,Love,19
4,In,My,28
5,Love,The,20
6,My,Life,19
7,To,Be,21
8,My,Mind,19
9,To,Me,20


In [473]:
hotchord = hv.Chord(hotdata)

# title = "Hotties", fontscale=3,

hotchord.opts(
    node_color='index', node_alpha=1, align='center',
    edge_cmap='Category20', edge_color='SourceWord', edge_line_width=3, edge_line_alpha=0.7,
    cmap='Category20', width=700, height=700, bgcolor='#2F2F2F',
    labels='index', hooks=[rotate_label], label_text_color='#F5F5F5', 
    label_text_font_size='14px', label_text_align='center')

plot.handles:  ['color_dim', 'edge_color_color_mapper', 'glyph_renderer', 'hover', 'layout_source', 'multi_line_1_glyph', 'multi_line_1_glyph_renderer', 'multi_line_1_source', 'multi_line_2_glyph', 'multi_line_2_glyph_renderer', 'multi_line_2_source', 'node_color_color_mapper', 'plot', 'previous_id', 'scatter_1_glyph', 'scatter_1_glyph_renderer', 'scatter_1_source', 'static_source', 'text_1_glyph', 'text_1_glyph_renderer', 'text_1_source', 'x_range', 'xaxis', 'y_range', 'yaxis']


In [95]:
opts.Chord()

Options('Chord')

In [474]:
hv.save(hotchord, 'hotchord_dark.html', fmt='html')

plot.handles:  ['color_dim', 'edge_color_color_mapper', 'glyph_renderer', 'hover', 'layout_source', 'multi_line_1_glyph', 'multi_line_1_glyph_renderer', 'multi_line_1_source', 'multi_line_2_glyph', 'multi_line_2_glyph_renderer', 'multi_line_2_source', 'node_color_color_mapper', 'plot', 'previous_id', 'scatter_1_glyph', 'scatter_1_glyph_renderer', 'scatter_1_source', 'static_source', 'text_1_glyph', 'text_1_glyph_renderer', 'text_1_source', 'x_range', 'xaxis', 'y_range', 'yaxis']


### Potential names

Hotties

'hotties2093_limit4.html'
SONG TITLE - potential album art
"Rest The Day, Out All Night" - Kesha bathtub, Last Friday Night
"Waiting On A Star" - Twinkle twinkle
"You Ain't Got It" - No Scrubs
"Where You Go On My Heart" - celine dion
"The End Of The World" - it's the end of the world as we know
"Wake Me Up On The Way" - country roads
"One More 'NO'"
"You Can't Take Me"
"Never Too Long"
"Name My Heart Your Heart"


Notties

'notties1008_limit3.html'
"Can't I Love Me"
"Don't You Love Me"

'notties2388_limit3.html'
"I Don't Do It" - meatloaf
"Man In My Baby" - nirvana baby swimming
"Life Of A Fool" - lovefool
"Don't Let Me Down"
"That Girl That I Can't Be"
"Now You Are My Man" - crazy girlfriend face
"Your In My Man"