# Song Name Hot or Not

* Goal: To generate candidates for the best or worst song name using visualizations
* Method: Using the names of each song in the db, extract the most frequent bigrams. Show the highest count relationships in a chord diagram.

## Imports

In [None]:
# base
import pandas as pd
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd
import re

# Chord/Shahin
from chord import Chord

# hv
import holoviews as hv
from holoviews import opts, dim

## Utility Functions

In [None]:
# remove stopwords
# input: df, column name containing words
# output: df with stopwords removed

def rm_stopwords(df, name):
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    realwords = df[~df[name].isin(sw)]
    return realwords

In [None]:
# create a matrix for word in each song
# input: df with column of words, one per row, multiple duplicates of song title
# df 'realwords'
# column 'title'

def longtable_to_listoflist(realwords):

    finallist= []
    wordlist = []

    uniq_title = realwords.title.unique()

    for song in uniq_title:
        wordlist = []
        print(song)
        subset = realwords[realwords['title']==song]
        for word in subset['word']:
            wordlist.append(word)
        finallist.append(wordlist)

    print(finallist)
    return finallist

In [62]:
# generate chord chart to html
# input: matrix, names, output name
# output: html file saved to same directory

def gen_html(matrix, names, output):
    Chord(matrix, names, padding=0.05, width=1200, wrap_labels=False).to_html(output)

In [None]:
# generate matrix for chord/shahin
# limit is min co-occur #

def gen_chord(inlist, limit):
    data = list(itertools.chain.from_iterable(inlist))
    matrix, bigram, vocab_index = generate_co_occurrence_matrix(data)

    data_matrix = pd.DataFrame(matrix, index=vocab_index, columns=vocab_index)
    
    data_matrix = rematrix(data_matrix, limit)
    
    outmatrix = data_matrix.to_numpy()
    outmatrix = outmatrix.tolist()
#     outmatrix = matrix.tolist()
    names = list(data_matrix.index) 

#     print(matrix)
#     print(type(matrix))
#     print(data_matrix)
#     print(type(names))
#     print(names)
    return outmatrix, data_matrix, bigram, names

In [None]:
# build co-occurrence matrix
# input: list of lists
# output: outmatrix

def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    print('vocab len: ' + str(len(vocab)))
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freqraw = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
    bigram_freq = [bigram for bigram in bigram_freqraw if bigram[1] > 1]

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))

    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)

    # return the matrix and the index
    return co_occurrence_matrix, bigram_freq, vocab_index

In [None]:
# strip cooccurence matrix of low value weights per limit

def rematrix(df, limit):
    for idx in df.index:
        rowsum = df.loc[idx,:].sum(axis=0)
        colsum = df[idx].sum(axis=0)
#         print('Idx: ' + idx + '; sum of row: ' + str(rowsum))
        if rowsum < limit and colsum < limit:
            df.drop([idx], axis = 0, inplace=True)
            df.drop([idx], axis = 1, inplace=True)
#             print('dropping: ' + idx)
    print('Final shape: ')
    print(df.shape)
    return df

In [None]:
# remove parentheses and contents from song titles (or any string input)

def Clean_titles(title): 
    # Search for opening bracket in the name followed by 
    # any characters repeated any number of times
    if re.search('\(.*', str(title)):
        # return the cleaned name 
        return re.sub(r'\([^)]*\)', '', str(title)) # FORCE TO STRING in case of floats
    else: 
        # if clean up needed return the same name 
        return title 

## Load cleaned data

In [None]:
deduped = pd.read_csv(r"../Py code/titles_clean.zip")
deduped.head()

### Set parameters

In [74]:
# set thresholds

hot_thresh = 0.75 # recommend > 0.7
not_thresh = 0.25
limit = 4

In [70]:
# filter down to hotties
hotties = deduped[deduped['artist_hotttnesss'] > hot_thresh]
hotties.drop(columns=dropcols, inplace=True)
unique_hotties = hotties.title.unique()
print(str(len(unique_hotties)) + ' unique songs with hotness greater than ' + str(hot_thresh))
print(unique_hotties)

1739 unique songs with hotness greater than 0.75
["Let's Get It Started" 'Low [feat. T-Pain] [Travis Barker Remix]'
 'Kite Live from Sydney' ... 'I Just Wanna Know' 'The Calm' 'Solo Dolo ']


In [55]:
# filter down to NOTties, doing yourself no favors with naming convention here
notties = deduped[(deduped['artist_hotttnesss'] < not_thresh) & (deduped['artist_hotttnesss'] > 0)]
notties.drop(columns=dropcols, inplace=True)
unique_notties = notties.title.unique()
print(str(len(unique_notties)) + ' unique songs with hotness less than ' + str(not_thresh))
print(unique_notties)

1613 unique songs with hotness less than 0.25
['Next Time' 'Überlebensgroß' 'Non So Perchè ' ... 'Chills'
 'Yo se que te amare ' 'Foggy Mountain Special']


### Build the title data

In [71]:
# create a matrix for word in each song
# input: list of song names
# output: list of lists, each song name split into component words

hotlist = []
notlist = []

for song in unique_hotties:
    subset = song.split()
    hotlist.append(subset)
    
for song in unique_notties:
    subset = song.split()
    notlist.append(subset)
        
print('Hotlist: ')
# print(hotlist)
print('\nNotlist: ')
# print(notlist)

Hotlist: 

Notlist: 


In [75]:
# hot_mat, hot_data, hot_bigrams, hot_names - explain each one
hot_mat, hot_data, hot_bigrams, hot_names = gen_chord(hotlist, limit)

vocab len: 2093
Final shape: 
(82, 82)


In [58]:
not_mat, not_data, not_bigrams, not_names = gen_chord(notlist, limit)

vocab len: 2388
Final shape: 
(58, 58)


### Generate and save the chord plot

In [76]:
name = 'hotties2093_limit4.html'
gen_html(hot_mat, hot_names, name)

In [63]:
name = 'notties2388_limit3.html'
gen_html(not_mat, not_names, name)

### Potential names

Hotties

'hotties2093_limit4.html'
SONG TITLE - potential album art
"Rest The Day, Out All Night" - Kesha bathtub, Last Friday Night
"Waiting On A Star" - Twinkle twinkle
"You Ain't Got It" - No Scrubs
"Where You Go On My Heart" - celine dion
"The End Of The World" - it's the end of the world as we know
"Wake Me Up On The Way" - country roads
"One More 'NO'"
"You Can't Take Me"
"Never Too Long"
"Name My Heart Your Heart"


Notties

'notties1008_limit3.html'
"Can't I Love Me"
"Don't You Love Me"

'notties2388_limit3.html'
"I Don't Do It" - meatloaf
"Man In My Baby" - nirvana baby swimming
"Life Of A Fool" - lovefool
"Don't Let Me Down"
"That Girl That I Can't Be"
"Now You Are My Man" - crazy girlfriend face
"Your In My Man"

## STOP HERE
### Generate title data from base file

In [None]:
# previously saved file csv file  
# matched.to_csv(r'../words_songs_matched.zip', index = False)

In [None]:
# reading csv file  
# matched = pd.read_csv(r"../words_songs_matched.zip") 

In [None]:
# check the data
print(matched.shape)
matched.head()

### Selective data drop

In [None]:
# no longer used, caution INPLACE

dropcols = ['track_id', 'song_id', 'artist_id', 'duration']

# matched.drop(columns=dropcol, inplace=True)
# matched.head()

### Extract unique songs and save file

In [None]:
# drop duplicates against song title
deduped = matched.drop_duplicates(subset="title", keep='first', inplace=False)
print(deduped.shape)
deduped.head()

In [None]:
deduped['title'] = deduped['title'].apply(Clean_titles)
print(deduped.shape)
deduped.head()

In [None]:
# deduped.to_csv(r"../Py code/titles_clean.zip", index=False)

## Data Review Only

In [None]:
hot_bigrams

# for bigram in hot_bigrams:
#     current = bigram[0][1]
#     print('current: ' + current)
#     previous = bigram[0][0]
#     print('prev: ' + previous)
#     count = bigram[1]
#     print('count: ' + str(count))

In [None]:
z = len(hot_bigrams)
for i in range(z):
    if hotbigrams[i][1] > 1:
        print(hotbigrams[i])

In [None]:
# here is the trick part, need to rip all the edges which are very light weight

print('Hotties len: ' + str(len(hot_names)))
print('Hotties len: ' + str(len(np.unique(np.array(hot_names)))))

In [None]:
hot_data['Too'].sum

In [None]:
hot_data.index

In [None]:
hot_data

### Test with sklearn vectorizer

In [None]:
# test sklearn countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 1)) 
co_occurrences = bigram_vectorizer.fit_transform(unique_hotties)

In [None]:
print('Printing sparse matrix:', co_occurrences)
print('Printing dense matrix', co_occurrences.todense())
sum_occ = np.sum(co_occurrences.todense(),axis=0)
print('Sum of word-word occurrences:', sum_occ)
print('Pretty printig of co_occurrences count:')
z = zip(bigram_vectorizer.get_feature_names(),np.array(sum_occ)[0].tolist())
print(z)

In [None]:
df = pd.DataFrame(z)
print(df)