# Lyrical Creativity Score 
Charlie Baker (Charles.d.baker.24@dartmouth.edu) <br>
Will Toth (william.a.toth.23@dartmouth.edu) <br>
Dartmouth College, LING48, Spring 2022 (5/23/2022)

Input: csv file with lyrics and other data on over 28,000 songs

Output: Most creative genres, most and least lyrically creative artists, Creativity score for any song in the database you type in

This code parses through lyrics for every song in the database and returns a creativity score based on 3 different metrics. The last cell of the program asks for a song and returns its creativity score and its breakdown.

In [1]:
#importing
import pandas as pd
import io
import nltk
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import math

#importing stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
#uploading our csv file with all the songs and their data
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['tcc_ceds_music.csv']))

Saving tcc_ceds_music.csv to tcc_ceds_music (1).csv


In [6]:
wordDict = {} #Dict containing all of the words that show up in the lyrics which aren't stopwords as keys and their counts as values
dictOfSongDicts = {} #Dict of song dicts which contain all the non-stopwords words in the song and their counts
totalWords = {} #dictionary where song names are the keys and its values are the length of the lyrics without stopwords

for i in range(df.shape[0]): #loops through all of the songs
  songDict = {}
  song = df.at[i, 'lyrics']
  lyrics = song.split( )
  lyricsNoStopWords = [lyric for lyric in lyrics if lyric not in stopwords]

  #loops through every individual word in the song lyrics that isn't a stopword
  for word in lyricsNoStopWords:
    if word in wordDict:
      wordDict[word] += 1
    else:
      wordDict[word] = 1   
    if word in songDict:
      songDict[word] += 1
    else:
      songDict[word] = 1 
  dictOfSongDicts[df.at[i, 'track_name']] = songDict #putting each songdict into the dict of dicts
  totalWords[df.at[i, 'track_name']] = len(lyricsNoStopWords)

In [7]:
#Logging Creativity scores for each song
dictOfCreativityDicts = {} #dictionary where keys are song names and values is the dictionary containing its creativity scores
bestDBTotalCount = math.inf
worstDBTotalCount = 0

for i in range(df.shape[0]): #looping through songs
  creativityDict = {}
  currDict = dictOfSongDicts[df.at[i, 'track_name']]
  creativityDict['Repetitive Score'] = len(currDict)/totalWords[df.at[i, 'track_name']] #unique words in song over total words in the song

  databaseTotalCount = 0 #inter-song frequency of words in song
  for word in currDict.keys():  #iterating through unique words in each song
    databaseTotalCount += wordDict[word]
  avgNormFreq = databaseTotalCount / len(currDict)
  creativityDict['AVGDataBaseFrequency'] = avgNormFreq #avg frequency in the total database of words in song

  #setting best and worst values so we can scale later
  worstDBTotalCount = max(worstDBTotalCount,avgNormFreq)
  bestDBTotalCount = min(bestDBTotalCount,avgNormFreq)

  dictOfCreativityDicts[df.at[i, 'track_name']] = creativityDict #adding each songs creativity 

In [8]:
#Changing to a scale from 0 to 1 based on best and worst scores
for song in dictOfCreativityDicts.keys():
  score = dictOfCreativityDicts[song]['AVGDataBaseFrequency']
  dictOfCreativityDicts[song]['AVGDataBaseFrequency'] = 1 - (score - bestDBTotalCount) / (worstDBTotalCount - bestDBTotalCount)

In [9]:
#KMeans

x = df.iloc[:,8:28] #set features
kmeans = KMeans(10)
kmeans.fit(x)
identified_clusters = kmeans.fit_predict(x)
identified_clusters

array([8, 0, 7, ..., 4, 4, 4], dtype=int32)

In [10]:
#add the variance of distances for each song to dictionary
meansDict = {}
dist = abs(kmeans.transform(x))
for i in range(df.shape[0]):
  mean = sum(dist[i])/10
  arr = []
  for val in dist[i]:
    arr.append((val-mean)**2)
  v = sum(arr)/10
  meansDict[df.at[i,"track_name"]] = v

In [11]:
#get the bottom and top bounds
bot = math.inf
top = 0
for num in meansDict.values():
  top = max(top,num)
  bot = min(bot,num)
#convert on scale of 0 to 1
for song,num in meansDict.items():
  meansDict[song] = 1 - (num-bot) / (top-bot)


In [12]:
#add to large dictionary
for song, dictionary in dictOfCreativityDicts.items():
  if song in meansDict:
    dictionary["KmeansVariance"] = meansDict[song]
    dictOfCreativityDicts[song]=dictionary

In [13]:
#Creating an artist dict whose keys are artist names and values are all of their songs in the dataset
artistDict = {}
for i in range(df.shape[0]):
  artist = df.at[i, 'artist_name']
  song = df.at[i, 'track_name']
  if artist in artistDict.keys():
    artistDict[artist].append(song)
  else:
    artistDict[artist] = [song]

#eliminating artists with fewer than 3 songs
bigArtistDict = artistDict.copy()
for artist in bigArtistDict.keys():
  if len(artistDict[artist]) < 3:
    artistDict.pop(artist)

In [14]:
#Averaging all of an artists songs' creativity scores into scores for the artist
artistScoreDict = {}
for artist in artistDict.keys():
  avgFreq = 0
  avgNormFreq = 0
  uniqueTotalWords = 0
  for song in artistDict[artist]:
    avgFreq += dictOfCreativityDicts[song]['Repetitive Score']
    avgNormFreq += dictOfCreativityDicts[song]['AVGDataBaseFrequency']
    uniqueTotalWords += dictOfCreativityDicts[song]['KmeansVariance']
  avgFreq = avgFreq / len(artistDict[artist])
  avgNormFreq = avgNormFreq / len(artistDict[artist])
  uniqueTotalWords = uniqueTotalWords / len(artistDict[artist])

  #Creating a dictionary of all of an artist's scores and then putting the dictionary into the artistScoreDict
  artistScore = {'Repetitive Score':avgFreq, 'Unique language Score':avgNormFreq, 'Genre Nonconformity Score':uniqueTotalWords}
  artistScoreDict[artist] = artistScore

In [15]:
#Creating a dictionary of where the keys are genre names and the values are lists of all the songs in that genre
genreDict = {}
for i in range(df.shape[0]):
  genre = df.at[i, 'genre']
  song = df.at[i, 'track_name']
  if artist in genreDict.keys():
    genreDict[genre].append(song)
  else:
    genreDict[genre] = [song]

#Creating a dictionary of the avg creativity scores for each genre
genreScoreDict = {}
for genre in genreDict.keys():
  avgFreq = 0
  avgNormFreq = 0
  uniqueTotalWords = 0
  for song in genreDict[genre]:
    avgFreq += dictOfCreativityDicts[song]['Repetitive Score']
    avgNormFreq += dictOfCreativityDicts[song]['AVGDataBaseFrequency']
    uniqueTotalWords += dictOfCreativityDicts[song]['KmeansVariance']

  #averaging them across the number of songs in the genre
  avgFreq = avgFreq / len(genreDict[genre])
  avgNormFreq = avgNormFreq / len(genreDict[genre])
  uniqueTotalWords = uniqueTotalWords / len(genreDict[genre])

  #creating a dictionary which contains the corresponding creativity scores for each genre
  genreScore = {'Repetitive Score':avgFreq, 'Unique language Score':avgNormFreq, 'Genre Nonconformity Score':uniqueTotalWords}
  genreScoreDict[genre] = genreScore

#creating a list which contains each genre and its composite lyrical creativity score
genres = []
for key, val in genreScoreDict.items():
  genres.append((key, val["Repetitive Score"]+val["Unique language Score"]+val["Genre Nonconformity Score"]))

#sorting the genres from least creative to most creative
genres.sort(key = lambda x:x[1])
print(genres)

[('pop', 1.4599532172283922), ('blues', 1.8812985414013035), ('reggae', 1.9118545309113335), ('hip hop', 1.9457905753074223), ('rock', 1.9712115555636442), ('jazz', 2.0923667409922198), ('country', 2.0983864401016064)]


In [16]:
#Creating a list of all the artists with more than 3 songs in the database and their composite lyrical creativity score
artists = []
for key, val in artistScoreDict.items():
  artists.append((key, val["Repetitive Score"]+val["Unique language Score"]+val["Genre Nonconformity Score"]))

#sorting the artists and then printing out the top five least creative and then the top 5 most creative
artists.sort(key = lambda x:x[1])
print(artists[:5])
print(artists[-5:])

[('mindless self indulgence', 1.2741712059199497), ('*nsync', 1.3462120518818417), ('galantis', 1.406688845563098), ('natasha bedingfield', 1.4552404246331498), ('will.i.am', 1.4573645489810807)]
[('massacre', 2.4066903773230885), ('lincoln durham', 2.427132666758239), ('typical cats', 2.4350062052287362), ('ulver', 2.4487068091278132), ('deca', 2.5029758770611408)]


In [18]:
print("type in a song to see it's creativity score")
print("note: type all song names in lowercase")
currsong = input()
if currsong not in dictOfCreativityDicts:
  print("\nSorry: this song is not in our set")
else:
  print("\nBaker-Toth Lyrical Creativity Index Composite: " + str((dictOfCreativityDicts[currsong]['Repetitive Score'] + dictOfCreativityDicts[currsong]['AVGDataBaseFrequency'] + dictOfCreativityDicts[currsong]['KmeansVariance'])/3))
  print("")
  print("Non Repetitiveness: " + str(dictOfCreativityDicts[currsong]["Repetitive Score"]))
  print("")
  print("Uniqueness: " + str(dictOfCreativityDicts[currsong]["AVGDataBaseFrequency"]))
  print("")
  print("Non similarity to other songs: " + str(dictOfCreativityDicts[currsong]["KmeansVariance"]))
  print("")
  print("Most creative artists: ")
  print([x[0] for x in artists[-25:]])
  print("")
  print("Least creative artists:")
  print([x[0] for x in artists[:25]])
  print("")
  print("Genres ranked by creativity index (ascending order))")
  print([x[0] for x in genres])

type in a song to see it's creativity score
note: type all song names in lowercase
heartache medication

Baker-Toth Lyrical Creativity Index Composite: 0.5146344976092134

Non Repetitiveness: 0.4368932038834951

Uniqueness: 0.8513203432139433

Non similarity to other songs: 0.25568994573020165

Most creative artists: 
['blockhead', 'shane & shane', 'wu-tang clan', 'meshuggah', 'killah priest', 'll cool j', 'timbaland & magoo', 'joe henderson', 'mr. bungle', 'tinariwen', 'gorguts', 'louis the child', 'the uniques', 'pestilence', 'the walters', 'steps ahead', 'the mars volta', 'lee morgan', 'alpha & omega', 'boozoo bajou', 'massacre', 'lincoln durham', 'typical cats', 'ulver', 'deca']

Least creative artists:
['mindless self indulgence', '*nsync', 'galantis', 'natasha bedingfield', 'will.i.am', 'bruno mars', 'selena gomez', 'big brother & the holding company', 'bushman', 'giant panda guerilla dub squad', 'ingrid michaelson', 'collie buddz', 'eurythmics', 'grouplove', 'pat travers', 'jenn