In [None]:
import operator
import re
import string
from abc import ABC, abstractmethod
from collections import Counter
from functools import partial

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

from subsample_trainer import SGDSubsampleClassifier
from subsample_trainer import CNBSubsampleClassifier
from subsample_trainer import CNBTwoStepClassifier
from subsample_trainer import SGDTwoStepClassifier
from subsample_trainer import RFTwoStepClassifier

# Notebook-wide switch: set to True to see progress messages from the cells below.
DEBUG = False


def debug_msg(message):
    """Print `message` only while the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(message)

# Creating data CSVs

Multiple CSV files were created, each parsing the data in a different way, to save time on future calls.  This code is included merely for completeness, and because the resulting CSVs were too large to include in the GitHub repository.

## Scraping lyrics

The top10s file had a large amount of interesting data in terms of genre, popularity, beats per minute, etc.; however, it didn't have the actual lyrics of the songs.  To get around this, we scraped the lyrics from metrolyrics.com, as did the creators of the other dataset.  We also condensed the genres to ensure each genre included more than one song.

In [None]:
def findw(w1, w2):
    """Tell whether `w2` is one of the pieces of `w1`.

    `w1` is cut on every run of non-word characters; the result is True
    only if some resulting piece equals `w2` exactly (substrings of a
    piece do not count).
    """
    pieces = re.compile(r"\W+").split(w1)
    return any(piece == w2 for piece in pieces)

def group_genre(top_genre):
    """Collapse a fine-grained genre label into a coarse genre group.

    The rules are tried in order and the first one whose keywords are all
    present as whole words in `top_genre` (per `findw`) decides the group.
    When no rule applies, the label is returned with surrounding
    whitespace stripped.
    """
    # (keywords that must all be present, resulting group) -- order matters,
    # earlier rules shadow later ones.
    ordered_rules = (
        (('hip',), 'rap'),
        (('pop',), 'pop'),
        (('soul',), 'r&b'),
        (('r', 'b'), 'r&b'),
        (('boy',), 'boy band'),
        (('rap',), 'rap'),
        (('rock',), 'pop'),
        (('room',), 'house'),
        (('house',), 'house'),
        (('metropopolis',), 'pop'),
        (('indie',), 'pop'),
        (('singer',), 'r&b'),
        (('techno',), 'electronic'),
        (('edm',), 'electronic'),
        (('complextro',), 'electronic'),
        (('electro',), 'electronic'),
        (('electropop',), 'electronic'),
        (('wave',), 'alternative'),
        (('brostep',), 'electronic'),
        (('dubstep',), 'electronic'),
        (('dance',), 'pop'),
        (('latin',), 'latin'),
        (('hollywood',), 'pop'),
        (('electronic', 'trap'), 'house'),
        (('country',), 'pop'),
    )
    padded = ' ' + top_genre + ' '
    for keywords, group in ordered_rules:
        if all(findw(padded, keyword) for keyword in keywords):
            return group
    return padded.strip()

def infer_metrolyrics_url(title, artist):
    """Guess the metrolyrics.com page URL for a song.

    Both `title` and `artist` are lower-cased and split on non-word
    characters; empty pieces and captured parenthesised groups are
    discarded. The URL has the form
    ``<title-words>-lyrics-<artist-words>.html`` with words joined by '-'.
    """
    splitter = re.compile(r"(\(.*\))*\W")

    def slug_words(text):
        # The capture group makes re.split also return None / '(...)' items;
        # keep only real, non-empty words.
        return [piece for piece in splitter.split(text.lower())
                if piece and not piece.startswith('(')]

    slug = '-'.join(slug_words(title) + ['lyrics'] + slug_words(artist))
    return 'https://www.metrolyrics.com/' + slug + '.html'
    

# Scrape the lyrics of every song listed in top10s.csv and store them, together
# with the song's metadata and condensed genre group, in lyrics_spotify.csv.
# Songs whose page cannot be fetched or has no lyrics body are skipped.

# Upper bound (seconds) for a single HTTP request; without it one stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

song_dataset = pd.read_csv('top10s.csv')
lyrics_dataset = {'Song': [],
                  'Artist': [],
                  'Year': [],
                  'Genre-group': [],
                  'Popularity': [],
                  'Lyrics': []}

counter = 0         # number of songs whose lyrics were found
failed_requests = 0  # number of songs skipped because the request itself failed
total_songs = len(song_dataset['title'])
song_rows = zip(song_dataset['title'],
                song_dataset['artist'],
                song_dataset['top genre'],
                song_dataset['year'],
                song_dataset['pop'])
for index, (title, artist, top_genre, year, popularity) in enumerate(song_rows):
    if index % 100 == 0:
        debug_msg('Reading song #'+str(index)+'/'+str(total_songs)+': '+str(counter)+' successful')
    URL = infer_metrolyrics_url(title, artist)
    try:
        page = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as err:
        # A network error on one song must not discard everything scraped so far.
        failed_requests += 1
        debug_msg('Request failed for '+URL+': '+str(err))
        continue
    soup = BeautifulSoup(page.content, 'html.parser')
    lyricbody = soup.find(id='lyrics-body')
    if lyricbody is None: # Means that it pulled up a 404 page most likely
        continue
    verses_iterable = lyricbody.find_all('p', class_='verse')
    # Each verse is followed by a single space (trailing space included).
    lyrics = ''.join(verse.text.strip() + ' ' for verse in verses_iterable)
    lyrics_dataset['Song'].append(title)
    lyrics_dataset['Artist'].append(artist)
    lyrics_dataset['Year'].append(year)
    lyrics_dataset['Genre-group'].append(group_genre(top_genre))
    lyrics_dataset['Popularity'].append(popularity)
    lyrics_dataset['Lyrics'].append(lyrics)
    counter += 1

debug_msg('Processing done! '+str(failed_requests)+' requests failed. Saving data...')
df = pd.DataFrame(lyrics_dataset)
df.to_csv('lyrics_spotify.csv', encoding='utf-8')
debug_msg('Done!')

## Fitting the lyrics dataset for our purposes

In [None]:
# Keep only the rows of lyrics_spotify.csv whose lyrics were read back as text
# and write lyrics / genre group / popularity to dirty_lyrics_genre_popularity.csv.
data = pd.read_csv("lyrics_spotify.csv")

lyrics_column = data['Lyrics']
genre_column = data['Genre-group']
popularity_column = data['Popularity']
song_total = len(lyrics_column)

dirty_dict = {"lyrics" : [], "genre" : [], "popularity" : []}
# Basic cleaning of improper imports: rows whose lyrics are not a string are dropped
for index in range(song_total):
    if index % 100 == 0:
        debug_msg('Checking song #'+str(index)+'/'+str(song_total))
    song_lyrics = lyrics_column[index]
    if not isinstance(song_lyrics, str):
        continue
    dirty_dict['lyrics'].append(song_lyrics)
    dirty_dict['genre'].append(genre_column[index])
    dirty_dict['popularity'].append(popularity_column[index])

df_dirty = pd.DataFrame(dirty_dict)

df_dirty.to_csv('dirty_lyrics_genre_popularity.csv', encoding='utf-8')

## Cleaning stopwords and punctuation from lyrics

In [None]:
def fix_unnecessary_tokenization(wordlist):
    """Glue split contractions back together.

    Scans `wordlist` left to right and replaces the adjacent pairs
    ('got', 'ta'), ('gon', 'na') and ('wan', 'na') with 'gotta', 'gonna'
    and 'wanna' respectively. All other tokens are copied unchanged.
    Returns a new list; the input is not modified.
    """
    merges = (('got', 'ta', 'gotta'),
              ('gon', 'na', 'gonna'),
              ('wan', 'na', 'wanna'))
    merged = []
    total = len(wordlist)
    pos = 0
    while pos < total:
        replacement = None
        if pos + 1 < total:
            # Only look for a pair when a following token exists.
            for first, second, joined in merges:
                if wordlist[pos] == first and wordlist[pos + 1] == second:
                    replacement = joined
                    break
        if replacement is None:
            merged.append(wordlist[pos])
            pos += 1
        else:
            merged.append(replacement)
            pos += 2
    return merged


# Build cleaned_scraped_lyrics_genre.csv from lyrics_spotify.csv:
#   1. strip punctuation, tokenize and lower-case every song's lyrics,
#   2. merge split contractions, then drop stop-words and filler words,
#   3. drop every word that does not reach MIN_GENRE_WORD_COUNT occurrences
#      in at least one genre,
#   4. re-join the surviving tokens into one string per song.

# A word is kept only if some single genre uses it at least this many times.
MIN_GENRE_WORD_COUNT = 10

data = pd.read_csv("lyrics_spotify.csv")
columns = data.columns
stop_words = set(stopwords.words('english') + list(string.punctuation))
stop_words_ext = set(stopwords.words('english') + list(string.punctuation) + ['yeah', 'oh', 'ohh', 'woah', 'la', 'na'])
ps = PorterStemmer()
punctuation_remover = str.maketrans('', '', string.punctuation)

# Per-song outputs (only songs whose lyrics are a string are included)
lyrics_tokens = []
genre_targets = []
popularity_targets = []

genre_counts = {}   # genre -> number of rows with that genre (with or without lyrics)
genre_words = {}    # genre -> every kept token of every song of that genre
all_words = []

# Break lyrics up into tokens and get genre counts
song_total = len(data['Lyrics'])
song_rows = zip(data['Lyrics'], data['Genre-group'], data['Popularity'])
for index, (raw_lyrics, genre, popularity) in enumerate(song_rows):
    if index % 100 == 0:
        debug_msg('Tokenizing song #'+str(index)+'/'+str(song_total))
    genre_counts[genre] = genre_counts.get(genre, 0) + 1
    genre_words.setdefault(genre, [])
    if not isinstance(raw_lyrics, str):
        continue
    without_punctuation = raw_lyrics.translate(punctuation_remover) # remove punctuation
    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so capitalised stop-words ("The", "Yeah") would otherwise slip through.
    lowered_tokens = [w.lower() for w in word_tokenize(without_punctuation)]
    # Merge 'gon'+'na' / 'wan'+'na' / 'got'+'ta' BEFORE stop-word removal:
    # 'na' is itself in stop_words_ext and would be dropped first, leaving
    # a dangling 'gon' / 'wan' that can no longer be merged.
    merged_tokens = fix_unnecessary_tokenization(lowered_tokens)
    lyrics_tokenized = [w for w in merged_tokens if w not in stop_words_ext] # clean stopwords
    lyrics_tokens.append(lyrics_tokenized) # Get song data per song
    genre_words[genre] += lyrics_tokenized
    all_words += lyrics_tokenized
    genre_targets.append(genre) # Get targets
    popularity_targets.append(popularity)
unique_words = list(dict.fromkeys(all_words))
debug_msg(len(lyrics_tokens))

# Treat words that never reach MIN_GENRE_WORD_COUNT occurrences within any
# single genre as unknown words (disregard). Counting once per genre with
# Counter replaces a list.count()/list rebuild per unique word.
debug_msg('Removing outlier words')
genre_word_counts = {g: Counter(words) for g, words in genre_words.items()}
known_word_set = {w for w in unique_words
                  if any(counts[w] >= MIN_GENRE_WORD_COUNT
                         for counts in genre_word_counts.values())}
known_words = [w for w in unique_words if w in known_word_set]  # first-seen order
genre_words = {g: [w for w in words if w in known_word_set]
               for g, words in genre_words.items()}
lyrics_tokens = [[w for w in tokens if w in known_word_set]
                 for tokens in lyrics_tokens]
debug_msg(len(lyrics_tokens))

# Re-concatenate into cleaned lyrics for sklearn (every token followed by a space)
lyrics = [''.join(w + ' ' for w in tokens) for tokens in lyrics_tokens]
debug_msg(len(lyrics_tokens))

total_data = {'lyrics' : lyrics, 'genre' : genre_targets, 'popularity' : popularity_targets}
df = pd.DataFrame(total_data)
df.to_csv('cleaned_scraped_lyrics_genre.csv', encoding='utf-8')

## Creating trigrams from the raw lyrics

In [None]:
# Build trigram_lyrics.csv: each song's lyrics become a space-separated list of
# "trigram words", where a trigram word is three consecutive tokens glued
# together with SEP.
data = pd.read_csv("lyrics_spotify.csv")
columns = data.columns
stop_words = set(stopwords.words('english') + list(string.punctuation))
stop_words_ext = set(stopwords.words('english') + list(string.punctuation) + ['yeah', 'oh', 'ohh', 'woah', 'la', 'na'])
ps = PorterStemmer()
# Create dummy lists to populate with values
# lyrics_tokens = [None]*len(data['Lyrics'])

song_dict = {'lyrics' : [], 'genre' : [], 'popularity' : []}

# Separator placed between the three tokens of one trigram (empty: tokens are fused)
SEP = ''

punctuation_marks = list(string.punctuation)
punctuation_remover = str.maketrans('', '', string.punctuation)
song_total = len(data['Lyrics'])

# Break lyrics up into tokens
for index in range(song_total):
    if index % 100 == 0:
        debug_msg('Tokenizing song #'+str(index)+'/'+str(song_total))
    raw_lyrics = data['Lyrics'][index]
    if not isinstance(raw_lyrics, str):
        continue
    lyrics = raw_lyrics.translate(punctuation_remover)
    tokens = fix_unnecessary_tokenization(
        [w.lower() for w in word_tokenize(lyrics) if w not in punctuation_marks])
    # One trigram per starting position, each followed by a single space.
    new_lyrics = ''.join(SEP.join(tokens[i:i+3]) + ' ' for i in range(len(tokens)-2))
    song_dict['lyrics'].append(new_lyrics)
    song_dict['genre'].append(data['Genre-group'][index])
    song_dict['popularity'].append(data['Popularity'][index])

debug_msg('Processing done! Saving data...')
df = pd.DataFrame(song_dict)
df.to_csv('trigram_lyrics.csv', encoding='utf-8')
debug_msg('Done!')


# Classifying Genre on Lyrics

The classifiers are stored as (class, argument) tuples, keyed by a short name, to allow for multiple instantiations on new data.  Unfortunately, this has the downside of not allowing the automation of more specific calls, such as calling the RandomForestClassifier with a limited max_depth; however, these calls were done externally and were found to not improve the accuracy.

In [None]:
# Registry of the classifiers under comparison: short name -> (factory, arg).
# The consumer instantiates each entry as factory(arg), or factory() when arg
# is None, so a fresh estimator can be built for every train/test split.
#
# ComplementNB's `alpha` is keyword-only in current scikit-learn, so it cannot
# be supplied through the single positional `arg`; it is bound by keyword with
# functools.partial instead and the positional slot is left as None.
clf_arg = {'CNB' : (partial(ComplementNB, alpha=1e-05), None),
           'SVM' : (SGDClassifier, None),
           'RF' : (RandomForestClassifier, 50),      # 50 -> first positional parameter (n_estimators)
           'KNN' : (KNeighborsClassifier, 1),        # 1 -> first positional parameter (n_neighbors)
           'SGDSC' : (SGDSubsampleClassifier, None),
           'CNBSC' : (CNBSubsampleClassifier, None),
           'CNB2C' : (CNBTwoStepClassifier, 0.001),  # NOTE(review): meaning of 0.001 is defined in subsample_trainer -- confirm
           'SVM2C' : (SGDTwoStepClassifier, 0.001),  # NOTE(review): meaning of 0.001 is defined in subsample_trainer -- confirm
           'RF2C' : (RFTwoStepClassifier, None)}

In [None]:
def _score_or_nan(pipeline, X, y):
    """Return pipeline.score(X, y), or NaN when the evaluation subset is empty."""
    if len(X) == 0:
        return np.nan
    return pipeline.score(X, y)


def avg_res(df, bigram=True, num_iters=10, random_state=None):
    """Average train/test accuracy of every classifier in `clf_arg`.

    For each of `num_iters` random 70/30 splits, every classifier is fitted
    on the whole training split inside a CountVectorizer -> TfidfTransformer
    pipeline and scored on six subsets: non-pop ('n-p'), pop-only ('pop') and
    all songs ('all'), each for the train and the test split.

    Parameters
    ----------
    df : DataFrame with a 'lyrics' (text) and a 'genre' (label) column.
    bigram : if True the vectorizer uses word bigrams only, otherwise
        single words.
    num_iters : number of random splits to average over.
    random_state : optional integer seed; split i uses random_state + i so a
        run is reproducible while the splits still differ. None keeps the
        splits unseeded.

    Returns
    -------
    dict of four equally long lists ('which genres', 'train or test',
    'classifier', 'score'), one entry per subset/split/classifier
    combination. A score is NaN only if that subset was empty in every split.
    """
    # Empty lyrics are written to CSV as empty fields and read back as NaN,
    # which CountVectorizer rejects; turn them back into empty documents.
    lyrics = df['lyrics'].fillna('')
    genres = df['genre']
    ngram_range = (2, 2) if bigram else (1, 1)

    subsets = ('n-p', 'pop', 'all')
    phases = ('train', 'test')
    # Tuple keys instead of space-delimited strings: nothing has to be parsed
    # back afterwards and classifier names may contain any characters.
    res = {(subset, phase, name): []
           for name in clf_arg
           for subset in subsets
           for phase in phases}

    for iteration in range(num_iters):
        debug_msg('Iteration '+str(iteration)+'/'+str(num_iters))
        seed = None if random_state is None else random_state + iteration
        X_train, X_test, y_train, y_test = train_test_split(
            lyrics, genres, test_size=0.3, random_state=seed)

        # Boolean masks cannot fail on an empty subset the way unpacking
        # zip(*[]) does when a split happens to contain no (non-)pop song.
        pop_train = (y_train == 'pop')
        pop_test = (y_test == 'pop')
        eval_sets = {
            ('n-p', 'train'): (X_train[~pop_train], y_train[~pop_train]),
            ('n-p', 'test'): (X_test[~pop_test], y_test[~pop_test]),
            ('pop', 'train'): (X_train[pop_train], y_train[pop_train]),
            ('pop', 'test'): (X_test[pop_test], y_test[pop_test]),
            ('all', 'train'): (X_train, y_train),
            ('all', 'test'): (X_test, y_test),
        }

        for name, (clf, arg) in clf_arg.items():
            debug_msg(name)
            clf_init = clf(arg) if arg is not None else clf()
            pipeline = Pipeline([
                ('vect', CountVectorizer(ngram_range=ngram_range)),
                ('tfidf', TfidfTransformer(use_idf=True)),
                ('clf', clf_init),
                ])
            pipeline.fit(X_train, y_train)
            for (subset, phase), (X_eval, y_eval) in eval_sets.items():
                res[(subset, phase, name)].append(_score_or_nan(pipeline, X_eval, y_eval))

    avg_scores = {
        'which genres' : [],
        'train or test' : [],
        'classifier' : [],
        'score' : []
    }
    for (subset, phase, name), values in res.items():
        # Ignore splits in which this subset was empty.
        measured = [v for v in values if not np.isnan(v)]
        avg_scores['which genres'].append(subset)
        avg_scores['train or test'].append(phase)
        avg_scores['classifier'].append(name)
        avg_scores['score'].append(np.mean(measured) if measured else np.nan)

    return avg_scores

In [None]:
# Dataset label -> CSV file evaluated by the genre-classification loop.
# The trigram CSV ('TRIGRAMS' : 'trigram_lyrics.csv') is deliberately not
# listed here.
data_options = dict([
    ('RAW LYRICS', 'dirty_lyrics_genre_popularity.csv'),
    ('CLEANED LYRICS', 'cleaned_scraped_lyrics_genre.csv'),
])

In [None]:
# Evaluate every classifier on each dataset listed in data_options, keeping
# one result table per dataset (same order as data_options) in `scores`.
scores = []
for dataset, path in data_options.items():
    df = pd.read_csv(path)

    avg_scores = pd.DataFrame(avg_res(df))
    scores.append(avg_scores)
    print('>>> ' + dataset)
    print(avg_scores)

In [None]:
# Write each result table to '<dataset label>_genre_predictions.csv'.
for dataset, df in zip(data_options, scores):
    output_path = dataset+'_genre_predictions.csv'
    df.to_csv(output_path, encoding='utf-8')

In [None]:
# The trigram CSV already holds fused three-token "words", so it is scored
# with single-word features (bigram=False) and saved on its own.
trigram_df = pd.read_csv('trigram_lyrics.csv')
avg_scores = pd.DataFrame(avg_res(trigram_df, bigram=False))
avg_scores.to_csv('TRIGRAMS_genre_predictions.csv', encoding='utf-8')