In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

The cleaning process for this model has several stages, which together produce the data used to build the genre guesser for the web app. Step one is to drop the junk columns and remove rows with blank genres or lyrics.

In [2]:
#Load the raw scraped data.
#NOTE(review): this is a machine-specific absolute path; the notebook only runs where the file exists at this location
raw_data_path = 'C:/Users/trist/OneDrive/Desktop/NLPify_Raw_Data.csv'
df = pd.read_csv(raw_data_path)

In [3]:
#Preview the first rows of the raw data to see which columns are present
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,track,genres,lyrics
0,0,0,Montgomery Gentry,Lucky Man,"['contemporary country', 'country', 'country r...",[Verse 1]\nI have days where I hate my job\nTh...
1,1,1,Pretty Ricky,On The Hotline,"['dance pop', 'dirty south rap', 'hip hop', 'h...",[Hook: Pleasure]\nIt's five in the mornin'\nAn...
2,2,2,Billy Currington,I Got A Feelin',"['contemporary country', 'country', 'country r...",[Verse 1]\nI don't want to rush this thing\nI ...
3,3,3,Alan Jackson,Where I Come From,"['contemporary country', 'country', 'country r...",[Verse 1]\nWell I was rolling wheels and shift...
4,4,4,Tim McGraw,Nothin' To Die For,"['contemporary country', 'country', 'country r...",[Verse 1]\nStopped to have a few at five now y...


In [4]:
#Keep only the four columns used downstream; the "Unnamed: ..." columns seen in the preview are dropped
keep_columns = ['artist','track','genres','lyrics']
df = df[keep_columns]

In [5]:
#Check shape (rows, columns) before the quick cleaning process, for comparison with the shape afterwards
df.shape

(250, 4)

In [6]:
#Drop songs whose genre field is the empty-list string "[]"
has_genres = df['genres'] != "[]"
df = df[has_genres]

In [7]:
#Remove songs with missing lyrics.
#NaN never compares equal to anything (itself included), so `!= np.nan` is True for
#every row and filters nothing; notna() is the correct missing-value test.
df = df[df["lyrics"].notna()]

In [8]:
#Check shape (rows, columns) after the quick cleaning process to see how many rows were removed
df.shape

(244, 4)

Next, clean the genres. By the end of this section, each song's genre string (which is parsed into a list) is reduced to a single "outcome" genre.

In [9]:
#Some genre fields repeat an entry (e.g. hip hop twice), so show each row's genres as a de-duplicated set
for genre_string in df['genres']:

    #Pull every single-quoted name out of the stringified list, then collapse repeats with set()
    print(set(re.findall("'(.*?)'", genre_string)))

{'modern country rock', 'country', 'redneck', 'contemporary country', 'country road'}
{'pop r&b', 'dance pop', 'pop rap', 'hip hop', 'rap', 'urban contemporary', 'southern hip hop', 'dirty south rap', 'r&b', 'trap', 'miami hip hop', 'hip pop'}
{'country road', 'country', 'contemporary country'}
{'country road', 'country', 'contemporary country'}
{'country road', 'country', 'contemporary country'}
{'country road', 'country', 'contemporary country'}
{'country road', 'modern country rock', 'country', 'contemporary country'}
{'modern country rock', 'country', 'redneck', 'contemporary country', 'country road'}
{'post-teen pop', 'modern rock', 'pop rock', 'pop punk', 'neo mellow'}
{'east coast hip hop', 'pop rap', 'new jack swing', 'hip hop', 'rap', 'hardcore hip hop', 'urban contemporary', 'southern hip hop', 'neo soul', 'r&b', 'gangster rap', 'bronx hip hop', 'hip pop', 'quiet storm'}
{'east coast hip hop', 'pop rap', 'hip hop', 'rap', 'hardcore hip hop', 'new orleans rap', 'trap', 'gangst

In [10]:
#One pass over the genre strings builds two things:
#  genre_word_dict        - how many times each genre word appears across all songs
#  feed_for_genre_outcome - for each song, the most frequent word among its genres
genre_word_dict = {}

feed_for_genre_outcome = []

#Iterate through each row in the df to create the dictionary of word frequencies
for row in df['genres']:

    #Delete the list punctuation (brackets, quotes, commas) so only the genre words remain.
    #Double quotes are stripped too: a genre containing an apostrophe is written with
    #double quotes in the stringified list and would otherwise leave tokens like '"word'.
    raw_words = (
        row.replace("[","")
        .replace("]","")
        .replace("'","")
        .replace('"',"")
        .replace(",","")
    )

    #Treat "hip hop" as a single token so it is counted as one genre word, not as "hip" and "hop"
    raw_words = raw_words.replace("hip hop","hip-hop")

    #Split up genres by word so we can generalize them later (raw string: "\s" is not a valid str escape)
    individual_words = re.split(r"\s+",raw_words)

    #Count the words once per row instead of calling list.count for every candidate
    row_word_counts = Counter(individual_words)

    #Outcome = most common word in the row. most_common breaks ties by first appearance,
    #so the label is reproducible; max() over a set depended on set iteration order.
    feed_for_genre_outcome.append(row_word_counts.most_common(1)[0][0])

    #Fold this row's counts into the dataset-wide totals
    for word, count in row_word_counts.items():
        genre_word_dict[word] = genre_word_dict.get(word, 0) + count

#Attach the outcome labels and keep only the modelling columns.
#assign() returns a new frame, which avoids writing a column into a filtered slice.
df = df.assign(outcome_genre=feed_for_genre_outcome).loc[:,["artist","track","lyrics","outcome_genre"]]

In [11]:
#Genre data should be done at this point; preview the new df, which now has outcome_genre instead of genres
df.head()

Unnamed: 0,artist,track,lyrics,outcome_genre
0,Montgomery Gentry,Lucky Man,[Verse 1]\nI have days where I hate my job\nTh...,country
1,Pretty Ricky,On The Hotline,[Hook: Pleasure]\nIt's five in the mornin'\nAn...,pop
2,Billy Currington,I Got A Feelin',[Verse 1]\nI don't want to rush this thing\nI ...,country
3,Alan Jackson,Where I Come From,[Verse 1]\nWell I was rolling wheels and shift...,country
4,Tim McGraw,Nothin' To Die For,[Verse 1]\nStopped to have a few at five now y...,country


Begin the lyric cleaning process; strip the lyrics of brackets and then process line breaks.

In [12]:
#Examine a sample of lyrics to find what must be stripped from the string.
#Use positional .iloc[0] rather than the label 0: rows were filtered earlier without
#resetting the index, so label 0 only exists if the first raw row survived the filters.
df['lyrics'].iloc[0]

#Results show that we need to cut the bracketed section markers (e.g. [Verse 1]), and the EmbedShare URLCopyEmbedCopy at the end of the string

"[Verse 1]\nI have days where I hate my job\nThis little town and the whole world too\nAnd last Sunday when my Bengals lost\nLord, it put me in a bad mood\n\n[Verse 2]\nI have moments when I curse the rain\nBut then complain when the sun's too hot\nI look around at what everyone has\nAnd I forget about all I've got\n\n[Chorus 1]\nBut I know I'm a lucky man\nGod's given me a pretty fair hand\nGot a house and piece of land\nA few dollars in a coffee can\n\n[Chorus 2]\nMy old truck's still runnin' good\nMy ticker's tickin' like they say it should\nI've got supper in the oven, a good woman's lovin'\nAnd one more day to be my little kid's dad\nLord knows I'm a lucky man\n\n[Verse 3]\nGot some friends who would be here fast\nI could call 'em any time of day\nGot a brother who's got my back\nGot a mama who I swear's a saint\n[Verse 4]\nGot a brand new rod and reel\nGot a full week off this year\nDad had a close call last spring\nIt's a miracle he's still here\n\n[Chorus 1]\nBut I know I'm a l

In [19]:
# Strip the embed/share residue and the bracketed section markers from every song
def _tidy_lyrics(raw_text):
    """Return one song's lyrics as a single lower-cased line with bracketed tags removed."""
    #Coerce to str first so non-string cells do not break the string methods below
    text = str(raw_text).replace("EmbedShare URLCopyEmbedCopy","")

    #Drop each [ ... ] expression (non-greedy, so one tag at a time)
    text = re.sub(r"[\[].*?[\]]", "", text)

    #Collapse every run of line breaks into a single space
    text = re.sub(r"\n+", " ", text)

    return text.lower()

#Cleaned lyrics, one entry per song, in the same order as the dataframe rows
new_lyrics = [_tidy_lyrics(song) for song in df['lyrics']]

#Replace the lyrics column with the cleaned lyrics
df['lyrics'] = new_lyrics

In [23]:
#As we can see, the genre method at hand right now does not quite work as intended:
#several labels in the counts (e.g. band, wave, coast, big, dawn) are word fragments rather than genres.
df['outcome_genre'].value_counts()

pop            84
country        69
rap            25
metal          15
soul            9
hip-hop         9
rock            8
punk            5
band            3
wave            1
house           1
latino          1
latin           1
coast           1
salsa           1
party           1
texas           1
big             1
vocal           1
post-grunge     1
storm           1
fusion          1
electro         1
dance           1
dawn            1
blues           1
Name: outcome_genre, dtype: int64