In [1]:
import pandas as pd
import numpy as np
from functions import *

In [2]:
# Load the lyrics table from CSV, using the file's first column as the row index.
# NOTE(review): the file name suggests spaCy-processed text — confirm upstream.
df = pd.read_csv('df_w_spacy.csv', index_col = 0)

In [6]:
# Inspect the class balance: number of rows (songs) per genre, before any
# character filtering or downsampling is applied.
df['genre'].value_counts()

Pop/R&B         160407
Rock/Metal      116089
Hip-Hop/Rap      77605
Country/Folk     47716
Name: genre, dtype: int64

In [7]:
def get_corpus(column):
    """Concatenate every entry of a text column into one corpus string.

    Parameters
    ----------
    column : pandas.Series of str
        One document (e.g. one song's lyrics) per element.

    Returns
    -------
    str
        All elements joined in order, separated by a single space. An empty
        column yields an empty string.

    Raises
    ------
    TypeError
        If any element is not a string (for example a NaN from a missing
        value), because ``str.join`` only accepts strings.
    """
    # A single join is all that is needed: splitting the joined text on spaces
    # and re-joining it would reproduce the same string while materialising
    # every token of the whole corpus in memory.
    return ' '.join(column.to_list())

In [8]:
# Join the 'clean_lyrics' of every song into one string; it is used below to
# find out which characters occur in the data.
full_corpus = get_corpus(df['clean_lyrics'])

In [10]:
# Collect the distinct characters that occur anywhere in the corpus. Many lyrics
# contain stray symbols we do not want the model to learn, so this inventory is
# the starting point for filtering them out.
vocab = list(set(full_corpus))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

6843 unique characters


In [15]:
# Whitelist of characters allowed in the corpus: newline, space and the 26
# lowercase ASCII letters. It is used to find songs made only of these characters.
good = list('\n abcdefghijklmnopqrstuvwxyz')

In [16]:
# Start with an empty list of unwanted characters; the next cell appends to it.
# Re-run this cell before re-running that one, otherwise entries are duplicated.
bad = []

In [17]:
# Every character seen in the corpus that is not whitelisted in `good` is
# unwanted; add them all to `bad` in vocabulary order.
bad.extend(ch for ch in vocab if ch not in good)

In [18]:
# Flag each song whose lyrics contain at least one character outside the `good`
# whitelist: 'pass' is True for songs to drop and False for songs to keep.
# The check is made against the whitelist itself rather than the `bad` list,
# because `bad` was derived from the 'clean_lyrics' corpus and can miss
# characters that only occur in 'better_lyrics'. A set test is also a single
# pass over each song instead of one substring search per unwanted character.
allowed_chars = set(good)
df['pass'] = df['better_lyrics'].apply(lambda x: not allowed_chars.issuperset(x))

In [19]:
# Keep only the songs that were not flagged, i.e. whose 'pass' value is False.
df = df.loc[df['pass'].eq(False)]

In [21]:
# Rows and columns left after dropping the songs flagged in 'pass'.
df.shape

(348834, 15)

## Downsample
#### We found out early on that using too many songs really slows down our RNN, so we will limit each genre's corpus to a maximum of 25,000 songs. This will give us enough time to create a neural net and then tune its parameters.

In [25]:
# Split the songs into one dataframe per genre so that each genre can be
# downsampled on its own (to 25,000 songs each).
Country_df = df.loc[df['genre'].eq('Country/Folk')]
Pop_df = df.loc[df['genre'].eq('Pop/R&B')]
Rock_df = df.loc[df['genre'].eq('Rock/Metal')]
Rap_df = df.loc[df['genre'].eq('Hip-Hop/Rap')]

In [28]:
# Downsample the pop genre to a fixed 25,000 songs: replace=False asks for
# sampling without replacement and random_state pins the seed for repeatability.
# NOTE(review): `resample` is not imported explicitly in this notebook; it
# presumably arrives via `from functions import *` (sklearn.utils.resample?) — confirm.
Pop_downsampled = resample(
    Pop_df,
    n_samples=25000,
    replace=False,
    random_state=333,
)

In [29]:
# Downsample the country genre to a fixed 25,000 songs: replace=False asks for
# sampling without replacement and random_state pins the seed for repeatability.
# NOTE(review): `resample` is not imported explicitly in this notebook; it
# presumably arrives via `from functions import *` (sklearn.utils.resample?) — confirm.
Country_downsampled = resample(
    Country_df,
    n_samples=25000,
    replace=False,
    random_state=333,
)

In [30]:
# Downsample the rock genre to a fixed 25,000 songs: replace=False asks for
# sampling without replacement and random_state pins the seed for repeatability.
# NOTE(review): `resample` is not imported explicitly in this notebook; it
# presumably arrives via `from functions import *` (sklearn.utils.resample?) — confirm.
Rock_downsampled = resample(
    Rock_df,
    n_samples=25000,
    replace=False,
    random_state=333,
)

In [31]:
# Downsample the rap genre to a fixed 25,000 songs: replace=False asks for
# sampling without replacement and random_state pins the seed for repeatability.
# NOTE(review): `resample` is not imported explicitly in this notebook; it
# presumably arrives via `from functions import *` (sklearn.utils.resample?) — confirm.
Rap_downsampled = resample(
    Rap_df,
    n_samples=25000,
    replace=False,
    random_state=333,
)

### We will now create a corpus for each genre and export the text for use with our RNN

In [65]:
# Build the rap corpus from the downsampled songs' 'better_lyrics'.
rap_corp = get_corpus(Rap_downsampled['better_lyrics'])

# Write the corpus to disk. The context manager closes the file even if the
# write fails, and the explicit encoding keeps the output independent of the
# platform's default encoding.
with open("rap_corpus.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write(rap_corp)  # number of characters written

In [65]:
# Build the rock corpus from the downsampled songs' 'better_lyrics'.
rock_corp = get_corpus(Rock_downsampled['better_lyrics'])

# Write the corpus to disk. The context manager closes the file even if the
# write fails, and the explicit encoding keeps the output independent of the
# platform's default encoding.
with open("rock_corpus.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write(rock_corp)  # number of characters written

In [65]:
# Build the country corpus from the downsampled songs' 'better_lyrics'.
country_corp = get_corpus(Country_downsampled['better_lyrics'])

# Write the corpus to disk. The context manager closes the file even if the
# write fails, and the explicit encoding keeps the output independent of the
# platform's default encoding.
with open("country_corpus.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write(country_corp)  # number of characters written

In [65]:
# Build the pop corpus from the downsampled songs' 'better_lyrics'.
pop_corp = get_corpus(Pop_downsampled['better_lyrics'])

# Write the corpus to disk. The context manager closes the file even if the
# write fails, and the explicit encoding keeps the output independent of the
# platform's default encoding.
with open("pop_corpus.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write(pop_corp)  # number of characters written