In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional

from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
import sklearn
from sklearn import preprocessing as skpp 
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("english_cleaned_lyrics.csv") # get data 

In [2]:
def cleaning(data): 
  data.dropna(axis=0, inplace=True)

  # Remove punctuation
  data['lyrics'] = data['lyrics'].str.replace("[-\?.,\/#!$%\^&\*;:{}=\_~()]", ' ')
    
  # Remove song identifiers using regex
  data['lyrics'] = data['lyrics'].str.replace("\[(.*?)\]", ' ')
  data['lyrics'] = data['lyrics'].str.replace("' | '", ' ')
  data['lyrics'] = data['lyrics'].str.replace('x[0-9]+', ' ')
    
  # Remove lyricless songs 
  data = data[data['lyrics'].str.strip().str.lower() != 'instrumental']
    
  # fix shit like [verse] and 'repeat x2'
  data = data[~data['lyrics'].str.contains(r'[^\x00-\x7F]+')]
  data = data[data['lyrics'].str.strip() != '']
  data = data[data['genre'].str.lower() != 'not available']
    
  # Grouping Folk and Country to one genre
  data['genre'] = np.where(data['genre'] == "Folk", "Country", data["genre"])
  
  pop = data.loc[data['genre'] == 'Pop']
  rock = data.loc[data['genre'] == 'Rock']
  hiphop = data.loc[data['genre'] == 'Hip-Hop']
  country = data.loc[data['genre'] == 'Country']

  data = pd.concat([pop, rock, hiphop, country])

  return data.copy()

In [4]:
df = cleaning(df)

In [5]:
df.drop(labels=['Unnamed: 0', 'index', 'song', 'year', 'artist'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,genre,lyrics
0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,Pop,playin everything so easy it's like you seem s...
2,Pop,If you search For tenderness It isn't hard to ...
3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,Pop,Party the people the people the party it's pop...


In [7]:
df.groupby(by='genre')['lyrics'].count()

genre
Country     325
Hip-Hop     775
Pop         872
Rock       2513
Name: lyrics, dtype: int64

In [8]:
X = df['lyrics']
y = df['genre']

label_encoder = skpp.LabelEncoder()
y = np.array(label_encoder.fit_transform(y.values))

In [9]:
MAXIMUM_WORDS = 30000

tokenizer = text.Tokenizer(num_words=MAXIMUM_WORDS, oov_token="<NEG>")
tokenizer.fit_on_texts(df['lyrics'].values)

In [10]:
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i<= MAXIMUM_WORDS}
tokenizer.word_index[tokenizer.oov_token] == MAXIMUM_WORDS + 1
indexed_data = np.array(tokenizer.texts_to_sequences(df['lyrics'].values)) 

label_encoder = skpp.LabelEncoder()
indexed_labels = np.array(label_encoder.fit_transform(df['genre'].values))

In [32]:
random_indexes = np.random.permutation(len(indexed_labels))
indexed_data = indexed_data[random_indexes]
indexed_labels = indexed_labels[random_indexes]

X_train = indexed_data[:-1000]
y_train = indexed_labels[:-1000]
X_test  = indexed_data[-1000:]
y_test  = indexed_labels[-1000:]

# y_train = keras.utils.to_categorical(y_train)
# y_test = keras.utils.to_categorical(y_test)

In [31]:
X_train.shape

(3485,)

In [22]:
train_X, test_X, train_y, test_y = train_test_split(indexed_data, indexed_labels, random_state=43, test_size=0.2, stratify=indexed_labels)
val_X, test_X, val_y, test_y = train_test_split(test_X, test_y, random_state=43, test_size=0.5, stratify=test_y)

In [30]:
train_X.shape

(3588,)

In [33]:
WORDS = MAXIMUM_WORDS + 2
MAX_REVIEW_LENGTH = 1000

X_train_padded = sequence.pad_sequences(X_train, maxlen=MAX_REVIEW_LENGTH)
X_test_padded = sequence.pad_sequences(X_test, maxlen=MAX_REVIEW_LENGTH)

In [37]:
model = Sequential()
model.add(Embedding(WORDS, 50, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
model.fit(X_train_padded, y_train, steps_per_epoch=5, epochs=10, batch_size=32)
scores = model.evaluate(X_test_padded, y_test, verbose=0)
print(scores[1] * 100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
17.499999701976776
